From 5f1c6d79468e0c834f9925b3b6554406e0ea7ef5 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 15:47:44 -0700 Subject: [PATCH 01/50] rebase to latest main --- install_requirements.py | 161 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 153 insertions(+), 8 deletions(-) diff --git a/install_requirements.py b/install_requirements.py index cbae175e276..844ada0c7da 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -59,8 +59,16 @@ def python_is_compatible(): # The pip repository that hosts nightly torch packages. -TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" +# This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled. +TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly" +# Supported CUDA versions - modify this to add/remove supported versions +# Format: tuple of (major, minor) version numbers +SUPPORTED_CUDA_VERSIONS = [ + (12, 6), + (12, 8), + (12, 9), +] # Since ExecuTorch often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. @@ -71,7 +79,137 @@ def python_is_compatible(): # # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ -NIGHTLY_VERSION = "dev20250906" +# +# NOTE: If you're changing, make the corresponding supported CUDA versions in +# SUPPORTED_CUDA_VERSIONS above if needed. +NIGHTLY_VERSION = "dev20250915" + + +def _check_cuda_enabled(): + """Check if CUDA delegate is enabled via CMAKE_ARGS environment variable.""" + cmake_args = os.environ.get("CMAKE_ARGS", "") + return "-DEXECUTORCH_BUILD_CUDA=ON" in cmake_args + + +def _cuda_version_to_pytorch_suffix(major, minor): + """ + Generate PyTorch CUDA wheel suffix from CUDA version numbers. + + Args: + major: CUDA major version (e.g., 12) + minor: CUDA minor version (e.g., 6) + + Returns: + PyTorch wheel suffix string (e.g., "cu126") + """ + return f"cu{major}{minor}" + + +def _get_cuda_version(): + """ + Get the CUDA version installed on the system using nvcc command. + Returns a tuple (major, minor). + + Raises: + RuntimeError: if nvcc is not found or version cannot be parsed + """ + try: + # Get CUDA version from nvcc (CUDA compiler) + nvcc_result = subprocess.run( + ["nvcc", "--version"], capture_output=True, text=True, check=True + ) + # Parse nvcc output for CUDA version + # Output contains line like "Cuda compilation tools, release 12.6, V12.6.68" + match = re.search(r"release (\d+)\.(\d+)", nvcc_result.stdout) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + + # Check if the detected version is supported + if (major, minor) not in SUPPORTED_CUDA_VERSIONS: + available_versions = ", ".join( + [f"{maj}.{min}" for maj, min in SUPPORTED_CUDA_VERSIONS] + ) + raise RuntimeError( + f"Detected CUDA version {major}.{minor} is not supported. " + f"Only the following CUDA versions are supported: {available_versions}. " + f"Please install a supported CUDA version or try on CPU-only delegates." + ) + + return (major, minor) + else: + raise RuntimeError( + "CUDA delegate is enabled but could not parse CUDA version from nvcc output. " + "Please ensure CUDA is properly installed or try on CPU-only delegates." + ) + except FileNotFoundError: + raise RuntimeError( + "CUDA delegate is enabled but nvcc (CUDA compiler) is not found in PATH. " + "Please install CUDA toolkit or try on CPU-only delegates." 
+ ) + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"CUDA delegate is enabled but nvcc command failed with error: {e}. " + "Please ensure CUDA is properly installed or try on CPU-only delegates." + ) + + +def _get_pytorch_cuda_url(cuda_version): + """ + Get the appropriate PyTorch CUDA URL for the given CUDA version. + + Args: + cuda_version: tuple of (major, minor) version numbers + + Returns: + URL string for PyTorch CUDA packages + """ + major, minor = cuda_version + # Generate CUDA suffix (version validation is already done in _get_cuda_version) + cuda_suffix = _cuda_version_to_pytorch_suffix(major, minor) + + return f"{TORCH_NIGHTLY_URL_BASE}/{cuda_suffix}" + + +# url for the PyTorch ExecuTorch depending on, which will be set by _determine_torch_url(). +# please do not directly rely on it, but use _determine_torch_url() instead. +_torch_url = None + + +def _determine_torch_url(): + """ + Determine the appropriate PyTorch installation URL based on CUDA availability and CMAKE_ARGS. + Uses caching to avoid redundant CUDA detection and print statements. + + Returns: + URL string for PyTorch packages + """ + global _torch_url + + # Return cached URL if already determined + if _torch_url is not None: + return _torch_url + + # Check if CUDA delegate is enabled + if not _check_cuda_enabled(): + print("CUDA delegate not enabled, using CPU-only PyTorch") + _torch_url = f"{TORCH_NIGHTLY_URL_BASE}/cpu" + return _torch_url + + print("CUDA delegate enabled, detecting CUDA version...") + + # Get CUDA version + cuda_version = _get_cuda_version() + + major, minor = cuda_version + print(f"Detected CUDA version: {major}.{minor}") + + # Get appropriate PyTorch CUDA URL + torch_url = _get_pytorch_cuda_url(cuda_version) + print(f"Using PyTorch URL: {torch_url}") + + # Cache the result + _torch_url = torch_url + return torch_url def install_requirements(use_pytorch_nightly): @@ -84,12 +222,16 @@ def install_requirements(use_pytorch_nightly): ) sys.exit(1) + # Determine the appropriate PyTorch URL based on CUDA delegate status + torch_url = _determine_torch_url() + # pip packages needed by exir. TORCH_PACKAGE = [ # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note # that we don't need to set any version number there because they have already # been installed on CI before this step, so pip won't reinstall them - f"torch==2.9.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch", + f"torch==2.10.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch", + f"torchao==0.14.0{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchao", ] # Install the requirements for core ExecuTorch package. @@ -105,13 +247,13 @@ def install_requirements(use_pytorch_nightly): "requirements-dev.txt", *TORCH_PACKAGE, "--extra-index-url", - TORCH_NIGHTLY_URL, + torch_url, ], check=True, ) LOCAL_REQUIREMENTS = [ - "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. + # "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. ] + ( [ "extension/llm/tokenizers", # TODO(larryliu0820): Setup a pypi package for this. 
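
A quick way to eyeball the URL mapping introduced above (illustrative snippet, not part of the patch; it mirrors what _cuda_version_to_pytorch_suffix and _get_pytorch_cuda_url compute for the supported versions):

    for major, minor in [(12, 6), (12, 8), (12, 9)]:
        # e.g. (12, 6) -> https://download.pytorch.org/whl/nightly/cu126
        print(f"{major}.{minor} -> https://download.pytorch.org/whl/nightly/cu{major}{minor}")
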
@@ -147,10 +289,13 @@ def install_requirements(use_pytorch_nightly): def install_optional_example_requirements(use_pytorch_nightly): + # Determine the appropriate PyTorch URL based on CUDA delegate status + torch_url = _determine_torch_url() + print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ ( - f"torchvision==0.24.0.{NIGHTLY_VERSION}" + f"torchvision==0.25.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchvision" ), @@ -165,7 +310,7 @@ def install_optional_example_requirements(use_pytorch_nightly): "install", *DOMAIN_LIBRARIES, "--extra-index-url", - TORCH_NIGHTLY_URL, + torch_url, ], check=True, ) @@ -180,7 +325,7 @@ def install_optional_example_requirements(use_pytorch_nightly): "-r", "requirements-examples.txt", "--extra-index-url", - TORCH_NIGHTLY_URL, + torch_url, "--upgrade-strategy", "only-if-needed", ], From 94d400140b7c74f095ae7ff61dc79e5871c763c2 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 16:41:16 -0700 Subject: [PATCH 02/50] add github ci for gpu pt install check --- .github/workflows/test-cuda-builds.yml | 68 ++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 .github/workflows/test-cuda-builds.yml diff --git a/.github/workflows/test-cuda-builds.yml b/.github/workflows/test-cuda-builds.yml new file mode 100644 index 00000000000..eef3287a920 --- /dev/null +++ b/.github/workflows/test-cuda-builds.yml @@ -0,0 +1,68 @@ +# Test ExecutorTorch CUDA Build Compatibility +# This workflow tests whether ExecutorTorch can be successfully built with CUDA support +# across different CUDA versions (12.6, 12.8, 12.9) using the command: +# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh +# +# Note: ExecutorTorch automatically detects the system CUDA version using nvcc and +# installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed. 
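+#
+# To reproduce a single matrix entry locally (an illustrative sketch, not part
+# of the CI contract; assumes an NVIDIA GPU and CUDA toolkit are present — the
+# script exports CMAKE_ARGS itself):
+#
+#   PYTHON_EXECUTABLE=python bash .ci/scripts/test-cuda-build.sh "12.6"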
+ +name: Test CUDA Builds + +on: + pull_request: + push: + branches: + - main + - release/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + test-cuda-builds: + strategy: + fail-fast: false + matrix: + cuda-version: ["12.6", "12.8", "12.9"] + + name: test-executorch-cuda-build-${{ matrix.cuda-version }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: ${{ matrix.cuda-version }} + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + if [ -n "$CONDA_ENV" ]; then + conda activate "${CONDA_ENV}" + fi + + # Test ExecutorTorch CUDA build - ExecutorTorch will automatically detect CUDA version + # and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" + PYTHON_EXECUTABLE=python bash .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}" + + # This job will fail if any of the CUDA versions fail + check-all-cuda-builds: + needs: test-cuda-builds + runs-on: ubuntu-latest + if: always() + steps: + - name: Check if all CUDA builds succeeded + run: | + if [[ "${{ needs.test-cuda-builds.result }}" != "success" ]]; then + echo "ERROR: One or more ExecutorTorch CUDA builds failed!" + echo "CUDA build results: ${{ needs.test-cuda-builds.result }}" + exit 1 + else + echo "SUCCESS: All ExecutorTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!" + fi From a0332ffb10743e563019dacc6bf77fa9e475a486 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 16:41:39 -0700 Subject: [PATCH 03/50] add github ci for gpu pt install check --- .ci/scripts/test-cuda-build.sh | 84 ++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100755 .ci/scripts/test-cuda-build.sh diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh new file mode 100755 index 00000000000..8a9fedc4d7a --- /dev/null +++ b/.ci/scripts/test-cuda-build.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +CUDA_VERSION=${1:-"12.6"} + +echo "=== Testing ExecutorTorch CUDA ${CUDA_VERSION} Build ===" + +# Function to build and test ExecutorTorch with CUDA support +test_executorch_cuda_build() { + local cuda_version=$1 + + echo "Building ExecutorTorch with CUDA ${cuda_version} support..." + echo "ExecutorTorch will automatically detect CUDA and install appropriate PyTorch wheel" + + # Set CMAKE_ARGS to enable CUDA build - ExecutorTorch will handle PyTorch installation automatically + export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" + + # Install ExecutorTorch with CUDA support - this will automatically: + # 1. Detect CUDA version using nvcc + # 2. Install appropriate PyTorch wheel for the detected CUDA version + # 3. 
Build ExecutorTorch with CUDA support + ./install_executorch.sh + + echo "SUCCESS: ExecutorTorch CUDA build completed" + + # Verify the installation + echo "=== Verifying ExecutorTorch CUDA Installation ===" + + # Test that ExecutorTorch was built successfully + python -c " +import executorch +print('SUCCESS: ExecutorTorch imported successfully') +" + + # Test CUDA availability and show details + python -c " +try: + import torch + print('INFO: PyTorch version:', torch.__version__) + print('INFO: CUDA available:', torch.cuda.is_available()) + + if torch.cuda.is_available(): + print('SUCCESS: CUDA is available for ExecutorTorch') + print('INFO: CUDA version:', torch.version.cuda) + print('INFO: GPU device count:', torch.cuda.device_count()) + print('INFO: Current GPU device:', torch.cuda.current_device()) + print('INFO: GPU device name:', torch.cuda.get_device_name()) + + # Test basic CUDA tensor operation + device = torch.device('cuda') + x = torch.randn(10, 10).to(device) + y = torch.randn(10, 10).to(device) + z = torch.mm(x, y) + print('SUCCESS: CUDA tensor operation completed on device:', z.device) + print('INFO: Result tensor shape:', z.shape) + + print('SUCCESS: ExecutorTorch CUDA integration verified') + else: + print('WARNING: CUDA not detected, but ExecutorTorch built successfully') + exit(1) +except Exception as e: + print('ERROR: ExecutorTorch CUDA test failed:', e) + exit(1) +" + + echo "SUCCESS: ExecutorTorch CUDA ${cuda_version} build and verification completed successfully" +} + +# Main execution +echo "Current working directory: $(pwd)" +echo "Directory contents:" +ls -la + +# Run the CUDA build test +test_executorch_cuda_build "${CUDA_VERSION}" From 433c239b9639963b37604e3d410a9c0965c281a4 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 16:58:32 -0700 Subject: [PATCH 04/50] recover torchao --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 8c9330d6f2c..e3a53c8bcb5 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -4d4abec80f03cd8fdefe1d9cb3a60d3690cd777e +53a2908a10f414a2f85caa06703a26a40e873869 diff --git a/install_requirements.py b/install_requirements.py index 844ada0c7da..32303f80842 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -231,7 +231,6 @@ def install_requirements(use_pytorch_nightly): # that we don't need to set any version number there because they have already # been installed on CI before this step, so pip won't reinstall them f"torch==2.10.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch", - f"torchao==0.14.0{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchao", ] # Install the requirements for core ExecuTorch package. @@ -253,7 +252,7 @@ def install_requirements(use_pytorch_nightly): ) LOCAL_REQUIREMENTS = [ - # "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. + "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. ] + ( [ "extension/llm/tokenizers", # TODO(larryliu0820): Setup a pypi package for this. 
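
The detection added so far hinges on one contract: `nvcc --version` prints a "release <major>.<minor>" token. A standalone sanity check of that parse (illustrative sketch only, reusing the sample line already quoted in _get_cuda_version's comments):

    import re

    # Sample line from the nvcc output quoted in _get_cuda_version's comment.
    nvcc_stdout = "Cuda compilation tools, release 12.6, V12.6.68"
    match = re.search(r"release (\d+)\.(\d+)", nvcc_stdout)
    assert match is not None
    major, minor = int(match.group(1)), int(match.group(2))
    print((major, minor), f"cu{major}{minor}")  # -> (12, 6) cu126
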
From db7bef766e48b461c333438c1a95b1b79e103657 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 21:38:34 -0700 Subject: [PATCH 05/50] solve lint issue --- .ci/scripts/test-cuda-build.sh | 24 +++++++++++++++++++----- .github/workflows/test-cuda-builds.yml | 2 +- install_requirements.py | 4 ++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh index 8a9fedc4d7a..a9f8e7ec14f 100755 --- a/.ci/scripts/test-cuda-build.sh +++ b/.ci/scripts/test-cuda-build.sh @@ -21,14 +21,28 @@ test_executorch_cuda_build() { echo "Building ExecutorTorch with CUDA ${cuda_version} support..." echo "ExecutorTorch will automatically detect CUDA and install appropriate PyTorch wheel" + # Check available resources before starting + echo "=== System Information ===" + echo "Available memory: $(free -h | grep Mem | awk '{print $2}')" + echo "Available disk space: $(df -h . | tail -1 | awk '{print $4}')" + echo "CPU cores: $(nproc)" + echo "CUDA version check:" + nvcc --version || echo "nvcc not found" + nvidia-smi || echo "nvidia-smi not found" + # Set CMAKE_ARGS to enable CUDA build - ExecutorTorch will handle PyTorch installation automatically export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" - # Install ExecutorTorch with CUDA support - this will automatically: - # 1. Detect CUDA version using nvcc - # 2. Install appropriate PyTorch wheel for the detected CUDA version - # 3. Build ExecutorTorch with CUDA support - ./install_executorch.sh + echo "=== Starting ExecutorTorch Installation ===" + # Install ExecutorTorch with CUDA support with timeout and error handling + timeout 5400 ./install_executorch.sh || { + local exit_code=$? + echo "ERROR: install_executorch.sh failed with exit code: $exit_code" + if [ $exit_code -eq 124 ]; then + echo "ERROR: Installation timed out after 90 minutes" + fi + exit $exit_code + } echo "SUCCESS: ExecutorTorch CUDA build completed" diff --git a/.github/workflows/test-cuda-builds.yml b/.github/workflows/test-cuda-builds.yml index eef3287a920..eff26e72c67 100644 --- a/.github/workflows/test-cuda-builds.yml +++ b/.github/workflows/test-cuda-builds.yml @@ -17,7 +17,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true + cancel-in-progress: false jobs: test-cuda-builds: diff --git a/install_requirements.py b/install_requirements.py index 32303f80842..e5a7c29c482 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -172,7 +172,7 @@ def _get_pytorch_cuda_url(cuda_version): # url for the PyTorch ExecuTorch depending on, which will be set by _determine_torch_url(). # please do not directly rely on it, but use _determine_torch_url() instead. 
-_torch_url = None +_torch_url = "" def _determine_torch_url(): @@ -186,7 +186,7 @@ def _determine_torch_url(): global _torch_url # Return cached URL if already determined - if _torch_url is not None: + if _torch_url: return _torch_url # Check if CUDA delegate is enabled From 3d324c7c1799b87509b8ccc5af40825152f73bfa Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 17 Sep 2025 12:45:25 -0700 Subject: [PATCH 06/50] create install_utils.py for better structure --- install_requirements.py | 190 +------------------------------------ install_utils.py | 201 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+), 187 deletions(-) create mode 100644 install_utils.py diff --git a/install_requirements.py b/install_requirements.py index e5a7c29c482..409ed083970 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -7,56 +7,10 @@ import argparse import os -import platform -import re import subprocess import sys - -def python_is_compatible(): - # Scrape the version range from pyproject.toml, which should be in the current directory. - version_specifier = None - with open("pyproject.toml", "r") as file: - for line in file: - if line.startswith("requires-python"): - match = re.search(r'"([^"]*)"', line) - if match: - version_specifier = match.group(1) - break - - if not version_specifier: - print( - "WARNING: Skipping python version check: version range not found", - file=sys.stderr, - ) - return False - - # Install the packaging module if necessary. - try: - import packaging - except ImportError: - subprocess.run( - [sys.executable, "-m", "pip", "install", "packaging"], check=True - ) - # Compare the current python version to the range in version_specifier. Exits - # with status 1 if the version is not compatible, or with status 0 if the - # version is compatible or the logic itself fails. - try: - import packaging.specifiers - import packaging.version - - python_version = packaging.version.parse(platform.python_version()) - version_range = packaging.specifiers.SpecifierSet(version_specifier) - if python_version not in version_range: - print( - f'ERROR: ExecuTorch does not support python version {python_version}: must satisfy "{version_specifier}"', - file=sys.stderr, - ) - return False - except Exception as e: - print(f"WARNING: Skipping python version check: {e}", file=sys.stderr) - return True - +from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible # The pip repository that hosts nightly torch packages. # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled. @@ -85,133 +39,6 @@ def python_is_compatible(): NIGHTLY_VERSION = "dev20250915" -def _check_cuda_enabled(): - """Check if CUDA delegate is enabled via CMAKE_ARGS environment variable.""" - cmake_args = os.environ.get("CMAKE_ARGS", "") - return "-DEXECUTORCH_BUILD_CUDA=ON" in cmake_args - - -def _cuda_version_to_pytorch_suffix(major, minor): - """ - Generate PyTorch CUDA wheel suffix from CUDA version numbers. - - Args: - major: CUDA major version (e.g., 12) - minor: CUDA minor version (e.g., 6) - - Returns: - PyTorch wheel suffix string (e.g., "cu126") - """ - return f"cu{major}{minor}" - - -def _get_cuda_version(): - """ - Get the CUDA version installed on the system using nvcc command. - Returns a tuple (major, minor). 
- - Raises: - RuntimeError: if nvcc is not found or version cannot be parsed - """ - try: - # Get CUDA version from nvcc (CUDA compiler) - nvcc_result = subprocess.run( - ["nvcc", "--version"], capture_output=True, text=True, check=True - ) - # Parse nvcc output for CUDA version - # Output contains line like "Cuda compilation tools, release 12.6, V12.6.68" - match = re.search(r"release (\d+)\.(\d+)", nvcc_result.stdout) - if match: - major, minor = int(match.group(1)), int(match.group(2)) - - # Check if the detected version is supported - if (major, minor) not in SUPPORTED_CUDA_VERSIONS: - available_versions = ", ".join( - [f"{maj}.{min}" for maj, min in SUPPORTED_CUDA_VERSIONS] - ) - raise RuntimeError( - f"Detected CUDA version {major}.{minor} is not supported. " - f"Only the following CUDA versions are supported: {available_versions}. " - f"Please install a supported CUDA version or try on CPU-only delegates." - ) - - return (major, minor) - else: - raise RuntimeError( - "CUDA delegate is enabled but could not parse CUDA version from nvcc output. " - "Please ensure CUDA is properly installed or try on CPU-only delegates." - ) - except FileNotFoundError: - raise RuntimeError( - "CUDA delegate is enabled but nvcc (CUDA compiler) is not found in PATH. " - "Please install CUDA toolkit or try on CPU-only delegates." - ) - except subprocess.CalledProcessError as e: - raise RuntimeError( - f"CUDA delegate is enabled but nvcc command failed with error: {e}. " - "Please ensure CUDA is properly installed or try on CPU-only delegates." - ) - - -def _get_pytorch_cuda_url(cuda_version): - """ - Get the appropriate PyTorch CUDA URL for the given CUDA version. - - Args: - cuda_version: tuple of (major, minor) version numbers - - Returns: - URL string for PyTorch CUDA packages - """ - major, minor = cuda_version - # Generate CUDA suffix (version validation is already done in _get_cuda_version) - cuda_suffix = _cuda_version_to_pytorch_suffix(major, minor) - - return f"{TORCH_NIGHTLY_URL_BASE}/{cuda_suffix}" - - -# url for the PyTorch ExecuTorch depending on, which will be set by _determine_torch_url(). -# please do not directly rely on it, but use _determine_torch_url() instead. -_torch_url = "" - - -def _determine_torch_url(): - """ - Determine the appropriate PyTorch installation URL based on CUDA availability and CMAKE_ARGS. - Uses caching to avoid redundant CUDA detection and print statements. - - Returns: - URL string for PyTorch packages - """ - global _torch_url - - # Return cached URL if already determined - if _torch_url: - return _torch_url - - # Check if CUDA delegate is enabled - if not _check_cuda_enabled(): - print("CUDA delegate not enabled, using CPU-only PyTorch") - _torch_url = f"{TORCH_NIGHTLY_URL_BASE}/cpu" - return _torch_url - - print("CUDA delegate enabled, detecting CUDA version...") - - # Get CUDA version - cuda_version = _get_cuda_version() - - major, minor = cuda_version - print(f"Detected CUDA version: {major}.{minor}") - - # Get appropriate PyTorch CUDA URL - torch_url = _get_pytorch_cuda_url(cuda_version) - print(f"Using PyTorch URL: {torch_url}") - - # Cache the result - _torch_url = torch_url - return torch_url - - def install_requirements(use_pytorch_nightly): # Skip pip install on Intel macOS if using nightly. 
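     # (PyTorch stopped publishing macOS x86_64 wheels at 2.3.0, so a nightly
     # pin cannot resolve there; see is_intel_mac_os() in install_utils.py.)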
if use_pytorch_nightly and is_intel_mac_os(): @@ -223,7 +50,7 @@ def install_requirements(use_pytorch_nightly): sys.exit(1) # Determine the appropriate PyTorch URL based on CUDA delegate status - torch_url = _determine_torch_url() + torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE, SUPPORTED_CUDA_VERSIONS) # pip packages needed by exir. TORCH_PACKAGE = [ @@ -289,7 +116,7 @@ def install_requirements(use_pytorch_nightly): def install_optional_example_requirements(use_pytorch_nightly): # Determine the appropriate PyTorch URL based on CUDA delegate status - torch_url = _determine_torch_url() + torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE, SUPPORTED_CUDA_VERSIONS) print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ @@ -332,17 +159,6 @@ def install_optional_example_requirements(use_pytorch_nightly): ) -# Prebuilt binaries for Intel-based macOS are no longer available on PyPI; users must compile from source. -# PyTorch stopped building macOS x86_64 binaries since version 2.3.0 (January 2024). -def is_intel_mac_os(): - # Returns True if running on Intel macOS. - return platform.system().lower() == "darwin" and platform.machine().lower() in ( - "x86", - "x86_64", - "i386", - ) - - def main(args): parser = argparse.ArgumentParser() parser.add_argument( diff --git a/install_utils.py b/install_utils.py new file mode 100644 index 00000000000..19da1b2193b --- /dev/null +++ b/install_utils.py @@ -0,0 +1,201 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024-25 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import os +import platform +import re +import subprocess + + +def _is_cuda_enabled(): + """Check if CUDA delegate is enabled via CMAKE_ARGS environment variable.""" + cmake_args = os.environ.get("CMAKE_ARGS", "") + return "-DEXECUTORCH_BUILD_CUDA=ON" in cmake_args + + +def _cuda_version_to_pytorch_suffix(major, minor): + """ + Generate PyTorch CUDA wheel suffix from CUDA version numbers. + + Args: + major: CUDA major version (e.g., 12) + minor: CUDA minor version (e.g., 6) + + Returns: + PyTorch wheel suffix string (e.g., "cu126") + """ + return f"cu{major}{minor}" + + +def _get_cuda_version(supported_cuda_versions): + """ + Get the CUDA version installed on the system using nvcc command. + Returns a tuple (major, minor). + + Args: + supported_cuda_versions: List of supported CUDA versions as tuples + + Raises: + RuntimeError: if nvcc is not found or version cannot be parsed + """ + try: + # Get CUDA version from nvcc (CUDA compiler) + nvcc_result = subprocess.run( + ["nvcc", "--version"], capture_output=True, text=True, check=True + ) + # Parse nvcc output for CUDA version + # Output contains line like "Cuda compilation tools, release 12.6, V12.6.68" + match = re.search(r"release (\d+)\.(\d+)", nvcc_result.stdout) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + + # Check if the detected version is supported + if (major, minor) not in supported_cuda_versions: + available_versions = ", ".join( + [f"{maj}.{min}" for maj, min in supported_cuda_versions] + ) + raise RuntimeError( + f"Detected CUDA version {major}.{minor} is not supported. " + f"Only the following CUDA versions are supported: {available_versions}. " + f"Please install a supported CUDA version or try on CPU-only delegates." 
+ ) + + return (major, minor) + else: + raise RuntimeError( + "CUDA delegate is enabled but could not parse CUDA version from nvcc output. " + "Please ensure CUDA is properly installed or try on CPU-only delegates." + ) + except FileNotFoundError: + raise RuntimeError( + "CUDA delegate is enabled but nvcc (CUDA compiler) is not found in PATH. " + "Please install CUDA toolkit or try on CPU-only delegates." + ) + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"CUDA delegate is enabled but nvcc command failed with error: {e}. " + "Please ensure CUDA is properly installed or try on CPU-only delegates." + ) + + +def _get_pytorch_cuda_url(cuda_version, torch_nightly_url_base): + """ + Get the appropriate PyTorch CUDA URL for the given CUDA version. + + Args: + cuda_version: tuple of (major, minor) version numbers + torch_nightly_url_base: Base URL for PyTorch nightly packages + + Returns: + URL string for PyTorch CUDA packages + """ + major, minor = cuda_version + # Generate CUDA suffix (version validation is already done in _get_cuda_version) + cuda_suffix = _cuda_version_to_pytorch_suffix(major, minor) + + return f"{torch_nightly_url_base}/{cuda_suffix}" + + +# Global variable for caching torch URL +_torch_url_cache = "" + + +def determine_torch_url(torch_nightly_url_base, supported_cuda_versions): + """ + Determine the appropriate PyTorch installation URL based on CUDA availability and CMAKE_ARGS. + Uses caching to avoid redundant CUDA detection and print statements. + + Args: + torch_nightly_url_base: Base URL for PyTorch nightly packages + supported_cuda_versions: List of supported CUDA versions as tuples + + Returns: + URL string for PyTorch packages + """ + global _torch_url_cache + + # Return cached URL if already determined + if _torch_url_cache: + return _torch_url_cache + + # Check if CUDA delegate is enabled + if not _is_cuda_enabled(): + print("CUDA delegate not enabled, using CPU-only PyTorch") + _torch_url_cache = f"{torch_nightly_url_base}/cpu" + return _torch_url_cache + + print("CUDA delegate enabled, detecting CUDA version...") + + # Get CUDA version + cuda_version = _get_cuda_version(supported_cuda_versions) + + major, minor = cuda_version + print(f"Detected CUDA version: {major}.{minor}") + + # Get appropriate PyTorch CUDA URL + torch_url = _get_pytorch_cuda_url(cuda_version, torch_nightly_url_base) + print(f"Using PyTorch URL: {torch_url}") + + # Cache the result + _torch_url_cache = torch_url + return torch_url + + +# Prebuilt binaries for Intel-based macOS are no longer available on PyPI; users must compile from source. +# PyTorch stopped building macOS x86_64 binaries since version 2.3.0 (January 2024). +def is_intel_mac_os(): + # Returns True if running on Intel macOS. + return platform.system().lower() == "darwin" and platform.machine().lower() in ( + "x86", + "x86_64", + "i386", + ) + + +def python_is_compatible(): + # Scrape the version range from pyproject.toml, which should be in the current directory. + version_specifier = None + with open("pyproject.toml", "r") as file: + for line in file: + if line.startswith("requires-python"): + match = re.search(r'"([^"]*)"', line) + if match: + version_specifier = match.group(1) + break + + if not version_specifier: + print( + "WARNING: Skipping python version check: version range not found", + file=sys.stderr, + ) + return False + + # Install the packaging module if necessary. 
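+    # ("packaging" is not in the standard library, so bootstrap it with pip
+    # before running the SpecifierSet check below.)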
+ try: + import packaging + except ImportError: + subprocess.run( + [sys.executable, "-m", "pip", "install", "packaging"], check=True + ) + # Compare the current python version to the range in version_specifier. Exits + # with status 1 if the version is not compatible, or with status 0 if the + # version is compatible or the logic itself fails. + try: + import packaging.specifiers + import packaging.version + + python_version = packaging.version.parse(platform.python_version()) + version_range = packaging.specifiers.SpecifierSet(version_specifier) + if python_version not in version_range: + print( + f'ERROR: ExecuTorch does not support python version {python_version}: must satisfy "{version_specifier}"', + file=sys.stderr, + ) + return False + except Exception as e: + print(f"WARNING: Skipping python version check: {e}", file=sys.stderr) + return True From a6e1918a77c1b53f36e9ebd110f31b66d9f8cb1c Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 4 Aug 2025 22:53:41 -0700 Subject: [PATCH 07/50] Add skeleton code --- CMakeLists.txt | 6 + backends/aoti/CMakeLists.txt | 47 +++ backends/aoti/README.md | 2 + backends/aoti/aoti_backend.py | 43 ++ backends/aoti/aoti_partitioner.py | 74 ++++ backends/aoti/runtime/AotiBackend.cpp | 570 ++++++++++++++++++++++++++ backends/aoti/runtime/TARGETS | 3 + backends/aoti/runtime/targets.bzl | 18 + install_requirements.py | 1 - tools/cmake/executorch-config.cmake | 1 + tools/cmake/preset/default.cmake | 9 + 11 files changed, 773 insertions(+), 1 deletion(-) create mode 100644 backends/aoti/CMakeLists.txt create mode 100644 backends/aoti/README.md create mode 100644 backends/aoti/aoti_backend.py create mode 100644 backends/aoti/aoti_partitioner.py create mode 100644 backends/aoti/runtime/AotiBackend.cpp create mode 100644 backends/aoti/runtime/TARGETS create mode 100644 backends/aoti/runtime/targets.bzl diff --git a/CMakeLists.txt b/CMakeLists.txt index fc427d517a9..4497fa133c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -587,6 +587,12 @@ endif() if(EXECUTORCH_BUILD_CORTEX_M) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m) + list(APPEND _executorch_backends coretex_m_backend) +endif() + +if(EXECUTORCH_BUILD_AOTI) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) + list(APPEND _executorch_backends aoti_backend) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt new file mode 100644 index 00000000000..12886bc0cac --- /dev/null +++ b/backends/aoti/CMakeLists.txt @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +find_package(CUDA) + +set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
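+
+# A typical configure invocation for this backend might look like the sketch
+# below; EXECUTORCH_BUILD_AOTI and its EXECUTORCH_BUILD_EXTENSION_TENSOR
+# requirement are declared in tools/cmake/preset/default.cmake.
+#
+#   cmake -DEXECUTORCH_BUILD_AOTI=ON -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON ..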
+ +set(_aoti_sources runtime/AotiBackend.cpp) + +add_library(aoti_backend STATIC ${_aoti_sources}) +target_include_directories( + aoti_backend PUBLIC ${_common_include_directories} ${CUDA_INCLUDE_DIRS} +) + +target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC) +target_link_libraries(aoti_backend PUBLIC extension_tensor ${CUDA_LIBRARIES}) +executorch_target_link_options_shared_lib(aoti_backend) + +install( + TARGETS aoti_backend + EXPORT ExecuTorchTargets + DESTINATION lib + INCLUDES + DESTINATION ${_common_include_directories} +) diff --git a/backends/aoti/README.md b/backends/aoti/README.md new file mode 100644 index 00000000000..9df05c99e07 --- /dev/null +++ b/backends/aoti/README.md @@ -0,0 +1,2 @@ +## Experimental AOTI backend +Proceed with caution. This is an experimental backend that is not yet ready for production use. diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py new file mode 100644 index 00000000000..d1e8a5b4896 --- /dev/null +++ b/backends/aoti/aoti_backend.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import copy + +from subprocess import check_call +from typing import final, List + +import torch +from executorch.exir.backend.backend_details import ( + BackendDetails, + ExportedProgram, + PreprocessResult, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec + + +@final +class AotiBackend(BackendDetails): + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + print("entering the lowerable parts in AotiBackend.preprocess....") + + print("here", edge_program.example_inputs) + copy_edge_program = copy.deepcopy(edge_program) + graph_module = copy_edge_program.graph_module + args, kwargs = copy_edge_program.example_inputs + so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] + print(so_path) + check_call( + f"patchelf --remove-needed libtorch.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", + shell=True, + ) + + with open(so_path, "rb") as f: + data = f.read() + return PreprocessResult(data) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py new file mode 100644 index 00000000000..6cb7c6cc38a --- /dev/null +++ b/backends/aoti/aoti_partitioner.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
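+#
+# Partitions the graph for the AOTI delegate: AOTISupportedOperators below
+# whitelists the supported ops, and CapabilityBasedPartitioner groups them
+# into the largest taggable subgraphs.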
+
+# pyre-unsafe
+
+from typing import cast, final, List
+
+import torch
+from executorch.backends.aoti.aoti_backend import AotiBackend  # usort: skip
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from executorch.exir.backend.partitioner import (
+    DelegationSpec,
+    Partitioner,
+    PartitionResult,
+)
+from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export.exported_program import ExportedProgram
+from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+
+from torch.fx.passes.operator_support import OperatorSupportBase
+
+
+class AOTISupportedOperators(OperatorSupportBase):
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        supported = node.op == "call_function" and node.target in [
+            exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten._to_copy.default,
+        ]
+
+        return supported
+
+    def is_node_supported_custom(self, node: torch.fx.Node) -> bool:
+        if node.target == exir_ops.edge.aten.mean.dim:
+            keep_dim = node.args[2] if len(node.args) > 2 else False
+            return cast(bool, keep_dim)
+        if node.target == exir_ops.edge.aten.var.correction:
+            keep_dim = node.kwargs.get("keepdim", False)
+            return cast(bool, keep_dim)
+        return True
+
+
+@final
+class AotiPartitioner(Partitioner):
+    def __init__(self, compile_spec: List[CompileSpec]) -> None:
+        self.delegation_spec = DelegationSpec(AotiBackend.__name__, compile_spec)
+        print(self.delegation_spec)
+
+    def partition(self, exported_program: ExportedProgram) -> PartitionResult:
+        # Run the CapabilityBasedPartitioner to return the largest possible
+        # subgraphs containing the nodes with the tags
+        # logger.info("AotiPartitioner::partition")
+        partition_tags = {}
+
+        capability_partitioner = CapabilityBasedPartitioner(
+            exported_program.graph_module,
+            AOTISupportedOperators(),
+            allows_single_node_partition=True,
+        )
+        partition_list = capability_partitioner.propose_partitions()
+        for partition in partition_list:
+            for node in partition.nodes:
+                tag = f"tag{partition.id}"
+                node.meta["delegation_tag"] = tag
+                partition_tags[tag] = self.delegation_spec
+
+        tag_constant_data(exported_program)
+
+        return PartitionResult(
+            tagged_exported_program=exported_program, partition_tags=partition_tags
+        )
diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp
new file mode 100644
index 00000000000..0044a4155d6
--- /dev/null
+++ b/backends/aoti/runtime/AotiBackend.cpp
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <dlfcn.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "cuda_runtime.h"
+
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// Here is where the aoti bouncers are going to be defined.
+// I define the globals aoti generated compiled code calls
+// They can be backed by ET systems
+
+using namespace std;
+
+using executorch::aten::ScalarType;
+using executorch::runtime::ArrayRef;
+using executorch::runtime::Backend;
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::BackendInitContext;
+using executorch::runtime::CompileSpec;
+using executorch::runtime::DelegateHandle;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+using AOTITensorHandle = Tensor*;
+
+// TODO: We should get a proper one
+struct CUDAStreamGuardOpaque;
+using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*;
+
+using AOTIRuntimeError = Error;
+using AOTITorchError = Error;
+
+struct AOTInductorModelContainerOpaque;
+using AOTInductorModelContainerHandle = AOTInductorModelContainerOpaque*;
+using AOTInductorStreamHandle = void*;
+using AOTIProxyExecutorHandle = void*;
+
+using AOTInductorModelContainerCreateWithDeviceFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle* container_handle,
+    size_t num_models,
+    const char* device_str,
+    const char* cubin_dir);
+
+using AOTInductorModelContainerDeleteFunc =
+    AOTIRuntimeError (*)(AOTInductorModelContainerHandle container_handle);
+
+using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_constants);
+
+using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_constants);
+
+using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    AOTITensorHandle* input_handles, // array of input AOTITensorHandle; handles
+                                     // are stolen; the array itself is borrowed
+    size_t num_inputs,
+    AOTITensorHandle*
+        output_handles, // array for writing output AOTITensorHandle; handles
+                        // will be stolen by the caller; the array itself is
+                        // borrowed
+    size_t num_outputs,
+    AOTInductorStreamHandle stream_handle,
+    AOTIProxyExecutorHandle proxy_executor_handle);
+
+AOTInductorModelContainerCreateWithDeviceFunc
+    AOTInductorModelContainerCreateWithDevice = nullptr;
+AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr;
+AOTInductorModelContainerGetNumInputsFunc
+    AOTInductorModelContainerGetNumInputs = nullptr;
+AOTInductorModelContainerGetNumOutputsFunc
+    AOTInductorModelContainerGetNumOutputs = nullptr;
+AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr;
+std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
+std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+std::unordered_set<std::shared_ptr<Tensor>> tensors;
+
+int32_t aoti_torch_grad_mode_is_enabled() {
+  // No autograd ever
+  return false;
+}
+
+void aoti_torch_grad_mode_set_enabled(bool enabled) {
+  if (enabled) {
+    throw std::runtime_error("Cannot enable autograd");
+  }
+}
+
+AOTITorchError aoti_torch_get_data_ptr(
+    AOTITensorHandle tensor,
+    void** ret_data_ptr) {
+  *ret_data_ptr = tensor->mutable_data_ptr();
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_storage_offset(
+    AOTITensorHandle tensor,
+    int64_t* ret_storage_offset) {
+  // Storage offset is always 0 in ET
+  *ret_storage_offset = 0;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_strides(
+    AOTITensorHandle tensor,
+    int64_t** ret_strides) {
+  auto it = tensor_to_strides.find(tensor);
+
+  if (it == tensor_to_strides.end()) {
+    std::vector<int64_t> strides(tensor->dim());
+    auto tensor_strides = tensor->strides();
+    for (int i = 0; i < tensor->dim(); i++) {
+      strides[i] = tensor_strides[i];
+    }
+    it = tensor_to_strides.emplace(tensor, std::move(strides)).first;
+  }
+  *ret_strides = it->second.data();
+  std::cout << "getting strides from tensor " << tensor << " with dim "
+            << tensor->dim() << std::endl;
+  for (int i = 0; i < tensor->dim(); i++) {
+    std::cout << "strides " << i << " = " << (*ret_strides)[i] << std::endl;
+  }
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_dtype(
+    AOTITensorHandle tensor,
+    int32_t* ret_dtype) {
+  *ret_dtype = static_cast<int32_t>(tensor->scalar_type());
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_sizes(
+    AOTITensorHandle tensor,
+    int64_t** ret_sizes) {
+  auto it = tensor_to_sizes.find(tensor);
+  if (it == tensor_to_sizes.end()) {
+    std::vector<int64_t> sizes(tensor->dim());
+    auto tensor_sizes = tensor->sizes();
+    for (int i = 0; i < tensor->dim(); i++) {
+      sizes[i] = tensor_sizes[i];
+    }
+    it = tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
+  }
+  *ret_sizes = it->second.data();
+  std::cout << "getting sizes from tensor " << tensor << " with dim "
+            << tensor->dim() << std::endl;
+  for (int i = 0; i < tensor->dim(); i++) {
+    std::cout << "size " << i << " = " << (*ret_sizes)[i] << std::endl;
+  }
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_storage_size(
+    AOTITensorHandle tensor,
+    int64_t* ret_size) {
+  throw std::runtime_error("Cannot get storage size on ETensor");
+}
+
+AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor,
+    int32_t layout,
+    const uint8_t* opaque_metadata,
+    int64_t opaque_metadata_size) {
+  throw std::runtime_error("Not creating Tensor from blob here");
+}
+
+AOTITorchError aoti_torch_create_cuda_stream_guard(
+    void* stream,
+    int32_t device_index,
+    CUDAStreamGuardHandle* ret_guard) {
+  std::cout << "Entering stream guard for device " << device_index << std::endl;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_delete_cuda_stream_guard(
+    CUDAStreamGuardHandle guard) {
+  std::cout << "Exiting stream guard" << std::endl;
+  return Error::Ok;
+}
+
+int aoti_torch_device_type_cpu() {
+  // Let's say cpu is 0 for ET as well
+  return 0;
+}
+
+__attribute__((__visibility__("default"))) int32_t
+aoti_torch_device_type_cuda() {
+  // Let's say cuda is 1 for ET as well
+  return 1;
+}
+
+__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() {
+  // Let's assume this dtype is all we will support
+  return 6;
+}
+
+AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
+  std::cout << "Deleting " << tensor << std::endl;
+  for (auto it = tensors.begin(); it != tensors.end(); ++it) {
+    if (it->get() == tensor) {
+      tensors.erase(it);
+      break; // Exit the loop once the element is found and removed
+    }
+  }
+  return Error::Ok;
+}
+AOTITorchError aoti_torch_create_tensor_from_blob(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor) {
+  throw std::runtime_error("Should never create from blob");
+}
+
+AOTITorchError aoti_torch_empty_strided(
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor) {
+  // This requires us to reserve CUDA memory and put it into an ETensor
+  void* ptr;
+  int64_t numel = 1;
+  for (int i = 0; i < ndim; i++) {
+    numel *= sizes_ptr[i];
+  }
+
+  if (dtype != 6) { // throw if not float32
+    throw std::runtime_error("Need to implement empty_strided for non-float32");
+  }
+
+  int64_t nbytes = numel * 4;
+
+  if (device_type == 1) { // cuda
+    std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl;
+    cudaError_t err = cudaMalloc(&ptr, nbytes);
+    if (err != cudaSuccess) {
+      std::cout << "failed to allocate " << nbytes << std::endl;
+      throw std::runtime_error("Failed to call cudaMalloc");
+    }
+  } else if (device_type == 0) { // cpu
+    std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl;
+    ptr = malloc(nbytes);
+    if (ptr == nullptr) {
+      throw std::runtime_error("Failed to call malloc");
+    }
+  } else {
+    throw std::runtime_error(
+        "Need to implement empty_strided for non-CUDA non-CPU");
+  }
+  std::cout << "Allocated " << nbytes << " bytes at " << ptr << ", sizes_ptr "
+            << sizes_ptr << std::endl;
+
+  // ETensor sizes
+  std::vector<executorch::aten::SizesType> sizes(ndim);
+  for (int i = 0; i < ndim; i++) {
+    sizes[i] = sizes_ptr[i];
+  }
+  // ETensor creation
+  auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr);
+
+  // Store the tensor
+  tensors.insert(tensor);
+
+  std::cout << "sizes.data(): " << sizes.data()
+            << ", tensor->sizes().data(): " << tensor->sizes().data()
+            << std::endl;
+  std::cout << "Size[0] of tensor " << tensor.get() << " is "
+            << tensor->sizes()[0] << std::endl;
+  *ret_new_tensor = tensor.get();
+  return Error::Ok;
+}
+
+void checkCudaError(cudaError_t err, const char* msg) {
+  if (err != cudaSuccess) {
+    std::cerr << "Error: " << msg << " (" << cudaGetErrorString(err) << ")"
+              << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+AOTITorchError aoti_torch_copy_(
+    AOTITensorHandle self,
+    AOTITensorHandle src,
+    int32_t non_blocking) {
+  // check if size is the same
+  if (self->dim() != src->dim()) {
+    std::cout << "self.dim() " << self->dim() << ", src.dim() " << src->dim()
+              << std::endl;
+    throw std::runtime_error("self.dim() != src.dim()");
+  }
+  std::cout << "self->data_ptr(): " << self->data_ptr()
+            << " sizes: " << self->sizes().data() << std::endl;
+  std::cout << "src->data_ptr(): " << src->data_ptr()
+            << " sizes: " << src->sizes().data() << std::endl;
+  for (int i = 0; i < self->dim(); i++) {
+    if (self->sizes()[i] != src->sizes()[i]) {
+      std::cout << "self.sizes()[i] " << self->sizes()[i] << ", src.sizes()[i] "
+                << src->sizes()[i] << std::endl;
+      throw std::runtime_error("size mismatch");
+    }
+  }
+
+  int size = src->nbytes();
+  // should check for device
+  cudaPointerAttributes srcAttributes, dstAttributes;
+  cudaError_t err;
+  // Get attributes of the source pointer
+  err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr());
+  checkCudaError(err, "Failed to get source pointer attributes");
+  // Get attributes of the destination pointer
+  err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr());
+  checkCudaError(err, "Failed to get destination pointer attributes");
+  bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice;
+  bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice;
+  // Determine the memory locations and perform the appropriate copy
+  if (srcIsDevice && dstIsDevice) {
+    // Device to Device copy
+    err = cudaMemcpy(
+        self->mutable_data_ptr(),
+        src->data_ptr(),
+        size,
+        cudaMemcpyDeviceToDevice);
+    checkCudaError(err, "Failed to copy from device to device");
+  } else if (srcIsDevice && !dstIsDevice) {
+    // Device to Host copy
+    err = cudaMemcpy(
+        self->mutable_data_ptr(),
+        src->data_ptr(),
+        size,
+        cudaMemcpyDeviceToHost);
+    std::cout << "Device to Host copy, self data: "
+              << ((float*)self->data_ptr())[0] << std::endl;
+    checkCudaError(err, "Failed to copy from device to host");
+  } else if (!srcIsDevice && dstIsDevice) {
+    // Host to Device copy
+    err = cudaMemcpy(
+        self->mutable_data_ptr(),
+        src->data_ptr(),
+        size,
+        cudaMemcpyHostToDevice);
+    std::cout << "Host to Device copy, src data: "
+              << ((float*)src->data_ptr())[0] << std::endl;
+    checkCudaError(err, "Failed to copy from host to device");
+  } else if (!srcIsDevice && !dstIsDevice) {
+    // Host to Host copy
+    std::cout << "Host to Host copy, src data: " << ((float*)src->data_ptr())[0]
+              << std::endl;
+    std::memcpy(self->mutable_data_ptr(), src->data_ptr(), size);
+  } else {
+    std::cerr << "Error: Unknown memory type. self: " << dstAttributes.type
+              << ", src: " << srcAttributes.type << std::endl;
+    throw std::runtime_error("Unknown memory type");
+  }
+  // print first value of src and self
+  return Error::Ok;
+}
+}
+
+struct AOTIDelegateHandle {
+  void* so_handle;
+  AOTInductorModelContainerHandle container_handle;
+};
+
+class AOTIBackend final : public ::executorch::runtime::BackendInterface {
+ public:
+  // Once in program
+  AOTIBackend() {
+    ET_LOG(Info, "AOTIBackend ctor");
+  }
+
+  bool is_available() const override {
+    return true;
+  }
+
+  // Once per loaded binary blob
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed, // This will be the buffer from aoti_backend
+      ArrayRef<CompileSpec> compile_specs // This will be my empty list
+      ) const override {
+    // We could load the .so content directly. But I don't want to deal with
+    // relocation. So dumping a file and using dlopen
+
+    // Create a temporary file
+    std::ofstream outfile("/tmp/test.so", std::ios::binary);
+
+    // Write the ELF buffer to the temporary file
+    outfile.write((char*)processed->data(), processed->size());
+
+    // Finish writing the file to disk
+    outfile.close();
+
+    // Free the in-memory buffer
+    processed->Free();
+
+    // Load the ELF using dlopen
+    void* so_handle = dlopen("/tmp/test.so", RTLD_LAZY | RTLD_LOCAL);
+    if (so_handle == nullptr) {
+      std::cout << dlerror() << std::endl;
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerCreateWithDevice =
+        reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
+    if (AOTInductorModelContainerCreateWithDevice == nullptr) {
+      perror("dlsym1");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerDelete =
+        reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerDelete"));
+    if (AOTInductorModelContainerDelete == nullptr) {
+      perror("dlsym2");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerGetNumInputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
+    if (AOTInductorModelContainerGetNumInputs == nullptr) {
+      perror("dlsym3");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerGetNumOutputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
+    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
+      perror("dlsym4");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerRun =
+        reinterpret_cast<AOTInductorModelContainerRunFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerRun"));
+    if (AOTInductorModelContainerRun == nullptr) {
+      perror("dlsym5");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerHandle container_handle = nullptr;
+
+    AOTIRuntimeError err;
+
+    err = AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "cuda", nullptr);
+    printf("container_handle=%p\n", container_handle);
+
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->container_handle = container_handle;
+    return (DelegateHandle*)handle; // Return the handle post-processing
+  }
+
+  // Once per execution
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* handle_,
+      EValue** args) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    size_t num_inputs;
+    AOTInductorModelContainerGetNumInputs(
+        handle->container_handle, &num_inputs);
+
+    size_t num_outputs;
+    AOTInductorModelContainerGetNumOutputs(
+        handle->container_handle, &num_outputs);
+
+    std::vector<AOTITensorHandle> inputs(num_inputs);
+    std::vector<AOTITensorHandle> outputs(num_outputs);
+
+    for (int i = 0; i < num_inputs; i++) {
+      auto tensor_in = args[i]->toTensor();
+      inputs[i] = &tensor_in;
+    }
+
+    for (int i = num_inputs; i < num_inputs + num_outputs; i++) {
+      auto tensor_out = args[i]->toTensor();
+      outputs[i - num_inputs] = &tensor_out;
+    }
+
+    AOTInductorModelContainerRun(
+        handle->container_handle,
+        inputs.data(),
+        num_inputs,
+        outputs.data(),
+        num_outputs,
+        // Should these last two be something?
+        nullptr,
+        nullptr);
+
+    // Still need to copy the output to args, because they are malloc'ed but
+    // not using the data_ptr from outputs.
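+    // (aoti_torch_copy_ resolves each pointer's location via
+    // cudaPointerGetAttributes, so this final loop also moves device results
+    // back into the runtime-owned output buffers.)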
+    for (int i = 0; i < num_outputs; i++) {
+      auto args_out = args[i + num_inputs]->toTensor();
+      aoti_torch_copy_(&args_out, outputs[i], 0);
+    }
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle_) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+    // Destroy the container while its code is still mapped, then unload.
+    AOTInductorModelContainerDelete(handle->container_handle);
+    dlclose(handle->so_handle);
+    delete handle;
+    tensor_to_sizes.clear();
+    tensor_to_strides.clear();
+  }
+};
+
+} // namespace aoti
+
+namespace {
+auto cls = aoti::AOTIBackend();
+executorch::runtime::Backend backend{"AotiBackend", &cls};
+static executorch::runtime::Error success_with_compiler =
+    register_backend(backend);
+} // namespace
+
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/TARGETS b/backends/aoti/runtime/TARGETS
new file mode 100644
index 00000000000..77871de4469
--- /dev/null
+++ b/backends/aoti/runtime/TARGETS
@@ -0,0 +1,3 @@
+load("targets.bzl", "define_common_targets")
+
+define_common_targets()
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl
new file mode 100644
index 00000000000..d51097f306d
--- /dev/null
+++ b/backends/aoti/runtime/targets.bzl
@@ -0,0 +1,18 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    runtime.cxx_library(
+        name = "aoti_backend",
+        srcs = ["AotiBackend.cpp"],
+        headers = [],
+        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+        link_whole = True,
+        supports_python_dlopen = True,
+        # Constructor needed for backend registration.
+        compiler_flags = ["-Wno-global-constructors"],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        deps = [
+            "//executorch/runtime/backend:interface",
+            "//executorch/runtime/core:core",
+        ],
+    )
diff --git a/install_requirements.py b/install_requirements.py
index 409ed083970..0e0084fe3dd 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -12,7 +12,6 @@
 
 from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible
 
-# The pip repository that hosts nightly torch packages.
 # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly" diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 6c27e8ba616..ba9a686ccb9 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -53,6 +53,7 @@ set(EXECUTORCH_FOUND ON) include("${CMAKE_CURRENT_LIST_DIR}/ExecuTorchTargets.cmake") set(optional_lib_list + aoti_backend flatccrt etdump bundled_program diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index fb0dc0a4ade..6911aea3e9b 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -160,6 +160,11 @@ define_overridable_option( OFF ) +define_overridable_option( + EXECUTORCH_BUILD_AOTI "Build the AOTI backend" BOOL OFF +) + + if(EXECUTORCH_BUILD_ARM_BAREMETAL) set(_default_executorch_build_pthreadpool OFF) set(_default_executorch_build_cpuinfo OFF) @@ -317,6 +322,10 @@ check_required_options_on( EXECUTORCH_BUILD_PTHREADPOOL ) +check_required_options_on( + IF_ON EXECUTORCH_BUILD_AOTI REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR +) + check_conflicting_options_on( IF_ON EXECUTORCH_BUILD_ARM_BAREMETAL CONFLICTS_WITH EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO From 687688b0886850881b51897a3c8fb766185d2a77 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 5 Aug 2025 23:06:56 -0700 Subject: [PATCH 08/50] Add export_add.py --- backends/aoti/aoti_partitioner.py | 2 +- exir/backend/backend_api.py | 3 ++ exir/lowered_backend_module.py | 1 + export_add.py | 31 +++++++++++++++++++ install_requirements.py | 50 +++++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 export_add.py diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index 6cb7c6cc38a..e1524480698 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -28,7 +28,7 @@ class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: supported = node.op == "call_function" and node.target in [ exir_ops.edge.aten.add.Tensor, - exir_ops.edge.aten._to_copy.default, + exir_ops.edge.dim_order_ops._to_dim_order_copy.default, ] return supported diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index dd8d97d66ac..95c7c9caa6d 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -720,6 +720,9 @@ def to_backend( fake_edge_program = copy.deepcopy(edge_program) partitioner_result = partitioner_instance(fake_edge_program) tagged_exported_program = partitioner_result.tagged_exported_program + # Make sure tagged_exported_program has the same example_inputs as edge_program + tagged_exported_program.example_inputs = edge_program.example_inputs + method_to_tagged_exported_program[method_name] = tagged_exported_program # Check that the partitioner did not modify the original graph diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 61414990703..2e889c6d81d 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -735,6 +735,7 @@ def create_exported_program_from_submodule( ), ) ], + example_inputs=owning_program.example_inputs, constants=subgraph_constants, verifiers=[owning_program.verifier], ), diff --git a/export_add.py b/export_add.py new file mode 100644 index 00000000000..cfaf9ab1c96 --- /dev/null +++ b/export_add.py @@ -0,0 +1,31 @@ +import torch +from executorch.backends.aoti.aoti_partitioner import AotiPartitioner +from 
executorch.exir import to_edge +from torch.export import export + + +# Start with a PyTorch model that adds two input tensors (matrices) +class Add(torch.nn.Module): + def __init__(self): + super(Add, self).__init__() + + def forward(self, x: torch.Tensor, y: torch.Tensor): + # return triton_transpose_acc(x, y) + return (x.cuda() + y.cuda()).cpu() + + +# 1. torch.export: Defines the program with the ATen operator set. +aten_dialect = export( + Add(), (torch.ones(10, device="cpu"), torch.ones(10, device="cpu")) +) +# 2. to_edge: Make optimizations for Edge devices +edge_program = to_edge(aten_dialect) + +edge_program = edge_program.to_backend(AotiPartitioner([])) + +# 3. to_executorch: Convert the graph to an ExecuTorch program +executorch_program = edge_program.to_executorch() + +# 4. Save the compiled .pte program +with open("add.pte", "wb") as file: + file.write(executorch_program.buffer) diff --git a/install_requirements.py b/install_requirements.py index 0e0084fe3dd..7e6d3010f93 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -12,8 +12,58 @@ from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible +<<<<<<< HEAD # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled. TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly" +======= +def python_is_compatible(): + # Scrape the version range from pyproject.toml, which should be in the current directory. + version_specifier = None + with open("pyproject.toml", "r") as file: + for line in file: + if line.startswith("requires-python"): + match = re.search(r'"([^"]*)"', line) + if match: + version_specifier = match.group(1) + break + + if not version_specifier: + print( + "WARNING: Skipping python version check: version range not found", + file=sys.stderr, + ) + return False + + # Install the packaging module if necessary. + try: + import packaging + except ImportError: + subprocess.run( + [sys.executable, "-m", "pip", "install", "packaging"], check=True + ) + # Compare the current python version to the range in version_specifier. Exits + # with status 1 if the version is not compatible, or with status 0 if the + # version is compatible or the logic itself fails. + try: + import packaging.specifiers + import packaging.version + + python_version = packaging.version.parse(platform.python_version()) + version_range = packaging.specifiers.SpecifierSet(version_specifier) + if python_version not in version_range: + print( + f'ERROR: ExecuTorch does not support python version {python_version}: must satisfy "{version_specifier}"', + file=sys.stderr, + ) + return False + except Exception as e: + print(f"WARNING: Skipping python version check: {e}", file=sys.stderr) + return True + + +# The pip repository that hosts nightly torch packages. 
+TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cu126" +>>>>>>> fe438f9c92 (Add export_add.py) # Supported CUDA versions - modify this to add/remove supported versions # Format: tuple of (major, minor) version numbers From 0ce692887dd2f7d0e188790361846566988bcba4 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Fri, 8 Aug 2025 00:19:57 -0700 Subject: [PATCH 09/50] prototype e2e works on latest ET --- CMakeLists.txt | 2 ++ backends/aoti/CMakeLists.txt | 31 +++++++++++++++++---------- backends/aoti/aoti_backend.py | 2 +- backends/aoti/runtime/AotiBackend.cpp | 1 - export_and_run_aoti.sh | 11 ++++++++++ requirements-dev.txt | 1 + 6 files changed, 35 insertions(+), 13 deletions(-) create mode 100644 export_and_run_aoti.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 4497fa133c0..f98246e1851 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,8 @@ # https://github.com/google/XNNPACK/commit/c690daa67f883e1b627aadf7684c06797e9a0684 cmake_minimum_required(VERSION 3.29) project(executorch) +# project(executorch LANGUAGES CXX CUDA) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 12886bc0cac..36059f16fe4 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -21,27 +21,36 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +# include(${EXECUTORCH_ROOT}/build/Utils.cmake) -find_package(CUDA) - -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +find_package(CUDAToolkit REQUIRED) set(_aoti_sources runtime/AotiBackend.cpp) - add_library(aoti_backend STATIC ${_aoti_sources}) target_include_directories( - aoti_backend PUBLIC ${_common_include_directories} ${CUDA_INCLUDE_DIRS} + aoti_backend + PUBLIC + ${CUDAToolkit_INCLUDE_DIRS} + $ + $ ) - target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC) -target_link_libraries(aoti_backend PUBLIC extension_tensor ${CUDA_LIBRARIES}) +# Ensure symbols are exported properly +target_link_options(aoti_backend PUBLIC -Wl,--export-dynamic) +# Link against CUDA::cudart (the CUDA runtime library) +target_link_libraries( + aoti_backend + PUBLIC + extension_tensor + CUDA::cudart + ${CMAKE_DL_LIBS} +) +# If you need other CUDA libraries, link them similarly: +# target_link_libraries(aoti_backend PUBLIC CUDA::cublas CUDA::cufft ...) 
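+# -Wl,--export-dynamic above is load-bearing: the dlopen()-ed AOTI kernel .so +# resolves the aoti_torch_* shims from the host process, so those symbols must +# stay in the dynamic symbol table. A sketch of a consumer target (illustrative +# names, not part of this build): +# add_executable(my_runner main.cpp) +# target_link_libraries(my_runner PRIVATE executorch aoti_backend CUDA::cudart)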
+# Keep this call if executorch_target_link_options_shared_lib is defined in your build executorch_target_link_options_shared_lib(aoti_backend) - install( TARGETS aoti_backend EXPORT ExecuTorchTargets DESTINATION lib - INCLUDES - DESTINATION ${_common_include_directories} ) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index d1e8a5b4896..b9491f023e3 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -34,7 +34,7 @@ def preprocess( so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] print(so_path) check_call( - f"patchelf --remove-needed libtorch.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", + f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", shell=True, ) diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp index 0044a4155d6..61b54cdc554 100644 --- a/backends/aoti/runtime/AotiBackend.cpp +++ b/backends/aoti/runtime/AotiBackend.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include #include diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh new file mode 100644 index 00000000000..7113e44dfe5 --- /dev/null +++ b/export_and_run_aoti.sh @@ -0,0 +1,11 @@ +./install_executorch.sh +python export_add.py +./install_executorch.sh --clean +mkdir -p cmake-out +cd cmake-out +cmake -DEXECUTORCH_BUILD_AOTI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + .. +cd .. +cmake --build cmake-out -j9 +./cmake-out/executor_runner --model_path add.pte diff --git a/requirements-dev.txt b/requirements-dev.txt index 9df5e7b93ed..8c8f518a5ea 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,3 +9,4 @@ zstd # Imported by resolve_buck.py. certifi # Imported by resolve_buck.py.
lintrunner==0.12.7 lintrunner-adapters==0.12.6 +patchelf From 9afb4ac62620d8b974b00617981330b5b824349e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 11 Aug 2025 13:01:06 -0700 Subject: [PATCH 10/50] hacky support .so file separation by hardcoding file path --- backends/aoti/aoti_backend.py | 18 ++++++++++++------ backends/aoti/runtime/AotiBackend.cpp | 22 +++++++++++++--------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index b9491f023e3..c0691d5c075 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -16,7 +16,8 @@ PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec - +import os +import shutil @final class AotiBackend(BackendDetails): @@ -31,13 +32,18 @@ def preprocess( copy_edge_program = copy.deepcopy(edge_program) graph_module = copy_edge_program.graph_module args, kwargs = copy_edge_program.example_inputs - so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] - print(so_path) + temp_so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] + so_path = os.path.join(os.getcwd(), 'aoti.so') + print("so_path after aot_compile: ", temp_so_path) + print("so path we will using ", so_path) + shutil.copyfile(temp_so_path, so_path) + check_call( f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", shell=True, ) - with open(so_path, "rb") as f: - data = f.read() - return PreprocessResult(data) + # with open(so_path, "rb") as f: + # data = f.read() + + return PreprocessResult(so_path.encode("utf-8")) diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp index 61b54cdc554..42c58394f22 100644 --- a/backends/aoti/runtime/AotiBackend.cpp +++ b/backends/aoti/runtime/AotiBackend.cpp @@ -430,20 +430,24 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { // We could load the .so content directly. But I don't want to deal with // relocation.
So dumping a file and using dlopen - // Create a temporary file - std::ofstream outfile("/tmp/test.so", std::ios::binary); + // // Create a temporary file + // std::ofstream outfile("/tmp/test.so", std::ios::binary); - // Write the ELF buffer to the temporary file - outfile.write((char*)processed->data(), sizeof(void*) * processed->size()); + // // Write the ELF buffer to the temporary file + // outfile.write((char*)processed->data(), sizeof(void*) * processed->size()); - // Finish writing the file to disk - outfile.close(); + // // Finish writing the file to disk + // outfile.close(); - // Free the in-memory buffer - processed->Free(); + // // Free the in-memory buffer + // processed->Free(); + + const char* so_path = static_cast(processed->data()); + + printf("so path: %s\n", so_path); // Load the ELF using dlopen - void* so_handle = dlopen("/tmp/test.so", RTLD_LAZY | RTLD_LOCAL); + void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); if (so_handle == nullptr) { std::cout << dlerror() << std::endl; return Error::AccessFailed; } From cef27e1254e85d9dfeb866503db5a4a8c8fa7e95 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 11 Aug 2025 13:04:43 -0700 Subject: [PATCH 11/50] hacky support .so file separation by hardcoding file path --- backends/aoti/aoti_backend.py | 3 --- backends/aoti/runtime/AotiBackend.cpp | 17 ++--------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index c0691d5c075..2653820e914 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -43,7 +43,4 @@ def preprocess( shell=True, ) - # with open(so_path, "rb") as f: - # data = f.read() - return PreprocessResult(so_path.encode("utf-8")) diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp index 42c58394f22..5a945403b3a 100644 --- a/backends/aoti/runtime/AotiBackend.cpp +++ b/backends/aoti/runtime/AotiBackend.cpp @@ -427,21 +427,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { FreeableBuffer* processed, // This will be the buffer from aoti_backend ArrayRef compile_specs // This will be my empty list ) const override { -
So dumping a file and using dlopen - - // // Create a temporary file - // std::ofstream outfile("/tmp/test.so", std::ios::binary); - - // // Write the ELF buffer to the temporary file - // outfile.write((char*)processed->data(), sizeof(void*) * processed->size()); - - // // Finish writing the file to disk - // outfile.close(); - - // // Free the in-memory buffer - // processed->Free(); - const char* so_path = static_cast(processed->data()); printf("so path: %s\n", so_path); @@ -453,6 +438,8 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { return Error::AccessFailed; } + processed->Free(); + AOTInductorModelContainerCreateWithDevice = reinterpret_cast( dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); From c85fc4278734bd3486b0aa0afff2d7659e849a4e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 12 Aug 2025 14:11:22 -0700 Subject: [PATCH 12/50] support latest backend interface --- backends/aoti/aoti_backend.py | 7 ++++--- backends/aoti/aoti_partitioner.py | 10 ++++++---- backends/aoti/runtime/AotiBackend.cpp | 3 ++- export_add.py | 2 +- export_and_run_aoti.sh | 4 ++-- export_mv2.py | 28 +++++++++++++++++++++++++++ 6 files changed, 43 insertions(+), 11 deletions(-) create mode 100644 export_mv2.py diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index 2653820e914..efc61006fc2 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. import copy +import os +import shutil from subprocess import check_call from typing import final, List @@ -16,8 +18,7 @@ PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec -import os -import shutil + @final class AotiBackend(BackendDetails): @@ -33,7 +34,7 @@ def preprocess( graph_module = copy_edge_program.graph_module args, kwargs = copy_edge_program.example_inputs temp_so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] - so_path = os.path.join(os.getcwd(), 'aoti.so') + so_path = os.path.join(os.getcwd(), "aoti.so") print("so_path after aot_compile: ", temp_so_path) print("so path we will using ", so_path) shutil.copyfile(temp_so_path, so_path) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index e1524480698..836a8fcb8c4 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -23,13 +23,15 @@ from torch.fx.passes.operator_support import OperatorSupportBase +supported_fallback_operators = [] + class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: - supported = node.op == "call_function" and node.target in [ - exir_ops.edge.aten.add.Tensor, - exir_ops.edge.dim_order_ops._to_dim_order_copy.default, - ] + supported = ( + node.op == "call_function" + and node.target not in supported_fallback_operators + ) return supported diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp index 5a945403b3a..94e15f0596f 100644 --- a/backends/aoti/runtime/AotiBackend.cpp +++ b/backends/aoti/runtime/AotiBackend.cpp @@ -50,6 +50,7 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; using executorch::runtime::etensor::Tensor; +using executorch::runtime::Span; extern "C" { using AOTITensorHandle = Tensor*; @@ -494,7 +495,7 @@ class AOTIBackend final : public 
::executorch::runtime::BackendInterface { Error execute( BackendExecutionContext& context, DelegateHandle* handle_, - EValue** args) const override { + Span args) const override { AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; size_t num_inputs; diff --git a/export_add.py b/export_add.py index cfaf9ab1c96..d0d2489b885 100644 --- a/export_add.py +++ b/export_add.py @@ -27,5 +27,5 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): executorch_program = edge_program.to_executorch() # 4. Save the compiled .pte program -with open("add.pte", "wb") as file: +with open("aoti_model.pte", "wb") as file: file.write(executorch_program.buffer) diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 7113e44dfe5..01c023f0e8f 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -1,5 +1,5 @@ ./install_executorch.sh -python export_add.py +python $1 ./install_executorch.sh --clean mkdir -p cmake-out cd cmake-out @@ -8,4 +8,4 @@ cmake -DEXECUTORCH_BUILD_AOTI=ON \ .. cd .. cmake --build cmake-out -j9 -./cmake-out/executor_runner --model_path add.pte +./cmake-out/executor_runner --model_path aoti_model.pte diff --git a/export_mv2.py b/export_mv2.py new file mode 100644 index 00000000000..fa84084088f --- /dev/null +++ b/export_mv2.py @@ -0,0 +1,28 @@ +import torch +from executorch.backends.aoti.aoti_partitioner import AotiPartitioner +from executorch.examples.models.mobilenet_v2 import MV2Model +from executorch.exir import to_edge +from torch.export import export +from torchvision import models +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights + +mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights) +mv2 = mv2.eval() + +model_inputs = (torch.randn(1, 3, 224, 224),) + + +# 1. torch.export: Defines the program with the ATen operator set. +aten_dialect = export(mv2, model_inputs) + +# 2. to_edge: Make optimizations for Edge devices +edge_program = to_edge(aten_dialect) + +edge_program = edge_program.to_backend(AotiPartitioner([])) + +# 3. to_executorch: Convert the graph to an ExecuTorch program +executorch_program = edge_program.to_executorch() + +# 4. 
Save the compiled .pte program +with open("aoti_model.pte", "wb") as file: + file.write(executorch_program.buffer) From 2ab79f09f042c5b66f7b5bfd2519509ce2ff7302 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 19 Aug 2025 15:45:46 -0700 Subject: [PATCH 13/50] temp submit for execute model with weight --- backends/aoti/CMakeLists.txt | 6 +- backends/aoti/aoti_backend.py | 28 +- backends/aoti/aoti_partitioner.py | 180 +- backends/aoti/runtime/AotiBackend.cpp | 561 -- backends/aoti/runtime/aoti_backend.cpp | 302 + .../aoti/runtime/aoti_model_container.cpp | 34 + backends/aoti/runtime/aoti_model_container.h | 91 + backends/aoti/runtime/shims/memory.cpp | 389 ++ backends/aoti/runtime/shims/memory.h | 102 + .../aoti/runtime/shims/tensor_attribute.cpp | 137 + .../aoti/runtime/shims/tensor_attribute.h | 76 + backends/aoti/runtime/targets.bzl | 13 +- ...ky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin | Bin 0 -> 11320 bytes ...rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin | Bin 0 -> 10048 bytes ...x5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin | Bin 0 -> 10816 bytes ...ci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin | Bin 0 -> 10176 bytes ...c3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp | 6 + ...3am2yslkkhyp4e7oaf7ej.kernel_metadata.json | 1 + ...vspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin | Bin 0 -> 11320 bytes ...6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin | Bin 0 -> 10936 bytes ...xauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin | Bin 0 -> 11320 bytes ...47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin | Bin 0 -> 10944 bytes ...ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin | Bin 0 -> 11320 bytes ...2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp | 965 +++ ...kmcvkgx3hnjvysymcgms.wrapper_metadata.json | 1 + ...nhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin | Bin 0 -> 10936 bytes ...jd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp | 6144 +++++++++++++++++ ...6ndtpaca5r3ct3piucq7.wrapper_metadata.json | 1 + ...x6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin | Bin 0 -> 11320 bytes ...f6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp | 965 +++ ...x5fugystcn2wozmmxwaf.wrapper_metadata.json | 1 + ...b7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin | Bin 0 -> 8968 bytes ...retoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin | Bin 0 -> 9784 bytes ...ugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin | Bin 0 -> 11320 bytes ...2j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin | Bin 0 -> 13832 bytes ...hvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin | Bin 0 -> 11320 bytes ...6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin | Bin 0 -> 10296 bytes ...kxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin | Bin 0 -> 13240 bytes ...lgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp | 6 + ...uw6cqsbpvx756nf43k7mq.kernel_metadata.json | 1 + ...ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin | Bin 0 -> 9528 bytes ...ksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp | 6 + ...4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json | 1 + ...rsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin | Bin 0 -> 9528 bytes ...pudstbhsobm3wlczsly46p5oeax43spr3eab.cubin | Bin 0 -> 21056 bytes ...reaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin | Bin 0 -> 10296 bytes ...rqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin | Bin 0 -> 11656 bytes ...oylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin | Bin 0 -> 10296 bytes ...wncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin | Bin 0 -> 15624 bytes ...47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin | Bin 0 -> 10816 bytes ...zkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin | Bin 0 -> 11320 bytes ...jkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin | Bin 0 -> 9528 bytes ...zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin | Bin 0 -> 11400 bytes ...zc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin | 
Bin 0 -> 13832 bytes ...vkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin | Bin 0 -> 6280 bytes exir/program/_program.py | 2 + export_add.py | 31 - export_and_run_aoti.sh | 141 +- export_aoti.py | 178 + export_mv2.py | 28 - runtime/executor/method.cpp | 2 + 61 files changed, 9754 insertions(+), 645 deletions(-) delete mode 100644 backends/aoti/runtime/AotiBackend.cpp create mode 100644 backends/aoti/runtime/aoti_backend.cpp create mode 100644 backends/aoti/runtime/aoti_model_container.cpp create mode 100644 backends/aoti/runtime/aoti_model_container.h create mode 100644 backends/aoti/runtime/shims/memory.cpp create mode 100644 backends/aoti/runtime/shims/memory.h create mode 100644 backends/aoti/runtime/shims/tensor_attribute.cpp create mode 100644 backends/aoti/runtime/shims/tensor_attribute.h create mode 100644 c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin create mode 100644 c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin create mode 100644 c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin create mode 100644 c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin create mode 100644 c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp create mode 100644 c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json create mode 100644 c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin create mode 100644 c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin create mode 100644 c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin create mode 100644 cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin create mode 100644 caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin create mode 100644 ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp create mode 100644 ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json create mode 100644 cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin create mode 100644 ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp create mode 100644 ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json create mode 100644 cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin create mode 100644 cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp create mode 100644 cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json create mode 100644 cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin create mode 100644 cgpouheql4rpwtcaretoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin create mode 100644 cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin create mode 100644 ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin create mode 100644 cklg2ezqvtkbhlekhvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin create mode 100644 ckneyyhrfy6dkwkb6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin create mode 100644 cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin create mode 100644 cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp create mode 100644 cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json create mode 100644 cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin create mode 100644 cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp create mode 100644 cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json create mode 100644 crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin create mode 100644 csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin create mode 100644 
ctc4njxfwewhkkjkreaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin create mode 100644 cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin create mode 100644 cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin create mode 100644 cwvumepeeo7fjwjgwncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin create mode 100644 cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin create mode 100644 cxn357cdpjzfyhgfzkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin create mode 100644 cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin create mode 100644 cxzopurug2u2kff3zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin create mode 100644 cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin create mode 100644 czj7vvfy745m4rwqvkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin delete mode 100644 export_add.py create mode 100644 export_aoti.py delete mode 100644 export_mv2.py diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 36059f16fe4..1c596fef6e6 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -25,7 +25,11 @@ endif() find_package(CUDAToolkit REQUIRED) -set(_aoti_sources runtime/AotiBackend.cpp) +set(_aoti_sources + runtime/aoti_backend.cpp + runtime/aoti_model_container.cpp + runtime/shims/memory.cpp + runtime/shims/tensor_attribute.cpp) add_library(aoti_backend STATIC ${_aoti_sources}) target_include_directories( aoti_backend diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index efc61006fc2..a0c4a2aa005 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -7,6 +7,7 @@ import copy import os import shutil +import typing from subprocess import check_call from typing import final, List @@ -29,18 +30,29 @@ def preprocess( ) -> PreprocessResult: print("entering the lowerable parts in AotiBackend.preprocess....") - print("here", edge_program.example_inputs) + # print("here", edge_program.example_inputs) copy_edge_program = copy.deepcopy(edge_program) - graph_module = copy_edge_program.graph_module + # graph_module = copy_edge_program.graph_module + edge_program_module = copy_edge_program.module() args, kwargs = copy_edge_program.example_inputs - temp_so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] - so_path = os.path.join(os.getcwd(), "aoti.so") - print("so_path after aot_compile: ", temp_so_path) - print("so path we will using ", so_path) - shutil.copyfile(temp_so_path, so_path) + # print("args, kwargs", args, kwargs) + print("len(args)", len(args)) + print("args[0].shape", args[0].shape) + print("len(kwargs)", len(kwargs)) + + output_path = os.path.join(os.getcwd(), "aoti.so") + + options: dict[str, typing.Any] = { + "aot_inductor.package_constants_in_so": True, + "aot_inductor.output_path": output_path, + } + + so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] + + assert so_path == output_path, f"Expected {output_path} but got {so_path}" check_call( - f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", + f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {output_path}", shell=True, ) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index 836a8fcb8c4..f72b97f0253 100644 --- 
a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -6,6 +6,7 @@ # pyre-unsafe +import operator from typing import cast, final, List import torch @@ -25,14 +26,180 @@ supported_fallback_operators = [] +inductor_fallback_ops: dict[str, dict[str, list[str]]] = { + "aten._adaptive_avg_pool2d_backward.default": {}, + "aten._adaptive_avg_pool2d.default": {}, + "aten._adaptive_avg_pool3d_backward.default": {}, + "aten._adaptive_avg_pool3d.default": {}, + "aten._addmm_activation.default": {}, + "aten._cdist_backward.default": {}, + "aten._cdist_forward.default": {}, + "aten._cudnn_rnn.default": {}, + "aten._dyn_quant_matmul_4bit.default": {}, + "aten._dyn_quant_pack_4bit_weight.default": {}, + "aten._efficient_attention_backward.default": {}, + "aten._efficient_attention_forward.default": {}, + "aten._efficientzerotensor.default": {}, + "aten._embedding_bag_dense_backward.default": {}, + "aten._embedding_bag_forward_only.default": {}, + "aten._embedding_bag_per_sample_weights_backward.default": {}, + "aten._embedding_bag.default": {}, + "aten._fft_c2c.default": {}, + "aten._fft_r2c.default": {}, + "aten._flash_attention_backward.default": {}, + "aten._flash_attention_forward.default": {}, + "aten._fused_moving_avg_obs_fq_helper_functional.default": {}, + "aten._fused_moving_avg_obs_fq_helper.default": {}, + "aten._fused_rms_norm.default": {}, + "aten._histogramdd_from_bin_cts.default": {}, + "aten._int_mm.out": {}, + "aten._pdist_backward.default": {}, + "aten._pdist_forward.default": {}, + "aten._scaled_dot_product_attention_math_for_mps.default": {}, + "aten._scaled_dot_product_cudnn_attention_backward.default": {}, + "aten._scaled_dot_product_cudnn_attention.default": {}, + "aten._scaled_dot_product_efficient_attention_backward.default": {}, + "aten._scaled_dot_product_efficient_attention.default": {}, + "aten._scaled_dot_product_flash_attention_backward.default": {}, + "aten._scaled_dot_product_flash_attention_for_cpu_backward.default": {}, + "aten._scaled_dot_product_flash_attention_for_cpu.default": {}, + "aten._scaled_dot_product_flash_attention.default": {}, + "aten._scaled_dot_product_fused_attention_overrideable_backward.default": {}, + "aten._scaled_dot_product_fused_attention_overrideable.default": {}, + "aten._scaled_mm.default": {}, + "aten._scaled_mm.out": {}, + "aten._segment_reduce_backward.default": {}, + "aten._thnn_fused_lstm_cell.default": {}, + "aten._to_sparse.default": {}, + "aten._trilinear.default": {}, + "aten._weight_int4pack_mm.default": {}, + "aten._weight_int8pack_mm.default": {}, + "aten.abs.default": {}, + "aten.adaptive_max_pool2d_backward.default": {}, + "aten.adaptive_max_pool2d.default": {}, + "aten.adaptive_max_pool3d_backward.default": {}, + "aten.adaptive_max_pool3d.default": {}, + "aten.add.Scalar": {}, + "aten.add.Tensor": {}, + "aten.addbmm.default": {}, + "aten.addmm.out": {}, + "aten.addmv.default": {}, + "aten.angle.default": {}, + "aten.avg_pool2d_backward.default": {}, + "aten.avg_pool2d.default": {}, + "aten.avg_pool3d_backward.default": {}, + "aten.avg_pool3d.default": {}, + "aten.baddbmm.out": {}, + "aten.bernoulli_.float": {}, + "aten.bernoulli_.Tensor": {}, + "aten.bmm.out": {}, + "aten.bucketize.Tensor": {}, + "aten.cat.default": {}, + "aten.cholesky_inverse.default": {}, + "aten.cholesky_solve.default": {}, + "aten.convolution_backward.default": {}, + "aten.convolution.default": {}, + "aten.cummax.default": {}, + "aten.cummin.default": {}, + "aten.cumprod.default": {}, + "aten.cumsum.default": {}, + 
"aten.exponential.default": {}, + "aten.fill_.Scalar": {}, + "aten.fractional_max_pool2d_backward.default": {}, + "aten.fractional_max_pool2d.default": {}, + "aten.fractional_max_pool3d_backward.default": {}, + "aten.fractional_max_pool3d.default": {}, + "aten.gcd.default": {}, + "aten.geqrf.default": {}, + "aten.grid_sampler_2d_backward.default": {}, + "aten.hann_window.default": {}, + "aten.histc.default": {}, + "aten.histogram.bin_ct": {}, + "aten.index_put.default": {}, + "aten.index_reduce.default": {}, + "aten.index.Tensor": {}, + "aten.kthvalue.default": {}, + "aten.logcumsumexp.default": {}, + "aten.lu_unpack.default": {}, + "aten.masked_scatter_backward.default": {}, + "aten.masked_scatter.default": {}, + "aten.masked_select.default": {}, + "aten.max_pool2d_with_indices_backward.default": {}, + "aten.max_pool2d_with_indices.default": {}, + "aten.max_pool3d_with_indices_backward.default": {}, + "aten.max_pool3d_with_indices.default": {}, + "aten.max_unpool2d.default": {}, + "aten.max_unpool3d.default": {}, + "aten.median.default": {}, + "aten.mm.out": {}, + "aten.mode.default": {}, + "aten.mul.Scalar": {}, + "aten.mul.Tensor": {}, + "aten.nanmedian.default": {}, + "aten.narrow.default": {}, + "aten.native_dropout.default": {}, + "aten.nonzero.default": {}, + "aten.normal_functional.default": {}, + "aten.ormqr.default": {}, + "aten.pad.default": {}, + "aten.permute.default": {}, + "aten.polar.default": {}, + "aten.pow.Scalar": {}, + "aten.pow.Tensor_Scalar": {}, + "aten.pow.Tensor_Tensor": {}, + "aten.rand.default": {}, + "aten.rand.generator": {}, + "aten.randint.default": {}, + "aten.randint.generator": {}, + "aten.randint.low_out": {}, + "aten.randint.low": {}, + "aten.randn.default": {}, + "aten.randn.generator": {}, + "aten.randperm.default": {}, + "aten.repeat_interleave.Tensor": {}, + "aten.replication_pad1d_backward.default": {}, + "aten.replication_pad2d_backward.default": {}, + "aten.reshape.default": {}, + "aten.resize_.default": {}, + "aten.resize_as_.default": {}, + "aten.scatter_reduce.two_out": {}, + "aten.scatter.src_out": {}, + "aten.scatter.value_out": {}, + "aten.searchsorted.Scalar": {}, + "aten.searchsorted.Tensor": {}, + "aten.segment_reduce.default": {}, + "aten.set_.source_Tensor": {}, + "aten.slice.Tensor": {}, + "aten.soft_margin_loss_backward.default": {}, + "aten.sort.default": {}, + "aten.sort.stable": {}, + "aten.squeeze.dim": {}, + "aten.to_sparse.default": {}, + "aten.topk.default": {}, + "aten.triangular_solve.default": {}, + "aten.uniform.default": {}, + "aten.upsample_bicubic2d_backward.default": {}, + "aten.upsample_linear1d_backward.default": {}, + "aten.upsample_trilinear3d_backward.default": {}, + "aten.view_as_complex.default": {}, + "aten.view_as_real.default": {}, + "aten.view.dtype": {}, + "aten._weight_int4pack_mm_with_scales_and_zeros.default": {}, +} + class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: - supported = ( - node.op == "call_function" - and node.target not in supported_fallback_operators + supported = node.op == "call_function" and ( + node.target == operator.getitem + or node.target._op not in inductor_fallback_ops ) + # if node.op == "call_function" and node.target != operator.getitem: + # print(node.target._op) + # print(supported) + # print('------------------') + return supported def is_node_supported_custom(self, node: torch.fx.Node) -> bool: @@ -55,6 +222,8 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the 
CapabilityBasedPartitioner to return the largest possible # subgraphs containing the nodes with the tags # logger.info("AotiPartitioner::partition") + print("entering partitioner...") + partition_tags = {} capability_partitioner = CapabilityBasedPartitioner( @@ -63,6 +232,11 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: allows_single_node_partition=True, ) partition_list = capability_partitioner.propose_partitions() + + assert len(partition_list) == 1, "Graph break is not supported yet" + + print(f"graph breaks into {len(partition_list)} parts") + for partition in partition_list: for node in partition.nodes: tag = f"tag{partition.id}" diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp deleted file mode 100644 index 94e15f0596f..00000000000 --- a/backends/aoti/runtime/AotiBackend.cpp +++ /dev/null @@ -1,561 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include "cuda_runtime.h" - -#include -#include -#include -#include -#include -#include - -namespace executorch { -namespace backends { -namespace aoti { - -// Here is where the aoti bouncers are going to be defined. -// I define the globals aoti generated compiled code calls -// They can be backed by ET systems - -using namespace std; - -using executorch::aten::ScalarType; -using executorch::runtime::ArrayRef; -using executorch::runtime::Backend; -using executorch::runtime::BackendExecutionContext; -using executorch::runtime::BackendInitContext; -using executorch::runtime::CompileSpec; -using executorch::runtime::DelegateHandle; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::FreeableBuffer; -using executorch::runtime::MemoryAllocator; -using executorch::runtime::Result; -using executorch::runtime::etensor::Tensor; -using executorch::runtime::Span; - -extern "C" { -using AOTITensorHandle = Tensor*; - -// TODO: We should get a proper one -struct CUDAStreamGuardOpaque; -using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*; - -using AOTIRuntimeError = Error; -using AOTITorchError = Error; - -struct AOTInductorModelContainerOpaque; -using AOTInductorModelContainerHandle = AOTInductorModelContainerOpaque*; -using AOTInductorStreamHandle = void*; -using AOTIProxyExecutorHandle = void*; - -using AOTInductorModelContainerCreateWithDeviceFunc = AOTIRuntimeError (*)( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir); - -using AOTInductorModelContainerDeleteFunc = - AOTIRuntimeError (*)(AOTInductorModelContainerHandle container_handle); - -using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants); - -using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants); - -using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)( - AOTInductorModelContainerHandle container_handle, - AOTITensorHandle* input_handles, // array of input AOTITensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AOTITensorHandle* - output_handles, // array for writing 
output AOTITensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle); - -AOTInductorModelContainerCreateWithDeviceFunc - AOTInductorModelContainerCreateWithDevice = nullptr; -AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr; -AOTInductorModelContainerGetNumInputsFunc - AOTInductorModelContainerGetNumInputs = nullptr; -AOTInductorModelContainerGetNumOutputsFunc - AOTInductorModelContainerGetNumOutputs = nullptr; -AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr; -std::unordered_map> tensor_to_sizes; -std::unordered_map> tensor_to_strides; -std::unordered_set> tensors; - -int32_t aoti_torch_grad_mode_is_enabled() { - // No autograd ever - return false; -} - -void aoti_torch_grad_mode_set_enabled(bool enabled) { - if (enabled) { - throw std::runtime_error("Cannot enable autograd"); - } -} - -AOTITorchError aoti_torch_get_data_ptr( - AOTITensorHandle tensor, - void** ret_data_ptr) { - *ret_data_ptr = tensor->mutable_data_ptr(); - return Error::Ok; -} - -AOTITorchError aoti_torch_get_storage_offset( - AOTITensorHandle tensor, - int64_t* ret_storage_offset) { - // Storage offset is always 0 in ET - *ret_storage_offset = 0; - return Error::Ok; -} - -AOTITorchError aoti_torch_get_strides( - AOTITensorHandle tensor, - int64_t** ret_strides) { - auto it = tensor_to_strides.find(tensor); - if (it == tensor_to_strides.end()) { - std::vector strides(tensor->dim()); - auto tensor_strides = tensor->strides(); - for (int i = 0; i < tensor->dim(); i++) { - strides[i] = tensor_strides[i]; - } - it = tensor_to_strides.emplace(tensor, std::move(strides)).first; - } - *ret_strides = it->second.data(); - std::cout << "getting strides from tensor " << tensor << " with dim " - << tensor->dim() << std::endl; - for (int i = 0; i < tensor->dim(); i++) { - std::cout << "strides " << i << " = " << *ret_strides[i] << std::endl; - } - return Error::Ok; -} - -AOTITorchError aoti_torch_get_dtype( - AOTITensorHandle tensor, - int32_t* ret_dtype) { - *ret_dtype = static_cast(tensor->scalar_type()); - return Error::Ok; -} - -AOTITorchError aoti_torch_get_sizes( - AOTITensorHandle tensor, - int64_t** ret_sizes) { - auto it = tensor_to_sizes.find(tensor); - if (it == tensor_to_sizes.end()) { - std::vector sizes(tensor->dim()); - auto tensor_sizes = tensor->sizes(); - for (int i = 0; i < tensor->dim(); i++) { - sizes[i] = tensor_sizes[i]; - } - it = tensor_to_sizes.emplace(tensor, std::move(sizes)).first; - } - *ret_sizes = it->second.data(); - std::cout << "getting sizes from tensor " << tensor << " with dim " - << tensor->dim() << std::endl; - for (int i = 0; i < tensor->dim(); i++) { - std::cout << "size " << i << " = " << *ret_sizes[i] << std::endl; - } - return Error::Ok; -} - -AOTITorchError aoti_torch_get_storage_size( - AOTITensorHandle tensor, - int64_t* ret_size) { - throw std::runtime_error("Cannot get storage size on ETensor"); -} - -AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor, - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size) { - throw std::runtime_error("Not creating Tensor from blob here"); -} - -AOTITorchError aoti_torch_create_cuda_stream_guard( - void* stream, - 
int32_t device_index, - CUDAStreamGuardHandle* ret_guard) { - std::cout << "Entering stream guard for device " << device_index << std::endl; - return Error::Ok; -} - -AOTITorchError aoti_torch_delete_cuda_stream_guard( - CUDAStreamGuardHandle guard) { - std::cout << "Exiting stream guard" << std::endl; - return Error::Ok; -} - -int aoti_torch_device_type_cpu() { - // Let's say cpu is 0 for ET as well - return 0; -} - -__attribute__((__visibility__("default"))) int32_t -aoti_torch_device_type_cuda() { - // Let's say cuda is 1 for ET as well - return 1; -} - -__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() { - // Let assume the dtype here is all we will support - return 6; -} - -AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { - std::cout << "Deleting " << tensor << std::endl; - for (auto it = tensors.begin(); it != tensors.end(); ++it) { - if (it->get() == tensor) { - tensors.erase(it); - break; // Exit the loop once the element is found and removed - } - } - return Error::Ok; -} -AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor) { - throw std::runtime_error("Should never create from blob"); -} - -AOTITorchError aoti_torch_empty_strided( - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor) { - // This requires us to reserve CUDA memory and put it into a ETensor - void* ptr; - int64_t numel = 1; - for (int i = 0; i < ndim; i++) { - numel *= sizes_ptr[i]; - } - - if (dtype != 6) { // throw if not float32 - throw std::runtime_error("Need to implement empty_strided for non-float32"); - } - - int64_t nbytes = numel * 4; - - if (device_type == 1) { // cuda - std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl; - cudaError_t err = cudaMalloc(&ptr, nbytes); - if (err != cudaSuccess) { - std::cout << "failed to allocate " << nbytes << std::endl; - throw std::runtime_error("Failed to call cudaMalloc"); - } - } else if (device_type == 0) { // cpu - std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl; - ptr = malloc(nbytes); - if (ptr == nullptr) { - throw std::runtime_error("Failed to call malloc"); - } - } else { - throw std::runtime_error( - "Need to implement empty_strided for non-CUDA non-CPU"); - } - std::cout << "Allocated " << nbytes << " bytes at " << ptr << ", sizes_ptr " - << sizes_ptr << std::endl; - - // ETensor sizes - std::vector sizes(ndim); - for (int i = 0; i < ndim; i++) { - sizes[i] = sizes_ptr[i]; - } - // ETensor creation - auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr); - - // Store the tensor - tensors.insert(tensor); - - std::cout << "sizes.data(): " << sizes.data() - << ", tensor->sizes().data(): " << tensor->sizes().data() - << std::endl; - std::cout << "Size[0] of tensor " << tensor.get() << " is " - << tensor->sizes()[0] << std::endl; - *ret_new_tensor = tensor.get(); - return Error::Ok; -} - -void checkCudaError(cudaError_t err, const char* msg) { - if (err != cudaSuccess) { - std::cerr << "Error: " << msg << " (" << cudaGetErrorString(err) << ")" - << std::endl; - exit(EXIT_FAILURE); - } -} - -AOTITorchError aoti_torch_copy_( - AOTITensorHandle self, - AOTITensorHandle src, - int32_t non_blocking) { - // check if size is the same - if 
(self->dim() != src->dim()) { - std::cout << "self.dim() " << self->dim() << ", src.dim() " << src->dim() - << std::endl; - throw std::runtime_error("self.dim() != src.dim()"); - } - std::cout << "self->data_ptr(): " << self->data_ptr() - << " sizes: " << self->sizes().data() << std::endl; - std::cout << "src->data_ptr(): " << src->data_ptr() - << " sizes: " << src->sizes().data() << std::endl; - for (int i = 0; i < self->dim(); i++) { - if (self->sizes()[i] != src->sizes()[i]) { - std::cout << "self.sizes()[i] " << self->sizes()[i] << ", src.sizes()[i] " - << src->sizes()[i] << std::endl; - throw std::runtime_error("size mismatch"); - } - } - - int size = src->nbytes(); - // should check for device - cudaPointerAttributes srcAttributes, dstAttributes; - cudaError_t err; - // Get attributes of the source pointer - err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr()); - checkCudaError(err, "Failed to get source pointer attributes"); - // Get attributes of the destination pointer - err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr()); - checkCudaError(err, "Failed to get destination pointer attributes"); - bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; - bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; - // Determine the memory locations and perform the appropriate copy - if (srcIsDevice && dstIsDevice) { - // Device to Device copy - err = cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - size, - cudaMemcpyDeviceToDevice); - checkCudaError(err, "Failed to copy from device to device"); - } else if (srcIsDevice && !dstIsDevice) { - // Device to Host copy - err = cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - size, - cudaMemcpyDeviceToHost); - std::cout << "Device to Host copy, self data: " - << ((float*)self->data_ptr())[0] << std::endl; - checkCudaError(err, "Failed to copy from device to host"); - } else if (!srcIsDevice && dstIsDevice) { - // Host to Device copy - err = cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - size, - cudaMemcpyHostToDevice); - std::cout << "Host to Device copy, src data: " - << ((float*)src->data_ptr())[0] << std::endl; - checkCudaError(err, "Failed to copy from host to device"); - } else if (!srcIsDevice && !dstIsDevice) { - // Host to Host copy - std::cout << "Host to Host copy, src data: " << ((float*)src->data_ptr())[0] - << std::endl; - std::memcpy(self->mutable_data_ptr(), src->data_ptr(), size); - } else { - std::cerr << "Error: Unknown memory type. 
self: " << dstAttributes.type - << ", src: " << srcAttributes.type << std::endl; - throw std::runtime_error("Unknown memory type"); - } - // print first value of src and self - return Error::Ok; -} -} - -struct AOTIDelegateHandle { - void* so_handle; - AOTInductorModelContainerHandle container_handle; -}; - -class AOTIBackend final : public ::executorch::runtime::BackendInterface { - public: - // Once in program - AOTIBackend() { - ET_LOG(Info, "AOTIBackend ctor"); - } - - bool is_available() const override { - return 1; - } - - // Once per loaded binary blob - Result init( - BackendInitContext& context, - FreeableBuffer* processed, // This will be the buffer from aoti_backend - ArrayRef compile_specs // This will be my empty list - ) const override { - const char* so_path = static_cast(processed->data()); - - printf("so path: %s\n", so_path); - - // Load the ELF using dlopen - void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); - if (so_handle == nullptr) { - std::cout << dlerror() << std::endl; - return Error::AccessFailed; - } - - processed->Free(); - - AOTInductorModelContainerCreateWithDevice = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); - if (AOTInductorModelContainerCreateWithDevice == nullptr) { - perror("dlsym1"); - return Error::AccessFailed; - } - AOTInductorModelContainerDelete = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerDelete")); - if (AOTInductorModelContainerDelete == nullptr) { - perror("dlsym2"); - return Error::AccessFailed; - } - AOTInductorModelContainerGetNumInputs = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerGetNumInputs")); - if (AOTInductorModelContainerGetNumInputs == nullptr) { - perror("dlsym3"); - return Error::AccessFailed; - } - AOTInductorModelContainerGetNumOutputs = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs")); - if (AOTInductorModelContainerGetNumOutputs == nullptr) { - perror("dlsym4"); - return Error::AccessFailed; - } - AOTInductorModelContainerRun = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerRun")); - if (AOTInductorModelContainerRun == nullptr) { - perror("dlsym5"); - return Error::AccessFailed; - } - - AOTInductorModelContainerHandle container_handle = nullptr; - - AOTIRuntimeError err; - - err = AOTInductorModelContainerCreateWithDevice( - &container_handle, 1, "cuda", nullptr); - printf("container_handle=%p\n", container_handle); - - AOTIDelegateHandle* handle = new AOTIDelegateHandle(); - handle->so_handle = so_handle; - handle->container_handle = container_handle; - return (DelegateHandle*)handle; // Return the handle post-processing - } - - // Once per execution - Error execute( - BackendExecutionContext& context, - DelegateHandle* handle_, - Span args) const override { - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - - size_t num_inputs; - AOTInductorModelContainerGetNumInputs( - handle->container_handle, &num_inputs); - - size_t num_outputs; - AOTInductorModelContainerGetNumOutputs( - handle->container_handle, &num_outputs); - - std::vector inputs(num_inputs); - std::vector outputs(num_outputs); - - for (int i = 0; i < num_inputs; i++) { - auto tensor_in = args[i]->toTensor(); - inputs[i] = &tensor_in; - } - - for (int i = num_inputs; i < num_inputs + num_outputs; i++) { - auto tensor_out = args[i]->toTensor(); - outputs[i - num_inputs] = &tensor_out; - } - - AOTInductorModelContainerRun( - handle->container_handle, - inputs.data(), - num_inputs, - outputs.data(), - num_outputs, 
- // Should these last two be something? - nullptr, - nullptr); - - // Still need to copy the output to args, because they are malloc'ed but - // not using the data_ptr from outputs. - for (int i = 0; i < num_outputs; i++) { - auto args_out = args[i + num_inputs]->toTensor(); - aoti_torch_copy_(&args_out, outputs[i], 0); - } - return Error::Ok; - } - - void destroy(DelegateHandle* handle_) const override { - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - dlclose(handle->so_handle); - AOTInductorModelContainerDelete(handle->container_handle); - free(handle); - tensor_to_sizes.clear(); - tensor_to_strides.clear(); - } -}; - -} // namespace aoti - -namespace { -auto cls = aoti::AOTIBackend(); -executorch::runtime::Backend backend{"AotiBackend", &cls}; -static executorch::runtime::Error success_with_compiler = - register_backend(backend); -} // namespace - -} // namespace backends -} // namespace executorch diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp new file mode 100644 index 00000000000..65d28a7a1ff --- /dev/null +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// Include our shim layer headers +#include "aoti_model_container.h" +#include "shims/memory.h" +#include "shims/tensor_attribute.h" + +namespace executorch { +namespace backends { +namespace aoti { + +using namespace std; + +using executorch::aten::ScalarType; +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::etensor::Tensor; + +class AOTIBackend final : public ::executorch::runtime::BackendInterface { + public: + // Once in program + AOTIBackend() { + ET_LOG(Info, "AOTIBackend ctor"); + } + + bool is_available() const override { + return 1; + } + + // Once per loaded binary blob + Result<DelegateHandle*> init( + BackendInitContext& context, + FreeableBuffer* processed, // This will be the buffer from aoti_backend + ArrayRef<CompileSpec> compile_specs // This will be my empty list + ) const override { + const char* so_path = static_cast<const char*>(processed->data()); + + printf("so path: %s\n", so_path); + + // Load the ELF using dlopen + void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); + if (so_handle == nullptr) { + std::cout << dlerror() << std::endl; + return Error::AccessFailed; + } + + processed->Free(); + + AOTInductorModelContainerCreateWithDevice = + reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>( + dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); + if (AOTInductorModelContainerCreateWithDevice == nullptr) { + perror("dlsym1"); + return Error::AccessFailed; + } + AOTInductorModelContainerDelete = + reinterpret_cast<AOTInductorModelContainerDeleteFunc>( + dlsym(so_handle, "AOTInductorModelContainerDelete")); + if (AOTInductorModelContainerDelete == nullptr) { + perror("dlsym2"); +
+    AOTInductorModelContainerGetNumInputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
+    if (AOTInductorModelContainerGetNumInputs == nullptr) {
+      perror("dlsym3");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumConstants =
+        reinterpret_cast<AOTInductorModelContainerGetNumConstantsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumConstants"));
+    if (AOTInductorModelContainerGetNumConstants == nullptr) {
+      perror("dlsym AOTInductorModelContainerGetNumConstants");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumOutputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
+    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
+      perror("dlsym4");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerRun =
+        reinterpret_cast<AOTInductorModelContainerRunFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerRun"));
+    if (AOTInductorModelContainerRun == nullptr) {
+      perror("dlsym5");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerHandle container_handle = nullptr;
+
+    AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "cuda", nullptr);
+    if (err != Error::Ok) {
+      return err;
+    }
+    printf("container_handle = %p\n", container_handle);
+
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->container_handle = container_handle;
+    return (DelegateHandle*)handle; // Return the handle post-processing
+  }
+
+  // Once per execution
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* handle_,
+      Span<EValue*> args) const override {
+    ET_LOG(Debug, "AOTIBackend execute");
+
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    ET_LOG(Debug, "AOTIBackend Handle generated");
+
+    size_t n_inputs, n_constants;
+    AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
+    AOTInductorModelContainerGetNumConstants(
+        handle->container_handle, &n_constants);
+    size_t n_user_inputs = n_inputs - n_constants;
+
+    // Constant inputs baked into the .so are not handled yet, so every
+    // container input must be a user input.
+    if (n_user_inputs != n_inputs) {
+      ET_LOG(
+          Error,
+          "Number of user inputs does not match number of inputs: n_user_inputs %zd, n_constants %zd, n_inputs %zd. Exiting.",
+          n_user_inputs,
+          n_constants,
+          n_inputs);
+      return Error::InvalidArgument;
+    }
+
+    ET_LOG(
+        Debug,
+        "AOTIBackend n_inputs %zd generated, of which %zd are constant inputs and %zd are user inputs",
+        n_inputs,
+        n_constants,
+        n_user_inputs);
+
+    size_t n_outputs;
+    AOTInductorModelContainerGetNumOutputs(
+        handle->container_handle, &n_outputs);
+
+    ET_LOG(Debug, "AOTIBackend n_outputs %zd generated", n_outputs);
+
+    // args is laid out as [inputs..., outputs...], so its length must equal
+    // the container's input count plus its output count.
+    if (n_inputs + n_outputs != args.size()) {
+      ET_LOG(
+          Error,
+          "Number of inputs %zd and outputs %zd generated from AOT Inductor does not match the ET runner's %zd args. Exiting.",
+          n_inputs,
+          n_outputs,
+          args.size());
+      return Error::InvalidArgument;
+    }
+
+    ET_LOG(
+        Debug,
+        "Number of inputs %zd and outputs %zd generated from AOT Inductor matches the ET runner's %zd args.",
+        n_inputs,
+        n_outputs,
+        args.size());
+
+    std::vector<AOTITensorHandle> inputs(n_inputs);
+    std::vector<AOTITensorHandle> outputs(n_outputs);
+
+    ET_LOG(Debug, "AOTIBackend input/output vectors generated");
+
+    for (size_t i = 0; i < n_inputs; i++) {
+      ET_LOG(Debug, "Copying input %zd from args to the inputs vector", i);
+      ET_LOG(Debug, "Is input %zd a tensor? %d", i, int(args[i]->isTensor()));
+      inputs[i] = &(args[i]->toTensor());
+    }
+
+    ET_LOG(Debug, "AOTIBackend input generated");
+
+    for (size_t i = 0; i < n_outputs; i++) {
+      outputs[i] = &(args[i + n_inputs]->toTensor());
+    }
+
+    ET_LOG(Debug, "AOTIBackend output generated");
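+
+    // A fresh stream per execute() keeps the delegate self-contained; caching
+    // a stream on AOTIDelegateHandle would be a possible refinement to avoid
+    // the per-inference create/destroy cost.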
%d", i, int(args[i]->isTensor())); + inputs[i] = &(args[i]->toTensor()); + } + + ET_LOG(Debug, "AOTIBackend input generated"); + + for (int i = 0; i < n_outputs; i++) { + outputs[i] = &(args[i + n_inputs]->toTensor()); + } + + ET_LOG(Debug, "AOTIBackend output generated"); + + // Create a CUDA stream for this execution + cudaStream_t cuda_stream; + cudaError_t stream_err = cudaStreamCreate(&cuda_stream); + if (stream_err != cudaSuccess) { + ET_LOG( + Error, + "Failed to create CUDA stream: %s", + cudaGetErrorString(stream_err)); + return Error::Internal; + } + + ET_LOG(Debug, "Created CUDA stream: %p", cuda_stream); + + // Run AOTI container with the stream (AOTI will create its own stream guard + // internally) + AOTIRuntimeError error = AOTInductorModelContainerRun( + handle->container_handle, + inputs.data(), + n_inputs, + outputs.data(), + n_outputs, + cuda_stream, // Pass the actual CUDA stream! + nullptr); // proxy_executor_handle can remain nullptr + + if (error != Error::Ok) { + ET_LOG( + Error, + "AOTInductorModelContainerRun failed with error code %d", + error); + return Error::Internal; + } + + ET_LOG(Debug, "AOTIBackend running done"); + + // Synchronize and destroy the CUDA stream + cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); + if (sync_err != cudaSuccess) { + ET_LOG( + Error, + "Failed to synchronize CUDA stream: %s", + cudaGetErrorString(sync_err)); + // Continue anyway to avoid fatal errors + } + + cudaStreamDestroy(cuda_stream); + ET_LOG(Debug, "CUDA stream synchronized and destroyed"); + + // Still need to copy the output to args, because they are malloc'ed but + // not using the data_ptr from outputs. + for (int i = 0; i < n_outputs; i++) { + auto args_out = args[i + n_inputs]->toTensor(); + aoti_torch_copy_(&args_out, outputs[i], 0); + } + + ET_LOG(Debug, "AOTIBackend output copied"); + + return Error::Ok; + } + + void destroy(DelegateHandle* handle_) const override { + ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + dlclose(handle->so_handle); + AOTInductorModelContainerDelete(handle->container_handle); + free(handle); + cleanup_memory(); + cleanup_tensor_metadata(); + } +}; + +} // namespace aoti + +namespace { +auto cls = aoti::AOTIBackend(); +executorch::runtime::Backend backend{"AotiBackend", &cls}; +static executorch::runtime::Error success_with_compiler = + register_backend(backend); +} // namespace + +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/aoti_model_container.cpp b/backends/aoti/runtime/aoti_model_container.cpp new file mode 100644 index 00000000000..0809a677a81 --- /dev/null +++ b/backends/aoti/runtime/aoti_model_container.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+
+#include "aoti_model_container.h"
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+extern "C" {
+
+// Global function pointers for AOT Inductor model container operations
+// These will be loaded dynamically from the shared library
+AOTInductorModelContainerCreateWithDeviceFunc
+    AOTInductorModelContainerCreateWithDevice = nullptr;
+AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr;
+AOTInductorModelContainerGetNumInputsFunc
+    AOTInductorModelContainerGetNumInputs = nullptr;
+AOTInductorModelContainerGetNumConstantsFunc
+    AOTInductorModelContainerGetNumConstants = nullptr;
+AOTInductorModelContainerGetNumOutputsFunc
+    AOTInductorModelContainerGetNumOutputs = nullptr;
+AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr;
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/aoti_model_container.h b/backends/aoti/runtime/aoti_model_container.h
new file mode 100644
index 00000000000..2078490022d
--- /dev/null
+++ b/backends/aoti/runtime/aoti_model_container.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include "shims/memory.h"
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+
+// Type definitions
+using AOTITensorHandle = Tensor*;
+using AOTIRuntimeError = Error;
+
+// Forward declarations for AOT Inductor model container
+struct AOTInductorModelContainerOpaque;
+using AOTInductorModelContainerHandle = AOTInductorModelContainerOpaque*;
+using AOTInductorStreamHandle = void*;
+using AOTIProxyExecutorHandle = void*;
+
+// Function pointer types for AOT Inductor model container operations
+using AOTInductorModelContainerCreateWithDeviceFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle* container_handle,
+    size_t num_models,
+    const char* device_str,
+    const char* cubin_dir);
+
+using AOTInductorModelContainerDeleteFunc =
+    AOTIRuntimeError (*)(AOTInductorModelContainerHandle container_handle);
+
+using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_inputs);
+
+using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_constants);
+
+using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_outputs);
+
+using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    AOTITensorHandle* input_handles, // array of input AOTITensorHandle; handles
+                                     // are stolen; the array itself is borrowed
+    size_t num_inputs,
+    AOTITensorHandle*
+        output_handles, // array for writing output AOTITensorHandle; handles
+                        // will be stolen by the caller; the array itself is
+                        // borrowed
+    size_t n_outputs,
+    AOTInductorStreamHandle stream_handle,
+    AOTIProxyExecutorHandle proxy_executor_handle);
+
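+// The entry points above are exported by the AOTInductor-generated .so, not
+// by this runtime, so they cannot be linked statically; the backend resolves
+// them with dlsym() at init() time into the pointers declared below.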
+// Global function pointers (will be loaded dynamically)
+extern AOTInductorModelContainerCreateWithDeviceFunc
+    AOTInductorModelContainerCreateWithDevice;
+extern AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete;
+extern AOTInductorModelContainerGetNumInputsFunc
+    AOTInductorModelContainerGetNumInputs;
+extern AOTInductorModelContainerGetNumConstantsFunc
+    AOTInductorModelContainerGetNumConstants;
+extern AOTInductorModelContainerGetNumOutputsFunc
+    AOTInductorModelContainerGetNumOutputs;
+extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;
+
+} // extern "C"
+
+// AOTI Delegate Handle structure
+struct AOTIDelegateHandle {
+  void* so_handle;
+  AOTInductorModelContainerHandle container_handle;
+};
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp
new file mode 100644
index 00000000000..cadd021f51f
--- /dev/null
+++ b/backends/aoti/runtime/shims/memory.cpp
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "memory.h"
+#include <cuda_runtime.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <stdexcept>
+#include <unordered_map>
+#include <vector>
+#include "tensor_attribute.h"
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+// Global storage for tensors and their metadata
+std::unordered_set<std::shared_ptr<Tensor>> tensors;
+std::unordered_map<Tensor*, bool> is_tensor_own_memory;
+
+extern "C" {
+
+AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor,
+    int32_t layout,
+    const uint8_t* opaque_metadata,
+    int64_t opaque_metadata_size) {
+  std::cout << "Creating tensor from data blob " << data << " - ndim: " << ndim
+            << ", dtype: " << dtype << ", device_type: " << device_type
+            << std::endl;
+
+  // Convert sizes to the format expected by ExecuTorch
+  std::vector<executorch::aten::SizesType> sizes(ndim);
+  for (int i = 0; i < ndim; i++) {
+    sizes[i] = static_cast<executorch::aten::SizesType>(sizes_ptr[i]);
+    std::cout << "Size[" << i << "] = " << sizes[i] << std::endl;
+  }
+
+  // Check the tensor format; only contiguous strides are supported for now.
+  int64_t expected_stride = 1;
+  for (int i = ndim - 1; i >= 0; --i) {
+    if (strides_ptr[i] != expected_stride) {
+      std::cout
+          << "aoti_torch_create_tensor_from_blob_v2 failed since the input strides are not contiguous. Returning with an error."
+          << std::endl;
+      return Error::InvalidArgument;
+    }
+    expected_stride *= sizes_ptr[i];
+  }
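+  // Example: sizes [2, 3, 4] admit exactly one contiguous layout, strides
+  // [12, 4, 1]; walking right-to-left, each stride must equal the product of
+  // the sizes to its right, which is what the loop above verifies.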
+
+  // Adjust data pointer by storage_offset if needed
+  void* adjusted_data = data;
+  if (storage_offset > 0) {
+    // Calculate byte offset based on dtype size
+    size_t dtype_size = 4; // Assuming float32 for now; other dtypes need handling
+    if (dtype == 6) { // float32
+      dtype_size = 4;
+    } else {
+      std::cout << "Error: Unhandled dtype " << dtype << std::endl;
+      return Error::NotImplemented;
+    }
+    adjusted_data = static_cast<char*>(data) + (storage_offset * dtype_size);
+  }
+
+  // Create an ExecuTorch tensor that wraps the existing memory
+  // Note: We're NOT copying the data, just wrapping it
+  auto tensor = executorch::extension::make_tensor_ptr(
+      sizes, // tensor dimensions
+      adjusted_data, // existing memory (don't copy!)
+      executorch::aten::ScalarType::Float // only supported dtype
+  );
+
+  if (!tensor) {
+    std::cerr << "Failed to create tensor from blob" << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Store the tensor so it doesn't get destroyed
+  tensors.insert(tensor);
+
+  *ret_new_tensor = tensor.get();
+
+  is_tensor_own_memory[tensor.get()] = false;
+
+  std::cout << "Successfully created tensor from blob: " << tensor.get()
+            << " wrapping data at: " << adjusted_data << std::endl;
+
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_create_tensor_from_blob(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor) {
+  throw std::runtime_error("Should never create from blob");
+}
+
+AOTITorchError aoti_torch_empty_strided(
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor) {
+  // This requires us to reserve CUDA memory and put it into an ETensor
+  void* ptr;
+  int64_t numel = 1;
+  for (int i = 0; i < ndim; i++) {
+    numel *= sizes_ptr[i];
+  }
+
+  if (dtype != 6) { // throw if not float32
+    throw std::runtime_error("Need to implement empty_strided for non-float32");
+  }
+
+  int64_t nbytes = numel * 4;
+
+  if (device_type == 1) { // cuda
+    std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl;
+    cudaError_t err = cudaMalloc(&ptr, nbytes);
+    if (err != cudaSuccess) {
+      std::cout << "failed to allocate " << nbytes << std::endl;
+      throw std::runtime_error("Failed to call cudaMalloc");
+    }
+  } else if (device_type == 0) { // cpu
+    std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl;
+    ptr = malloc(nbytes);
+    if (ptr == nullptr) {
+      throw std::runtime_error("Failed to call malloc");
+    }
+  } else {
+    throw std::runtime_error(
+        "Need to implement empty_strided for non-CUDA non-CPU");
+  }
+  std::cout << "Allocated " << nbytes << " bytes at " << ptr << ", sizes_ptr "
+            << sizes_ptr << std::endl;
+
+  // ETensor sizes
+  std::vector<executorch::aten::SizesType> sizes(ndim);
+  for (int i = 0; i < ndim; i++) {
+    sizes[i] = sizes_ptr[i];
+  }
+  // ETensor creation
+  auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr);
+
+  // Store the tensor
+  tensors.insert(tensor);
+
+  std::cout << "sizes.data(): " << sizes.data()
+            << ", tensor->sizes().data(): " << tensor->sizes().data()
+            << std::endl;
+  std::cout << "Size[0] of tensor " << tensor.get() << " is "
+            << tensor->sizes()[0] << std::endl;
+  *ret_new_tensor = tensor.get();
+  is_tensor_own_memory[tensor.get()] = true;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
+  std::cout << "Called aoti_torch_delete_tensor_object for tensor " << tensor
+            << std::endl;
+
+  // Check ownership before cleaning up metadata
+  auto ownership_it = is_tensor_own_memory.find(tensor);
+  bool owns_memory = (ownership_it != is_tensor_own_memory.end())
+      ? ownership_it->second
+      : false;
+
+  // Clean up ALL metadata maps immediately to prevent use-after-free
+  tensor_to_sizes.erase(tensor);
+  tensor_to_strides.erase(tensor);
+  is_tensor_own_memory.erase(tensor);
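+
+  // Tensors from empty_strided own their buffer (allocated above with
+  // cudaMalloc/malloc), while tensors wrapped around a caller's blob do not;
+  // only owned buffers are freed below.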
Skipped \n\n" + << std::endl; + return Error::Ok; + } + + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + // Get the tensor before erasing + auto tensor_ptr = *it; + + void* data_ptr = tensor_ptr->mutable_data_ptr(); + + // Determine if it's GPU memory + cudaPointerAttributes attributes; + cudaError_t err = cudaPointerGetAttributes(&attributes, data_ptr); + + // et tensor does not own data; need to free them manually. + if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) { + // This is GPU memory - free with proper synchronization + std::cout << "Freeing GPU memory at " << data_ptr << std::endl; + cudaDeviceSynchronize(); // Wait for all operations to complete BEFORE + // freeing + cudaFree(data_ptr); + std::cout << "GPU memory freed successfully" << std::endl; + } else { + // This is CPU memory - free immediately + std::cout << "Freeing CPU memory at " << data_ptr << std::endl; + free(data_ptr); + std::cout << "CPU memory freed successfully" << std::endl; + } + + std::cout << "Memory freed. Now erasing tensor " << tensor << std::endl; + + // Remove from set (this will call the destructor if it's the last + // reference) + tensors.erase(it); + + std::cout << "Tensor erased. Now returning \n\n" << std::endl; + + return Error::Ok; + } + } + std::cout << "Error: Didn't find tensor " << tensor << std::endl; + return Error::InvalidArgument; +} + +void checkCudaError(cudaError_t err, const char* msg) { + if (err != cudaSuccess) { + std::cerr << "Error: " << msg << " (" << cudaGetErrorString(err) << ")" + << std::endl; + exit(EXIT_FAILURE); + } +} + +AOTITorchError aoti_torch_copy_( + AOTITensorHandle self, + AOTITensorHandle src, + int32_t non_blocking) { + // check if size is the same + if (self->dim() != src->dim()) { + std::cout << "self.dim() " << self->dim() << ", src.dim() " << src->dim() + << std::endl; + throw std::runtime_error("self.dim() != src.dim()"); + } + std::cout << "self->data_ptr(): " << self->data_ptr() + << " sizes: " << self->sizes().data() << std::endl; + std::cout << "src->data_ptr(): " << src->data_ptr() + << " sizes: " << src->sizes().data() << std::endl; + for (int i = 0; i < self->dim(); i++) { + if (self->sizes()[i] != src->sizes()[i]) { + std::cout << "self.sizes()[i] " << self->sizes()[i] << ", src.sizes()[i] " + << src->sizes()[i] << std::endl; + throw std::runtime_error("size mismatch"); + } + } + + int size = src->nbytes(); + // should check for device + cudaPointerAttributes srcAttributes, dstAttributes; + cudaError_t err; + // Get attributes of the source pointer + err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr()); + checkCudaError(err, "Failed to get source pointer attributes"); + // Get attributes of the destination pointer + err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr()); + checkCudaError(err, "Failed to get destination pointer attributes"); + bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; + bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; + // Determine the memory locations and perform the appropriate copy + if (srcIsDevice && dstIsDevice) { + // Device to Device copy + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + size, + cudaMemcpyDeviceToDevice); + checkCudaError(err, "Failed to copy from device to device"); + } else if (srcIsDevice && !dstIsDevice) { + // Device to Host copy + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + size, + cudaMemcpyDeviceToHost); + std::cout << "Device to Host copy, 
self data: " + << ((float*)self->data_ptr())[0] << std::endl; + checkCudaError(err, "Failed to copy from device to host"); + } else if (!srcIsDevice && dstIsDevice) { + // Host to Device copy + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + size, + cudaMemcpyHostToDevice); + std::cout << "Host to Device copy, src data: " + << ((float*)src->data_ptr())[0] << std::endl; + checkCudaError(err, "Failed to copy from host to device"); + } else if (!srcIsDevice && !dstIsDevice) { + // Host to Host copy + std::cout << "Host to Host copy, src data: " << ((float*)src->data_ptr())[0] + << std::endl; + std::memcpy(self->mutable_data_ptr(), src->data_ptr(), size); + } else { + std::cerr << "Error: Unknown memory type. self: " << dstAttributes.type + << ", src: " << srcAttributes.type << std::endl; + throw std::runtime_error("Unknown memory type"); + } + // print first value of src and self + return Error::Ok; +} + +AOTITorchError aoti_torch_create_cuda_stream_guard( + void* stream, + int32_t device_index, + CUDAStreamGuardHandle* ret_guard) { + std::cout << "Entering stream guard for device " << device_index + << " with stream " << stream << std::endl; + + // Set device + cudaError_t err = cudaSetDevice(device_index); + if (err != cudaSuccess) { + std::cerr << "Failed to set device " << device_index << ": " + << cudaGetErrorString(err) << std::endl; + return Error::Internal; + } + + // Create minimal guard structure + CUDAStreamGuardOpaque* guard = new CUDAStreamGuardOpaque(); + guard->device_index = device_index; + guard->original_stream = static_cast(stream); + guard->sync_event = nullptr; + + std::cout << "Stream guard created successfully for stream " << stream + << std::endl; + + *ret_guard = guard; + return Error::Ok; +} + +AOTITorchError aoti_torch_delete_cuda_stream_guard( + CUDAStreamGuardHandle guard) { + std::cout << "Exiting stream guard" << std::endl; + + if (guard == nullptr) { + return Error::Ok; + } + + // Clean up the guard structure + delete guard; + + std::cout << "Stream guard cleanup completed" << std::endl; + return Error::Ok; +} + +// Cleanup function for clearing global state +void cleanup_memory() { + is_tensor_own_memory.clear(); + if (!tensors.empty()) { + std::cout << "Warning: tensors not empty" << std::endl; + } +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h new file mode 100644 index 00000000000..bcbb33d0e99 --- /dev/null +++ b/backends/aoti/runtime/shims/memory.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+
+// Type definitions
+using AOTITensorHandle = Tensor*;
+using AOTIRuntimeError = Error;
+using AOTITorchError = Error;
+
+struct CUDAStreamGuardOpaque {
+  cudaStream_t original_stream;
+  int device_index;
+  cudaEvent_t sync_event;
+};
+using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*;
+
+// Global storage declarations
+extern std::unordered_map<Tensor*, bool> is_tensor_own_memory;
+extern std::unordered_set<std::shared_ptr<Tensor>> tensors;
+
+// Memory-related operations
+AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor,
+    int32_t layout,
+    const uint8_t* opaque_metadata,
+    int64_t opaque_metadata_size);
+
+AOTITorchError aoti_torch_create_tensor_from_blob(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor);
+
+AOTITorchError aoti_torch_empty_strided(
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor);
+
+AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor);
+
+AOTITorchError aoti_torch_copy_(
+    AOTITensorHandle self,
+    AOTITensorHandle src,
+    int32_t non_blocking);
+
+AOTITorchError aoti_torch_create_cuda_stream_guard(
+    void* stream,
+    int32_t device_index,
+    CUDAStreamGuardHandle* ret_guard);
+
+AOTITorchError aoti_torch_delete_cuda_stream_guard(
+    CUDAStreamGuardHandle guard);
+
+// Utility functions
+void checkCudaError(cudaError_t err, const char* msg);
+void cleanup_memory();
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp
new file mode 100644
index 00000000000..b5333f50ea9
--- /dev/null
+++ b/backends/aoti/runtime/shims/tensor_attribute.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "tensor_attribute.h"
+#include <iostream>
+#include <stdexcept>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+// Global storage for tensor metadata
+std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
+std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+
+extern "C" {
+
+int32_t aoti_torch_grad_mode_is_enabled() {
+  // No autograd ever
+  return false;
+}
+
+void aoti_torch_grad_mode_set_enabled(bool enabled) {
+  if (enabled) {
+    throw std::runtime_error("Cannot enable autograd");
+  }
+}
+
+AOTITorchError aoti_torch_get_data_ptr(
+    AOTITensorHandle tensor,
+    void** ret_data_ptr) {
+  *ret_data_ptr = tensor->mutable_data_ptr();
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_storage_offset(
+    AOTITensorHandle tensor,
+    int64_t* ret_storage_offset) {
+  // Storage offset is always 0 in ET
+  *ret_storage_offset = 0;
+  return Error::Ok;
+}
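+
+// ET tensors expose sizes/strides with narrower integer types than the
+// int64_t* pointers the AOTI ABI hands out, so the sizes/strides getters
+// below cache a per-tensor int64_t copy and return pointers into that cache;
+// entries live until the tensor is deleted or cleanup_tensor_metadata() runs.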
+
+AOTITorchError aoti_torch_get_strides(
+    AOTITensorHandle tensor,
+    int64_t** ret_strides) {
+  auto it = tensor_to_strides.find(tensor);
+  if (it == tensor_to_strides.end()) {
+    std::vector<int64_t> strides(tensor->dim());
+    auto tensor_strides = tensor->strides();
+    for (int i = 0; i < tensor->dim(); i++) {
+      strides[i] = tensor_strides[i];
+    }
+    it = tensor_to_strides.emplace(tensor, std::move(strides)).first;
+  }
+  *ret_strides = it->second.data();
+  std::cout << "getting strides from tensor " << tensor << " with dim "
+            << tensor->dim() << std::endl;
+  for (int i = 0; i < tensor->dim(); i++) {
+    std::cout << "strides " << i << " = " << (*ret_strides)[i] << std::endl;
+  }
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_dtype(
+    AOTITensorHandle tensor,
+    int32_t* ret_dtype) {
+  *ret_dtype = static_cast<int32_t>(tensor->scalar_type());
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_sizes(
+    AOTITensorHandle tensor,
+    int64_t** ret_sizes) {
+  auto it = tensor_to_sizes.find(tensor);
+  if (it == tensor_to_sizes.end()) {
+    std::vector<int64_t> sizes(tensor->dim());
+    auto tensor_sizes = tensor->sizes();
+    for (int i = 0; i < tensor->dim(); i++) {
+      sizes[i] = tensor_sizes[i];
+    }
+    it = tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
+  }
+  *ret_sizes = it->second.data();
+  std::cout << "getting sizes from tensor " << tensor << " with dim "
+            << tensor->dim() << std::endl;
+  for (int i = 0; i < tensor->dim(); i++) {
+    std::cout << "size " << i << " = " << (*ret_sizes)[i] << std::endl;
+  }
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_storage_size(
+    AOTITensorHandle tensor,
+    int64_t* ret_size) {
+  throw std::runtime_error("Cannot get storage size on ETensor");
+}
+
+int32_t aoti_torch_device_type_cpu() {
+  // Let's say cpu is 0 for ET as well
+  return 0;
+}
+
+__attribute__((__visibility__("default"))) int32_t aoti_torch_layout_strided() {
+  // ET only supports strided layout, so the return value is always 0, a.k.a.
+  // at::Layout::Strided.
+  return 0;
+}
+
+__attribute__((__visibility__("default"))) int32_t
+aoti_torch_device_type_cuda() {
+  // Let's say cuda is 1 for ET as well
+  return 1;
+}
+
+__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() {
+  // Assume float32 is the only dtype we will support; 6 matches
+  // at::ScalarType::Float.
+  return 6;
+}
+
+void cleanup_tensor_metadata() {
+  tensor_to_sizes.clear();
+  tensor_to_strides.clear();
+}
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h
new file mode 100644
index 00000000000..3ed966f99dc
--- /dev/null
+++ b/backends/aoti/runtime/shims/tensor_attribute.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+
+// Type definitions
+using AOTITensorHandle = Tensor*;
+using AOTIRuntimeError = Error;
+using AOTITorchError = Error;
+
+// Global storage for tensor metadata
+extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
+extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+
+// Attribute-related operations (memory-irrelevant)
+AOTITorchError aoti_torch_get_data_ptr(
+    AOTITensorHandle tensor,
+    void** ret_data_ptr);
+
+AOTITorchError aoti_torch_get_storage_offset(
+    AOTITensorHandle tensor,
+    int64_t* ret_storage_offset);
+
+AOTITorchError aoti_torch_get_strides(
+    AOTITensorHandle tensor,
+    int64_t** ret_strides);
+
+AOTITorchError aoti_torch_get_dtype(
+    AOTITensorHandle tensor,
+    int32_t* ret_dtype);
+
+AOTITorchError aoti_torch_get_sizes(
+    AOTITensorHandle tensor,
+    int64_t** ret_sizes);
+
+AOTITorchError aoti_torch_get_storage_size(
+    AOTITensorHandle tensor,
+    int64_t* ret_size);
+
+// Utility functions for device and layout information
+int32_t aoti_torch_device_type_cpu();
+int32_t aoti_torch_device_type_cuda();
+int32_t aoti_torch_layout_strided();
+int32_t aoti_torch_dtype_float32();
+
+// Autograd mode functions
+int32_t aoti_torch_grad_mode_is_enabled();
+void aoti_torch_grad_mode_set_enabled(bool enabled);
+
+// Cleanup function for clearing global state
+void cleanup_tensor_metadata();
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl
index d51097f306d..7b02c1075a2 100644
--- a/backends/aoti/runtime/targets.bzl
+++ b/backends/aoti/runtime/targets.bzl
@@ -3,8 +3,17 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 def define_common_targets():
     runtime.cxx_library(
         name = "aoti_backend",
-        srcs = ["AotiBackend.cpp"],
-        headers = [],
+        srcs = [
+            "aoti_backend.cpp",
+            "aoti_model_container.cpp",
+            "shims/memory.cpp",
+            "shims/tensor_attribute.cpp",
+        ],
+        headers = [
+            "aoti_model_container.h",
+            "shims/memory.h",
+            "shims/tensor_attribute.h",
+        ],
         # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
         link_whole = True,
         supports_python_dlopen = True,
diff --git a/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin b/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..d34f0ffd0262af56bf3c172beda228da57f93b3c
GIT binary patch
literal 11320
[11320 bytes of base85-encoded cubin data omitted]
literal 0
HcmV?d00001

diff --git a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
new file mode 100644
index 00000000000..7d7e30069f9
--- /dev/null
+++ b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
@@ -0,0 +1,6 @@
+// Triton kernels are embedded as comments in /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7/clbguuj2vb7nlf7qm72hrkynyiorwc3udkaj656f3v5xcdaoib67.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
new file mode 100644
index 00000000000..bd5d2c60334
--- /dev/null
+++ b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin b/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..b2ba290a27dadc32cf1efdced36c497c5f85ee33
GIT binary patch
literal 11320
[11320 bytes of base85-encoded cubin data omitted]
zY*>wk5LyLqp$`25KF;x4rq_apWET}vUK&oN#E~TLV}FY0m=+YZ_|E%A8SD|Y3BQy4lQJo1Q4C}L^?ud&U2_&V^R0DeevG&z5wkN>-pU-2o8 zFL8e5#j@DXIX=3Kkh2RMKhTHY940^2sk%&lmg6@rBjoHyz>f?E@*`Eg&G~Qb4OJkAeXiRK6-EQw>gPqguor8SJ7~yyIX^{wNE3y~C2-%3r!1F+b*|F`-Iq%RBoyM;Zn zTgc%rBk+Yh>>$QUqf)Hex?$HmcTT_FJ$@%M91G8tx-mPJ*A2&W^}f7ZuMLx1Fw?tk zo~_JHw3WSywsJSoRz6@Wm%Z*zlU*@o@t0#)9dB^=vv$i&-dtz;JQ=2=Vcp*^Qs7VF0-O5O~X z?NjTuVxGQ#1p{O2XCMOu6ctWu3TLL?G-gE_GT3>NCC{pqjTS<0Ojiw=pbYlBq)YVO zD|v^Va=pUOt{6VH;Po<)gMqG~QM)91g+dNSjEo~<5Hl-9jQx7O9LmEXt5)_?jby?w z%#UW$aJBCG6l4JuLC;fxY|k;Op3>b8u#W45s{OX0$*IGqY7=x+3Xr22Klx-J|(@ zZagzSP9eLh{8=1Epo1%in28`1yIG+bgi*L!F8g?#TuDjI!dEg^t@~8!CN~S7OdifO zW{W;WbQ2FJo0EPXtCVfUP?oMxEYhM{X{57EWx+r;mum^vcj@UC`5{{x)+J^l>11>) zTlpT@%<`O`fD#^BRzHb)a?Sde^t4Vb!lUVSO7Oz@9;bMqWhxmth1^M};4={5=U$au z+wyFuUageo47{3_JUq7G^UtB|o;`N6Qu55I)$sV9BBU}#PkgY`6Q9!E6W^gsmrAlw zvP#poi6?W<2t?04&->BmMD#ok>K{>TnR*ePXutj_(Uoh65>kCpqP9Iz!ZOFKHA>zs zV}6ulfhe&8x})T4q)e-5rCMgNMAi@{iS@;)GPc~UFnV2a@|~gJ$GJNYC*MlC%9-Jd zw=6b={(910zm`ZL))y&q%2h2#>gRE-CFNzxr2SYk*49YBpWnH$fJ z9~gb$fY??Po%c4iS%A(O&$^yrIruKwGA6y_hGOPv&SEQEjX`W|V{n2tJtf9n=1G^k zp(NzQwo45MzbQxZ(9O|)HkmtENYe#vtaP#c*NO`agypH=8X<C=WJfxf z^&Rw-gCY~$iG2D1_d{9sd8ngNK(2n7N%&W@RPvk*FIGB3kpn;*~dJY?v&rXbbRTEGaGaiA&4kQWTF(xV8C5abhk1O=6Jxr`I2WR9MN z`TE7F)qSuAkJ4Q_KW4c_tkRQ+iMh7t=R3b=`5D`;mFbc6Bv_1w%@4?!{1$2zi>}?o z!~B}AlYRR5?@nvfwk8*Dh){oAGggU0i=Bv37Fi@U$Zo;7h31>Zoa09V^YI946a90W zh=kwq(u`f(?!!^Gld*-(J{)0s7hPkwcfn!$rW=qCvB55w8v$KAjPIdaJ1lbNAiGE( z7x5`IG7KN^VTZo$(4TfkfkieBvN-0$Va&%Oq>W$n{gNi1%JCm^{6vmd1slFqGM)no zvb$EnE4{?u|CN@$d2$z(8urjCdVGd8PR^VNbr^jPCWr2hvpeRbqU{)Kadzg!V26DV zbK-dga4W(VBnz8QfM0&WP69a}XFqKX@^82Mw?HH4XIe2fLCK}-tpt0i6^qf=@WGE? zOtA9_@W*1e0p33#`JWqP`-zu4BKZ^X4toI**-sNZ++jbEutV@jhy75(Admc2C~1|9pV9W9wGQF+tgx1?9u>piVGU-NA3l@`^CQ& zI$_ZNhv2= zMmRje<+s5fvW83glWYwP5oUnXQ^b%>1m^7+lVg}0Ug^1$zRjhrLzhK|HpudKOy;{ z5&r4e&cMG9ryI|Y{>gLssG_`zs2}K2(eV>}J;dAy zeg~rK4D~Pg!{H%7F+w3;3mybZzS05J{>~;hp=fL_{KvImEXv8K42dC`r}Vf(muOB z%fNp{255U0`%1Sv6B7Kf*fe}1@dq#3gI1DHOXOWW#9n#&1-!=uSAUovUW`pk0pg+a zLY{K;w92o8p!ez|!t3(kZ(6YV_g>y3tH|rp(3u{A9+0(Hk$*+*j~jXfdO&t=75QJu z<6T$(fk8&lD}SHBJAQz2a$&2O?2xc31MShlOK!u?L=cJjf|P$v;z^#4!e03#3Im_| z?Em6UA*hSN?-3=h>R5*`;U*-X8qQ03RcEqK=aZ6C_*Ykvzqo=twN$VCl~v?Fmhx%o z5ZR~mx>x>I#J`gNJLLiBv!_uG{RsINlJ#wl(%8nXUz0?@ANB!&9MG2@;6;O^rIYOy=D0Vff V`lvA|NAamVDtL{`=+p=bYKIe*Tf8N3=vDH^4;G*l$=%j;Sl(BY^TA(LZ&|Q|u6%X8m~H z#uAKKK{*Kh&?#V~`h}W+d2a{hda22*`n*-C&or17y3NoE{Yu!V+g_t$&&&mG$+qiG zSebY2f)f_YcD>=(?5aCk2{E)o->K9q^;x^@_@&UPm+fq%Z*w#S(Cn5rRhRDuu0Ic@ zY3}Q6wNY@YNI@m!aIsMj5K1^fd9_MzVQ$u*@tvBhfNG_VvF}zTF@l{AM19^C!bG^(Mim5w6eMJMohVk7wfTszT<>h%AAT05JviK-=jV;5f* zOHyZBdlqIAN679N){x_az?(nY7Ndxv4ZP0AdhiY46EXZ2$uZCQlRf<3m;8!PX?&IQ zD=${Xe#-HEs|Y!IkK_A$@M9YJsZP~Z@=1;#UPZ{!kAUB{HI^S~>V3|CYY%@LO&-aN z#?*9XJV$+UrzmkqYhwDu12A>%yzdbCJ{v}N3eU<`^~C?M*yKxynR7$p+_9393p z0Srr|!kU)~8-B4|sh8%87}>K<&}h`3tvIP#ou$f+nwwfh(V0d($h6i+Qo78}`9*?dsV zd2?azRG1HZCqG*}anf^Y*~t@)Vx^qVmp#Wh;aT1SOZ@t;KYpwK58LI6DZ5qA^#h`4 zpZxYuZ}$J{7l>*Ixfabx=wpeMYnSy?+SbH1UDGbG#FE|wuyP6i^n{j9EbGr}W&$)b z@zKiVOIJSBhe5vtn#Agh+J!_y?*j~(Wkb{7Wkj2c7>E_74x`h~Cy?7e!o0>;NoTf! 
zjycEl?Fs!>7}M{j2f3x&^w>?0=d|Cmgua7@BlI|e$F&cyd~{`LX+{4WVTR~&f*#+Z z$L|ZkFMxO^J83DDE0@B2@>=nQ!}lmD;!rb%BG z`u7WaWOq7;Pe$OUCtwFLR)R{Q>YA2Y5B&x6dUxo(%<>$(6`I!E)P!kyp>Ou&<$7({ z+=89nck^szZlbO1O|+G}iMA$UwsP6)t~6N%lQFF~y5gqTJ@lR8B=+`_nVp`?Fc#*q);{zDm}Mb^%`->*)iuz z`B(+Xo58Amdb1YHGxx1wU~2OWWMF`z!cj}%)Ee{FoJd0k8_#$#bSfpQiO_>`)shLy zV9Oi#iN1d=@32vBR`}U9!^aN1Sq5@2&=EA|ZXBgTA;%*|#t|`ynYALucD-2+O~4_i zUW!tUWWq4aPiE3^wc$q;WC0XGp8%;a<6427IyfXID2vksE(c{_O=}is3~1oU=3%N- zQbJiAG9YlZS=fqsL0ZOrVz@V#g|@juBt%7`gg|1oxvKT;N-1)sB<(>hoaE-zG9k$l z0S-(Xr@CODK9$3vTNf_IJ9DcbEhYwz;hyS9s<=wN5Vy9qGEajL?LY; zKuGljh#Bn)5SBS+0|D~#n2G|NiUo-B?F^7teMwYsZ-5yT{stl>v7QiBvR2Cz^}W#$ z`4XOvLfjt7kb5vq8xg*nQYS31y%v&>a)v8jN!qN zTBz{kYe5FD1-NeWtg0&&dLjxwJ3LYzY=G~(kwsb{Gm6F&MLu|R6m_Z+0n-K%u@g^B z6|Jm@##NEC9>SCkW+MkfY>sPRg7ThY8`{ z$H*dYBJtByKIFNo1~FBT&7`NNa@mP|j(d(Wi?d}!HiZci@laH8h6!FSF?OPwcY{H)w(@^h5?f~9#Hu`u5aBMlP=xGw(k>()#3u6P&I}%xU)MCryLjpgyo28lmrOvyUC3$)NUjlD*pO z>!aIf;~y_2**i(__x0Td_;*t%e~J7VZnGasI8F4CHhWRR z1^C|-bKj;b={I))Hrm@U9H6`1i;uHypnrm4`}|7->|&CA0seCLXo3Kb`2#$l_M@P~ zzcJr12+`jAGKaT;{{m`+%Y(kvPY$5;TI~yMuJ7!k(PrO+0Qvi+Q8wPxlAtq9gTD7t z?976u5zKa4Bb2Z4<{+mJ68(dh*)LDQ@w1?lJn}W(9Om?4qF+q*omm(hjN~tp{0OIy z2)cG=VR$&A?;&}E{9^{E?}Gi2k%+!G3Ao?jxA!~0IKn;{Ws_kN>!NkAO-VuicZZtx z`deSl0|NXwU|o-3_%|4c+GZx`=~2L=xc5gR81=FR`G&slu!oj(c5#;>Vd%Q5vp17O z$MPil3g{`K6Ab#_b@qOenS5cj4l;I#(R`S%aeZIsF#I!jj1vDU{TzoQI{C*N{j_iw z{A=nczm_O5=qG=n02>58@OQA8l=K#l-+WC+I4z-%?4RTGNdIoXzaK7w84dHd8}&03 z!Ql6Gkq_?wE`s}ceh4Q2eQi|yIs^Ef6u|=;>5}xjeSMht`Of-wGI6)BZ6P}C$yBdH zem{QqpmuhE>Sf3uXbXA3pd+7{-t&jp1?2l;lJbjkCK&l8*k}EG{#!IZ;P;tW04>Z1 z@-wiP$l#|2`oJ`i32rg=Zj!yQzwglJ}_4A66gY1YaRKQuvt_4v*@B{xBOe{$j{O_jdqUCk67!Lmw!g) z#aQ^z+$k`7BtcDs(ZwjYP47?_{$VNqMogYu>6RbI#K5N>`&XsB8Iz}>l2`NCfH2_> z5i!Kgy2k)qJTwZjVoe`l6==sD zw3NEt#0J{v#J+6Oq*Z*{7$4EZrfKU3%Bl$=K27`BMQB2_R7ofZX$lNe9FN1rTD5^tykW|#QL+ouljekLRcr;6ohs(8U6w>2ww55OO;ZRH-E3Hm zh7ei-Z=nvo0zR(pwM?%C56Lbnro1%jPLU%?-p2~x7hRP81EGAB7?QEd|y?Yg^Mn6=sD@JW%qG5ZY<>jJMbSwm{@}{ZaLytW4 zh$;BZv}u(c3$2T82(A=!n~7azNt5uV6L{RSYPtNctsQ?wEnWNn(b`#)O;j!M>$~`} zSdtnW>{*xz93fi+*od6(0bV~l6rhNy3B1NO_TX#4hXVKk$6+5+vqBEmprR-|6k+N!@lPWuf zRDH&qs#SAFCY@?Hp8b`&Reai-v>QTk%2cXT?sV02a5S>w(WCB>!_8{Nv7Dk+@+PXS z+)R1&aOH^Gw1(@BTdo|*7MzAVG1GcF=N+*thaICn!$QCN+mFxp{c)3AF;jNAZo3Vl zgg^VkpWo>F?Q4iChFps)64Ew?7CyS79SaYJKGMSBi!3y!wE%p0NehMb(7dLF)1k`? 
z+DQOc0O+B)%U2e(XT$HX(EQw`g}IL|b4Kk1NLSy#3YLqZ(B-QiM8euz42odl*C%NV zYkM)wYcbrlX`0S~whvE_}#KR-V=uiZtUNjJ{N-FX_|CGA^;JWoywX`5zMx+UpW=~DivPvN$1 zBCv_u-2B|a!le&IT23N;n59Ek=8(g5XyJpaAAEd?Ct-nS{lkU%dCD5n(55s{VT>k< z=G3Zs-WbDkq~%W9p5CaKJJS|3tWpUlOT%n9$83F+P6>IE=myS~$*P{wjY3`*C&!bS z%v1VT_2WB7cIfmO9owPjY0Tpzabs!VhCxrCxbI)-qEqO}%-FxxmnVH$=-(;qk==X_ zpNzoghhYaXIt`~#wspgb_+RjIN!53=B|IIISt1O08*3i!@}g^CXL&<&=yTLT^l!4Vj<}_PnG^^qtFj zhn;e*!p|-nKDOYsGLVCTB|)QhN%RVZ9E=zlN5mjzmWvqs^;$VJ42P_0$xk(s3Bxcy zl1anWn(I@L1yBTim4HxX z&BUY0!UZmO)G67Dp(H&4hcc)Z8mTN%RWOjr#Ztoa9eTP&Zpe;?afumE)-yU*t^69< z%<_~ThZ5ntegfs>n)NU1X`MP`8K(pw=qS zpGV4rZQ$FuZ;=+pgrf08kq>o*+@*?J%R%8HJMjcmQ77t~Qr+D7bbL&+@CfPW0&G zKRc~b+p1i+E<(L=RjeX~7CRn0!6K=Cb`$Jw!Y8s(%*l<&P+&e5VQs>FVFQuy8(x~R zt6P0I%C<8$x6y|qOkYCR*sV+8Ful(W$cI?}5||qST|0<(p_@A_a(h3!NG}%g8Z|Nq zAMjF#-s{juyDiTm>-$+8^T9CYqY=`^FM4-Llh5S%4>^8{@rq!0KvxPpM%KuAs+fxW>upQlBAMfw5?_*9pF9U8x z*qmfxvkCCaH`qxaXXEUrt$zOQcJC%=1pRa?#>OeZbiJ8iFSTMZdK=#V$%_egAp!na z>=wX#`y~G}{cJDsl1C(eJl< zyOLgrvmwwwgFl^fFRml|8{GjOK<8nC&#(`{v%!Wigwj28l`$g{CY*BOY0R`Qm<3bN8)}wcZOh}9K}WU zd44!mz_S$phCcrKfAbeNuj$~Vl#gG;0rwC2F#2!AOJFUw_mDa^yM=A_!r@oX z-#x}{%$@|fYIaD@7q+b4N99H;to_{|i%%hfchD6b;w2YOU={5W3^F*kzWf#^C- z{R{qZcmPn0P>2`#K{?Njvx~^zyNSqv%Xt5>Hu8=9#N22{=z~1s13z|a;^+Az{wNSg z0rCrd>+&QHI_3Wzzz-9dVB{C}AK``pv9`u8C)jf{tPcXLZ!-=6qO1Lm=~Tuk=hx>M_^-$S zZO>v~S?bQX1b-wp4WCH-!Hf2w<>b>6c}EwqTb_Oa?=r#FALNG@V^dOqc<8*4ryM=8 z@@qfn-8zZzx_k(n5^VmxoA=NP^13v1x=Wx7WbGB?UzYphx-Nk(key#a{+IH2-=hA3 zK}OIme~-Y|{Q%|U!e%kqCSg?u+M|P)+=lDpK_upvrTl9WPx5pWcFQMG82Hp<|LpBT zP#1&WBT8P?u?AtnjY~c?oRjjZ&Sam?CncxwudX2fmu2LsrMl%WuOL4s<wkk`9($fN%F+kEk6SL`0H5%c_~jh4_-=M;pqQwNM0@%g=TH_MM)CKKCPd& z&mksBX`kr9t9_S%WEy-D*$Q3>Ajqd~`JYMoATQ0kg5r|P5Zs8Q8RjX`_VzkXemfErZ+TG#J9ARf9+geF35A?8Wv*>)p-l z*sNm$o3uhzRS{I4ctbq#(1$*P)S^5a~JJOX};`XYJ09zu9mCwwo@$H>1kir_O&z?9Jf^5F+FanQF51|FvUL3 zS892ug0RaT!-ZP4;W<@rl5%MkOZlaFd(L$hOB$$@tC+i`iX!^3(}Aci+ftb5cEf2j zq|g?42f6PS@NspoX?sn0NOrMG+DofmE-;eheXR6-!$svk5ZXtDAsMGqnRlJ~0!LqW zYhKM;sUyOGzSQ=TFRSu9x_@8Wv{JgLPc4=ft=imNqvXl*R+~rDQS{iw%eID3J@)Wp zw&b@{wo@rPC{ySQ?h5;$jjqvA5L{pqk9$UMjsLZM;_sf$eg3WX$+nxM-pb$ICd**! z8DgMo0~$CQMg!QGnrDDFUmFTgGvrCNc?ZWp&$xnS+Zjeii-@SoQv-7|Y4FvL|RKCspw|DWkajBzL(3~zq z#Ga!u*>}FxIwa!l^Q}X1o(kp}@6w_W%2q6gV5DqJXJ8jlQBenDB=V2`-MB}95s?XR zu|DC|+`>Y+T3jk%X3sl~TCIAz>`ctKLKJ4))7e@vKVLXocN)!REt_pN3*P)pVY1oC z0if5LxwbN&&md?yA< zzwU5_7y>7&!X2f_tnu@G_W<)T*Zk!|jb>K|gU%>ri zqX+kE#vT%5CL;=ju^Rm_5@Fiz}S)2j{wB zP+nf9ywDn1%O$r_u2s#PHG?NN$DJ>EW@FJll5&KFr!n;UhTSNiEt%8Qw^eU9Ep+?& zs+l&ee9n};{ONT18S|&+>XE5&lRnck<7SS=96ky+lLBrM^wha~|D7&67oJYfe67A5 z>B~U>5owR?=Cb&tB|bL^JBYE|DCa9B(<)Uxcg5UJ*F7Yxx`QV=(^{IIG_AVlnq4{B zj((f9+o|zeV<&x!>|}0{o$M{LGkJ^bOa<&@Gux_MRaw&kjo)Pj&xphH-QrBuD;Ldl zCMQ-2oX3{I_c1duR=+QI3_XBRd4!f2bMMO@*+9WJpo;fpCV>+NPuTbxSuNOkd}XYr zj#BHY6y;X5@`^V-J#Id|6PxdudpA*zcD57B(@;*);25Sb7HiAak_ zBIL$G#Zrl(z{Xsn=Qi^U4R$9y%WN7pI_pj-&q8@eut?_^N`pcTMv7PvDTwJ!BF&)P z?}RgxaK@ zF{R-a{1W-OlA^g8CQ?OYaNHqRjRjZNQ3eMh1i_JkR-EC!5Ll%~+%p|1$ly>05)40? 
zhN;F<-WQ^(WH)dpFBX?7*0bfJ?@CeGgQFm`IlUiAvM7reT#)rE0CA6-sb=cl00wR| zW_GNeHnF+Y*U04zCu|kEWJb($SPs@~{@6^Jl++Hh9Db2%2-d@95n;$IBL5n*oVH2d zY-Yo3cP?a#k+V=PIKM@zF>)&F=K`q$0{rYl!7Vvnsa~s;3o91B5Enc=n&Ac0iOkWX zrR8$Lvnx)+$jBjf_Wxq-Jp{7%TSkq`FpbMg$%@JUPMHJ6>q%p9M3;JEB*nl9_4lP(%( zT+edqc>me7=Df3(X6D^aO?WE?UW?2OoS=@Y#Ijso+%QNqK;T$-J+SjAN$U9^OIe!B zK9Wn(1#QLhvNg9mCrXEcbd)#Iz^QEv~-jVH}gwMY+W=kWcpQQ}6NP;dI2eBLcBGVs(iZgEk^;E9U+i5`rbjVW`^AOTzP9*1=EUdYm&+}(J8EL!ak+sxn7q{_h6hE>=bYxeC% zDT}@+^#AQ~asEoQk85k6)hDjTL=W=M7x6=XPgA|*B^db{YxWD`=qLHB$;ex0LB9Y5 z`Oo?XSpR^eM_+BmLhIENTCbiSTCbSDUOhdX>lN!*uk%-8{p``cANGlS0{+Q9!SP*v z9R3Zl{UNgd(Sg|cE4>4Le8mXp!%QD$|0W<$j^k4CoxjpI#2KJ`yxtEw?QWkQ8EeKl z{y52BzKHzo9`^a)=m*>%_x;0qPI-gq1Fd7EUR*sOM!jhCPw)O{hIJyJqn=^-F!C2~ z$kGrdI8yz92WUM7!>F%4wEm*?ym)X8>wh4oVCcSKh+AJ^<}=n`hLMM{4q!9$Akd=O@@6s`NtQyXg|lB27Ur*$t?x_c9QAw=a8mfc0)hV6&xw() zmLWb#iWgTzocMe9;cy_j?r%iDmK2Y!d?~2sx7)ul#HFMd@#zryKY(rf_^%#%z<%V( zr|{z~PftGa@NxUpFQ0gllKB1geFpxkGC*xc_LEMxW);|qcH)UYcu^nOOn$#YCPKup zJpE!DGQsK(sbhFS2@ntUEpF!+N0>{9qo9X%66t02(72%Z^bqEKbPM?}l>DBMKnN7x z7V<_yS{w-pgh27u7V`VlF}>zDW2rAJum7D)$EU}>yX5SkLg;1SvqdvuT$1zpYyd~o zl#+i(;Yps(xv>0REDU_AD>i?)q2x`43ES7}NdJ#OI%gH19!B=bnOQ6*8c+u+Td&YSVCDQ8$1NQM3XEw*biiH&L#MNSNxVAt^?V6V zOQ~C?X`qQtd-*VF9}u4=HffrKCN@o5KLDyGw4r^N_OXl5glOrKP!Q4>-tYgsoX>W$ zmo%-miBodV^M8MRp67h_@p}*47Y>E8o0z!5?ANR%=QO-^s{q~_72~5sdw2)VMG!U_P3SnOeEYjM}_WsZG_H;n_{k@Z5@5ubEE0VouF9?6PUr zEUz+en?=hj&6u^iJ8M?$>57M`>A6;=R;f*!GnQNStlEs39`_Y)kg5iB+vSZ_=GqP0 zorl&G_jbBkFIrV(q2h74RIfDY(Ug~lQz>yI$@^I0d&foTKM=}Ci6I%QTAg++ zXNE`bxOK1YEjWlUpf9xn6%d_zIqx4g{VI!5b^F7!_N-B#nrhgd=zF=e6dkJpt9{;7 z@WT(>`+zC<&6H_XD;D|}EfHKU<}MSvOIN4x&J%dtGiuHJuPq>dS#@s!|Iq^4n9Woz z^vk*gGgy`y8}3`2BREQS2Cxx1zYBQ%`H=ucOpCy4Y-=Ch1wItO4@r&%&Y$Sxe^&A< zKBe*doL_m-5qp8-qaB2t{hs3o`ta9;$xn5-mH`K>TYSKQY+7uFf*sEM!jD9TE$8puCwG!eb!EPP;_vVt)yL>ZzQd{ zS4mbY#iX;~&D3jIBb`b%DxQ6tW0ekB({@8BPMS(}(w(b$6`YT(bhv(`UUA3g%K6&i zVyT|5P2?(>{GsuM>G{K%Bh#5fj#Diio?Cb#SI<;SPh@lX3By@npwD+9lL2~iCq*W!wVw5_3)_bzBh!o#8Wv~c(=3oU6)0PmjDLSa3$ ztZCs?==_Ry9KZztdT8nVg%$0o@Ea_&ymW45>AmxuQ9B0G#kVhl$gofww2LEN=Dnl6F1A7bI>!^pSBD?pvmT>&5h4l`|ANV}R;+(b8WRr@O4 zcG2xga!b3Grd#N?pKil+b8%Z5W-)Ps(D3RW9RXX_$@55nIpGDIrf1-N4y0UDMOL zQ7q`<gSq_OOkG?oh(%VsXU-()L{2h?8nzRR;)=p)KuoDyX{GoEKh37kf^ zwt+*r?rw<#3#{OYA@|Q)v{Pd zMp5#nv3wufu!ZyZ?W4_{L1tkSR)I z_ZxMIzGpS}uxD;q0y3*60K4&qS;)ddSJZW>NY zdV0K&nJ5&p6tb%tAcKPmbZ})6bC4hu(|H9?%Vni7gHvqO&2VZ4u9CTG-KSDNxx|w# zz?sHe(Wi)h;?ZQ_0~dUvQnnRCS-JuaNOFs6wUN#;l?4OY?9>vj@77aI@&H+}u37)Ip37H~6W!MzC7N;_Q9`OOO4PO|N?7KY^+w6N zCGSU>4@5c98zo;OWm-h5)iRAGvW_rGtS?TLu}-(b=q1I;H->^A=bk{Ed?)EDXPPhG zve*>*>q)!)dLo5bU!=$>SJjEs&*OSa%FC2a`LU)m!C+}m;lWOcqb+c0G%}syPjb=cv0Cjro1DDV&-YijAKh%gF$R;*em&_r^L9+Jn3>b zl!UCaq4;|vz zvZX>taP#c*NO`agd^Gnh(!!WhG@dB(!9mD9s<^c*6n5B&C!mUYv0D|HcaRT($;P{}Mk5A*el zQ>*)64IZVtb$-lpi&&+{5EFB4&(C*mZ|6DNu9fMb^cYx-hRqMiLHS+xI9Arfz4Vaif!=dAf+*qjIpaQZln2DF(*$VBZ2uugtZ9wi7iCJ zcY7(uF7EWvvJX{!%MnBGO#*tK17nBMOOMqp>#oCg#NR z0^nwZElC!(m;k?gb3F>=Vx0Y;8RPG4@7xBBpr33GvPnuLU9TnBcbbEP^cFn!!E*_A zCISAz!D|5DIUxC;jIle3mpmf*qwzNT1|YJZB6zsXo|Uje@JO3|OTy%TQ`D_Z?~;F) z1bv{*zAov-I2!@|Bly!k{oE$9zttV$0krQY_!QgHWJBzx5Oa!48tg}I2K?g}U%a&) z2K~2)x2d&vKpoo#C0w4~uS4G(r#7)S6D-=4FK^qRBi>!jC~KiXhDaX#qs>^9S}n%) zJ;+Wx6OQrPZ#TBE#R%wePLC6PF%gY6#hPHG5Bd1T5d?M;3Me12H`*NL{KG=D-kP1~@&5{6~7~744~4G)nb~`1OiLyXzI{s@I8UB5^;S8$z&8 zj^ZNwJU^T&;6;jm%K(4vzx`8N*jNJa=3(wJ<>N_aX+Dhn#~NboFdZDJIN&(17l#+aY^NKE@C-G1Nxt0cdwxS zJ0d&XXq5IZz8N6@k`CMNYwQ*1Z)!+S3oYA1e8~vt*eXc=qQ>4lrC{)X0GRj_k{=r3 zpN_2y{CD6a@4#bm(l7YYPZS@&u;tPE)z5JKf*=0sXE?urUnl-J 
z-^=({U|0{jtBHJ&{WCiVj!-{y`y}s%<5YhRzm{Y-x|&866Bz;(_ws&e4f!!?=wy#T56GTgL;gj%KW^#~=mFW8HROLL zkN4Zu4;N$vz4EsRyyFKbCug^d$u0@2GSD6!yp9OKnG7N^k6bShFG)Pf(^1$fKZe4< zr#|~HOL;vYPg5nY>PWxqAlxJo1J|FVys9(Vr}Ih4Df~~@kY8Fwo?5C`{;zAu$K(f! zl=O-0(|O%1zYX!P=6^xTD?dq|_YMQ_h2zl2!JkIs0t{@pd6u8Jruhd WRejVLl%x1m9u>bDRbJJo +// Definition of AOTI runtime interface functions + +#include +#include + +#include +#include + +#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ + try { \ + __VA_ARGS__ \ + } catch (const std::exception& e) { \ + std::cerr << "Error: " << e.what() << '\n'; \ + return AOTI_RUNTIME_FAILURE; \ + } catch (...) { \ + std::cerr << "Unknown exception occurred.\n"; \ + return AOTI_RUNTIME_FAILURE; \ + } \ + return AOTI_RUNTIME_SUCCESS; + +#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ + do { \ + AOTI_RUNTIME_CHECK( \ + actual_size == expected_size, \ + "expected " + std::string(name) + " vector size to be " + \ + std::to_string(expected_size) + ", but got " + \ + std::to_string(actual_size)); \ + } while (0) + +// AOTInductor uses at::addmm_out, which doesn't supports +// arguments that requires gradient. For this reason, we +// enforce no_grad context for run APIs. +// +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct AOTINoGradGuard { + AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(false); + } + AOTINoGradGuard(const AOTINoGradGuard&) = delete; + AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; + ~AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(prev_mode); + } + AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; + AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; + bool prev_mode{aoti_torch_grad_mode_is_enabled()}; +}; + +extern "C" { + +AOTIRuntimeError AOTInductorModelContainerCreate( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + bool is_cpu, + const char* cubin_dir) { + return AOTInductorModelContainerCreateWithDevice( + container_handle, + num_models, + is_cpu ? 
"cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0\n"; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_single_threaded( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** 
original_fqn) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *original_fqn = container->constant_original_fqn(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( + AOTInductorModelContainerHandle container_handle, + size_t idx, + bool* from_folded) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantType( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* type) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* dtype) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *dtype = container->constant_dtype(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( + AOTInductorModelContainerHandle container_handle, + size_t idx, + size_t* data_size) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *data_size = container->constant_data_size(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive) { + auto* container = + reinterpret_cast( + container_handle); + auto constants_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { const auto ret = container->extract_constants_map(use_inactive); + for (const auto& pair: ret) { + constants_map->emplace(pair.first, pair.second); + } + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update, /* user_managed = */ true); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle) { + return AOTInductorModelContainerUpdateConstantBuffer(container_handle, + constant_map_handle, + /*use_inactive*/ true, + /*validate_full_update*/ true); +} + +AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->free_inactive_constant_buffer(); + }) +} + +AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( + AOTInductorModelContainerHandle container_handle, + bool use_inactive, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_const_fold(use_inactive, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->swap_constant_buffer(); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumInputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_inputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_inputs = container->num_inputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetInputName( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** ret_input_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_input_names = container->input_name(input_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_outputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_outputs = container->num_outputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetOutputName( + AOTInductorModelContainerHandle container_handle, + size_t output_idx, + const char** ret_output_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_output_names = container->output_name(output_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetCallSpec( + AOTInductorModelContainerHandle container_handle, + const char** in_spec, + const char** out_spec) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + *in_spec = container->get_in_spec(); + *out_spec = container->get_out_spec(); + }) +} + +AOTIRuntimeError AOTInductorModelCreate( + AOTInductorModelHandle* model_handle, + AOTInductorConstantMapHandle constant_map_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto constant_array = std::make_shared>(); + auto input_map = reinterpret_cast*>(constant_map_handle); + + auto model = new torch::aot_inductor::AOTInductorModel( + constant_map, + constant_array, + "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models + "" + ); + + if (input_map) { + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + } else { + model->load_constants(); + } + + *model_handle = reinterpret_cast(model); + })} + +AOTIRuntimeError AOTInductorModelRun( + AOTInductorModelHandle model_handle, + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + model->run_impl( + input_handles, + output_handles, + (torch::aot_inductor::DeviceStreamType) nullptr, + nullptr); + }) +} + +AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = 
reinterpret_cast( + model_handle); + delete model; + })} + +AOTIRuntimeError AOTInductorModelGetNumOutputs( + AOTInductorModelHandle model_handle, + size_t* ret_num_outputs) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = reinterpret_cast(model_handle); + *ret_num_outputs = model->num_outputs(); + }) +} + +AOTIRuntimeError AOTInductorModelUpdateConstantsMap( + AOTInductorModelHandle model_handle, + AOTInductorConstantMapHandle constant_map_handle) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto input_map = + reinterpret_cast*>( + constant_map_handle); + + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + model->update_constants_map(std::move(constant_map)); + }) +} + +} // extern "C" + + +#define CUDA_DRIVER_CHECK(EXPR) \ +do { \ + CUresult code = EXPR; \ + const char *msg; \ + CUresult code_get_error = cuGetErrorString(code, &msg); \ + if (code_get_error != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string("invalid error code!")); \ + } \ + if (code != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string(msg)); \ + } \ +} while (0); + +static inline CUfunction loadKernel( + std::string filePath, + const std::string &funcName, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline void launchKernel( + CUfunction func, + uint32_t gridX, + uint32_t gridY, + uint32_t gridZ, + uint32_t numWarps, + uint32_t sharedMemBytes, + void* args[], + cudaStream_t stream) { + CUDA_DRIVER_CHECK(cuLaunchKernel( + func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr + )); +} +CACHE_TORCH_DTYPE(float32); +CACHE_TORCH_DEVICE(cuda); +CACHE_TORCH_LAYOUT(strided); +namespace torch::aot_inductor { +namespace { +class AOTInductorModelKernels : public AOTInductorModelKernelsBase { + public: + CUfunction triton_poi_fused_convolution_0{nullptr}; + CUfunction triton_poi_fused_convolution_1{nullptr}; + CUfunction triton_poi_fused_convolution_2{nullptr}; +}; +} // namespace + + + +AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) + : AOTInductorModelBase(1, + 1, + 1, + device_str, + std::move(cubin_dir), + true) { + inputs_info_[0].name = "arg2_1"; + constants_info_[0].name = "conv_weight"; + constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[0].offset = 0; + 
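+  // Each constants_info_ entry records the metadata the runtime needs to map and
+  // validate one weight: dtype, offset into the constant blob, byte size
+  // (numel * sizeof(dtype): 5*3*3*3 float32 values = 540 bytes here), shape,
+  // stride, layout, and the original fully-qualified name in the eager model.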
constants_info_[0].data_size = 540; + constants_info_[0].from_folded = false; + constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[0].shape = {5, 3, 3, 3}; + constants_info_[0].stride = {27, 9, 3, 1}; + constants_info_[0].layout = static_cast(cached_torch_layout_strided); + constants_info_[0].original_fqn = "conv.weight"; + update_constants_map(std::move(constants_map)); + update_constants_array(std::move(constants_array)); + in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])"; + out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])"; + outputs_info_[0].name = "output0"; + this->kernels_ = std::make_unique(); +} + +std::unordered_map AOTInductorModel::const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization +) { + + if (!initialization) { + std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " + << "aot_inductor.use_runtime_constant_folding=False\n"; + } + return {}; +} +} // namespace torch::aot_inductor +using namespace torch::aot_inductor; + +template +static inline void call_triton_poi_fused_convolution_0( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_0', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 12 + 
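+        # ynumel = N*C = 4*3 rows and xnumel = H*W = 64 columns: this kernel repacks
+        # the contiguous NCHW input into the channels-last layout (strides
+        # {192, 1, 24, 3}) consumed by the convolution below; xmask/ymask guard the
+        # ragged tail of each block.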
xnumel = 64 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = (yindex % 3) + y1 = yindex // 3 + tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last') + tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (64 - 1)) / (64)); + uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_0 == nullptr) { + kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_); + } + CUdeviceptr var_0 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_1 = reinterpret_cast(out_ptr0.data_ptr()); + int var_2 = ynumel; + int var_3 = xnumel; + CUdeviceptr global_scratch_4 = 0; + void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4}; + launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused_convolution_1( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_1', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 15 + xnumel = 9 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + 
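+        # This kernel repacks the (5, 3, 3, 3) convolution weight into channels-last
+        # order (strides {27, 1, 9, 3}); ymask/xmask below zero out lanes past
+        # ynumel=15 and xnumel=9, since the launch grid rounds each dimension up to
+        # the 16x16 block size.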
ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = (yindex % 3) + y1 = yindex // 3 + tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last') + tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (16 - 1)) / (16)); + uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_1 == nullptr) { + kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_); + } + CUdeviceptr var_5 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_6 = reinterpret_cast(out_ptr0.data_ptr()); + int var_7 = ynumel; + int var_8 = xnumel; + CUdeviceptr global_scratch_9 = 0; + void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9}; + launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused_convolution_2( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_2', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 20 + xnumel = 64 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = 
xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y0 = (yindex % 5) + y1 = yindex // 5 + y3 = yindex + tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last') + tmp1 = y0 + tmp2 = tl.full([1, 1], 2, tl.int64) + tmp3 = tmp1 < tmp2 + tmp4 = tl.full([1, 1], 1, tl.int64) + tmp5 = tmp1 < tmp4 + tmp6 = -0.16373057663440704 + tmp7 = 0.04603243246674538 + tmp8 = tl.where(tmp5, tmp6, tmp7) + tmp9 = tl.full([1, 1], 3, tl.int64) + tmp10 = tmp1 < tmp9 + tmp11 = tl.full([1, 1], 4, tl.int64) + tmp12 = tmp1 < tmp11 + tmp13 = 0.16525162756443024 + tmp14 = 0.022457100450992584 + tmp15 = tl.where(tmp12, tmp13, tmp14) + tmp16 = -0.08230065554380417 + tmp17 = tl.where(tmp10, tmp16, tmp15) + tmp18 = tl.where(tmp3, tmp8, tmp17) + tmp19 = tmp0 + tmp18 + tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); + uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_2 == nullptr) { + kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_); + } + CUdeviceptr var_10 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_11 = reinterpret_cast(out_ptr0.data_ptr()); + int var_12 = ynumel; + int var_13 = xnumel; + CUdeviceptr global_scratch_14 = 0; + void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14}; + launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_); +} + +namespace torch::aot_inductor { + +void AOTInductorModel::_const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) {} + +AOTI_NOINLINE static void check_input_0( + AtenTensorHandle* input_handles +) { + ConstantHandle arg2_1 = ConstantHandle(input_handles[0]); + int32_t arg2_1_dtype; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype)); + + int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32(); + if (arg2_1_expected_dtype != arg2_1_dtype) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dtype, " + << "expected: " << arg2_1_expected_dtype << "(at::kFloat), " + << "but got: " << arg2_1_dtype << "\n"; + throw std::runtime_error(ss.str()); + } + auto arg2_1_size = arg2_1.sizes(); + + if (4 != arg2_1_size[0]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 0, " + << "expected: 4, " << "but got: " << arg2_1_size[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (3 != arg2_1_size[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 1, " + << "expected: 3, " << "but got: " << arg2_1_size[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_size[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 2, " + << "expected: 8, " << "but got: " << arg2_1_size[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_size[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 3, " + << "expected: 8, " << "but got: " << arg2_1_size[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + auto arg2_1_stride = arg2_1.strides(); + + if (192 != arg2_1_stride[0]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 
0, " + << "expected: 192, " << "but got: " << arg2_1_stride[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (64 != arg2_1_stride[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 1, " + << "expected: 64, " << "but got: " << arg2_1_stride[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_stride[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 2, " + << "expected: 8, " << "but got: " << arg2_1_stride[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (1 != arg2_1_stride[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 3, " + << "expected: 1, " << "but got: " << arg2_1_stride[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + int32_t arg2_1_device_type; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); + + int32_t arg2_1_expected_device_type = 1; + if (arg2_1_expected_device_type != arg2_1_device_type) { + std::stringstream ss; + ss << "input_handles[0]: unmatched device type, " + << "expected: " << arg2_1_expected_device_type << "1(cuda), " + << "but got: " << arg2_1_device_type << "\n"; + throw std::runtime_error(ss.str()); + } +} + +static bool _check_aoti_runtime_check_inputs_env() { + const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); + const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; + return result; +} + +AOTI_NOINLINE static void __check_inputs_outputs( + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + if (!_check_aoti_runtime_check_inputs_env()){ + return; + } + check_input_0(input_handles); +} + +void AOTInductorModel::run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) { + __check_inputs_outputs(input_handles, output_handles); + + auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); + auto arg2_1 = std::move(inputs[0]); + [[maybe_unused]] auto& conv_weight = constants_->at(0); + + if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { + AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit."); + AtenTensorHandle arg2_1_aligned; + aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned); + arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned)); + } + inputs.clear(); + [[maybe_unused]] auto& kernels = static_cast(*this->kernels_.get()); + + AOTICudaStreamGuard stream_guard(stream, this->device_idx_); + static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L}; + static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L}; + AtenTensorHandle buf0_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle)); + RAIIAtenTensorHandle buf0(buf0_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); + arg2_1.reset(); + static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L}; + static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L}; + AtenTensorHandle buf1_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle)); + RAIIAtenTensorHandle buf1(buf1_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + AtenTensorHandle buf2_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle)); + RAIIAtenTensorHandle buf2(buf2_handle); + buf0.reset(); + buf1.reset(); + static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L}; + static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L}; + AtenTensorHandle buf3_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle)); + RAIIAtenTensorHandle buf3(buf3_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf2.reset(); + output_handles[0] = buf3.release(); +} // AOTInductorModel::run_impl +} // namespace torch::aot_inductor + + + + +// Compile cmd +// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o +// Link cmd +// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms/clxvzwn2a5v7ypw7eq6fysn2555bpqqp3ckvq4a6v5o6aba2rxov.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json b/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json new file mode 100644 index 00000000000..bd5d2c60334 --- /dev/null +++ b/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json @@ -0,0 +1 @@ +{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file diff --git a/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin b/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1be0cd3083897a28e082defde99cc1da6a9ef442 GIT binary patch literal 10936 zcmeHNTWlQF89qC+zQ%ZyfYP9ZW)k8O+I046dlS-N2uYC|L`vWVM2Fekv3JROcQP|x z)-frYv}&tf0MVyD74->G9{SJ+T2*Zx03n1BQq@*{OA)H7HjQkPRFz8Ee*d|QXLs#Z zI8H<*j^a7zzhBOO&Y3xzXO12}rp02}0VbNpe$85POkMd-0hITO{?VHrWm7EA`tiJl z#TYaFvLASXUBF27^Roixz2%o{r6x0L4YN|4t}`=mnt>U3m7rd;+^DZKc=u2Ju1c)lVj?Yi)y?Yg^LO(g{%$oJ-Y2OJ%$;(Bh=vW9?0Cm%Ta zfF<~?lx0^dHcA)O5M3$OUJJWQN0IQN6L>r`YPtNst(|a1b*}x7wRSdS6IDz6`Yyf< zmZZ)$_bki=j*#6EY+Q~H18=;zH9`?X3wWJv>A}~5k45mCBu9huCwln5E%_Cn()a@B zS6-}${g~tXRuFRZHpln(;J0Yxr#e+v$fr4ecm*LxKLCE~=16{|sdqU4%{}~WG1QJL|dG(%gJ?@~KK;E+~5L^rY#|v)Hfx{KMD#f45Dpn6guK9nU9< z_R(*C|62bqe~zezkZaN6F?~yH`RZl;jJ7#;RoAqOEVigO0qDAxis>;tE-&lPXhw`b z7B4R^US7O<2?Ss?dMs!cV==uC!{tlMOL0wqixGM*WFS`9If71mH-^Oi7Pd9ML^@Lq zbi^~JZ;R3Ntk$ZJ33;A-4R- z(&@*IFBuC5#`hcaH!->2$Wx!kpTte3fXjiNI(yfD(?!R}W9iBNsV`6ZGSGiO*dx37 zEdJ60pU=S#VyyU;Le()$rxtkg#s@gZ>`wZeIE`(+WMsxC*aCqwD06&<8AD_4 zp~=HoyBt+Oh(lxEpqbTO8X1QYea`xa8$Y7dGkLNR!`Fxfl^3?2RaMXYft}H@|5^&ZC z0+iy!R0=aVm&Uvdr)J;^nXA@ADiw=IJO?JAZmtj#M6GaVGjM^+Jyj_=ilHPu0f#cE z))A>JQPR0Sx#c)CLyDKFZ^vSZ;bIWQxEbEk7*;8_tk4}pqC%&<(jh)k4J 
zZ-A(@bp!~ho&YhUT>-)}$E+hjULF%+fD@4bC%XgWYobJpVzn7eV@a$dLK5om>$F5Gd(|LSb$z@#rOm!4NSNrF$1sKKm`Lt$Iu(YQ z&P3y+t%JuoE%vtzpLeRH>Fi`?^2qo-N5poe=)9Pzt|D~ipY{UNc5x}#G^c|zrefw9 z&WvN{T7yCCU)b9C-loL3%RJv$OSI`pLRM_9Gz0LPYCxWxS=zRyv&Zr&YS31yG|cI_ zYBiMP+K3Gs8tS))&eyY~=b?bMVP%zXwZe8bBX(#-;@Qe@f3uXz=~>(KYM~@gR+h{2 zRN!3i;^62kSHp8bg(u$|(s*ycRh(;99jVae{%3jid89m8AK${EMOqHiipCQ~KDcxi zb*d5xf%z47;)$rDGVD}E&br8ebTAV-7$^rt8o3j>)DiB7vK;b=baL%0Ji@)2t#aq` zlw^5MvX~I=eS|DBD-w^a3eROVW~6duUjIU+yobKF@sTt zoM3>FPCaxINA-wQa$mTJi>j3=%F?SWj-}3%GkR9;H~2ZqO<`%CMJ&v7f>1;5AlJp; zj-6VGUMA0i#q=G10uIUBN4rq)oCe;)*VWGU=;OcitW(>%T(~Ymy>ZRjMG9?bVF+1_ zCkNS0Fif|U@hxIZe#Ey%#*=Z@BHU+2h=kwGQjA^M9m0KVA7hJKLO9Nh4!X{^cfkCD zC?X$YgB^5k1a$ob?jLuwS$t=lP2ox}z8Us$7em)E^rzpNXYt`dHj43P4daP8Y2uCU z5NYs{?EfwM57A#1EV!s*Ja!4P+t$D-Z#^&vlF&G-QE$BF*_F31lKhVp+I1>7I! zx9vMVJHp;0eZySeu&^H+YmRVugv%$ibMwQ)q5QrCu{Uq2>IR} zXxb~U-!TaY@PmMLJ%r)kpf75c8K9>|0gvLIAB|wt$0p<(`o7NYUDV-!T*Aj&k^kvYKL?;;Zf9UL;1T*-8X&q*4is#>WiR=3chvA>GeU$iD=&y1(q?3QV(N76` z;9pZmxwS+|K|lEm1z7a>gTI5#grv84{KiW=)PZ#;%{_i5VpXY~Q^54@&#ji1d-%Jubpph;~zunV^iJ$M5ZzN*3d)g+V(>_b} zH{|u>cMfXj2WUMFc>`@B4;Xah6XkJXid{s$-%n6}QSSsJzXW@%pVv=|<_G*96APk+ z`9OXK_7NHU)IcA|6Pe%^qxJEvdB*Fne+M>H#PfjKFIY=we@(D2&wt2hpKQ0kp|cAK zwmqan-zN~ZtRs)ycaL@K;Yab~EFYeD=;#sah}P zb0aK#cxZ>f?3DyH4e>5UxsAG0UHB)Y{LdotGE9DFn;?W_6-$$P{| zx)GwuReYOyK5CCoh5DdV0;-quq(0iBa(r&r$fueg)kozhK9xshM)fMMs#kLV E0n);sHUIzs literal 0 HcmV?d00001 diff --git a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp new file mode 100644 index 00000000000..cc963cd88f0 --- /dev/null +++ b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp @@ -0,0 +1,6144 @@ + +#include +// Definition of AOTI runtime interface functions + +#include +#include + +#include +#include + +#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ + try { \ + __VA_ARGS__ \ + } catch (const std::exception& e) { \ + std::cerr << "Error: " << e.what() << '\n'; \ + return AOTI_RUNTIME_FAILURE; \ + } catch (...) { \ + std::cerr << "Unknown exception occurred.\n"; \ + return AOTI_RUNTIME_FAILURE; \ + } \ + return AOTI_RUNTIME_SUCCESS; + +#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ + do { \ + AOTI_RUNTIME_CHECK( \ + actual_size == expected_size, \ + "expected " + std::string(name) + " vector size to be " + \ + std::to_string(expected_size) + ", but got " + \ + std::to_string(actual_size)); \ + } while (0) + +// AOTInductor uses at::addmm_out, which doesn't supports +// arguments that requires gradient. For this reason, we +// enforce no_grad context for run APIs. +// +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct AOTINoGradGuard { + AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(false); + } + AOTINoGradGuard(const AOTINoGradGuard&) = delete; + AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; + ~AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(prev_mode); + } + AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; + AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; + bool prev_mode{aoti_torch_grad_mode_is_enabled()}; +}; + +extern "C" { + +AOTIRuntimeError AOTInductorModelContainerCreate( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + bool is_cpu, + const char* cubin_dir) { + return AOTInductorModelContainerCreateWithDevice( + container_handle, + num_models, + is_cpu ? 
"cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0\n"; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_single_threaded( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** 
original_fqn) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *original_fqn = container->constant_original_fqn(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( + AOTInductorModelContainerHandle container_handle, + size_t idx, + bool* from_folded) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantType( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* type) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* dtype) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *dtype = container->constant_dtype(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( + AOTInductorModelContainerHandle container_handle, + size_t idx, + size_t* data_size) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *data_size = container->constant_data_size(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive) { + auto* container = + reinterpret_cast( + container_handle); + auto constants_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { const auto ret = container->extract_constants_map(use_inactive); + for (const auto& pair: ret) { + constants_map->emplace(pair.first, pair.second); + } + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update, /* user_managed = */ true); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle) { + return AOTInductorModelContainerUpdateConstantBuffer(container_handle, + constant_map_handle, + /*use_inactive*/ true, + /*validate_full_update*/ true); +} + +AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->free_inactive_constant_buffer(); + }) +} + +AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( + AOTInductorModelContainerHandle container_handle, + bool use_inactive, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_const_fold(use_inactive, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->swap_constant_buffer(); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumInputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_inputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_inputs = container->num_inputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetInputName( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** ret_input_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_input_names = container->input_name(input_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_outputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_outputs = container->num_outputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetOutputName( + AOTInductorModelContainerHandle container_handle, + size_t output_idx, + const char** ret_output_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_output_names = container->output_name(output_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetCallSpec( + AOTInductorModelContainerHandle container_handle, + const char** in_spec, + const char** out_spec) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + *in_spec = container->get_in_spec(); + *out_spec = container->get_out_spec(); + }) +} + +AOTIRuntimeError AOTInductorModelCreate( + AOTInductorModelHandle* model_handle, + AOTInductorConstantMapHandle constant_map_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto constant_array = std::make_shared>(); + auto input_map = reinterpret_cast*>(constant_map_handle); + + auto model = new torch::aot_inductor::AOTInductorModel( + constant_map, + constant_array, + "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models + "" + ); + + if (input_map) { + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + } else { + model->load_constants(); + } + + *model_handle = reinterpret_cast(model); + })} + +AOTIRuntimeError AOTInductorModelRun( + AOTInductorModelHandle model_handle, + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + model->run_impl( + input_handles, + output_handles, + (torch::aot_inductor::DeviceStreamType) nullptr, + nullptr); + }) +} + +AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = 
reinterpret_cast( + model_handle); + delete model; + })} + +AOTIRuntimeError AOTInductorModelGetNumOutputs( + AOTInductorModelHandle model_handle, + size_t* ret_num_outputs) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = reinterpret_cast(model_handle); + *ret_num_outputs = model->num_outputs(); + }) +} + +AOTIRuntimeError AOTInductorModelUpdateConstantsMap( + AOTInductorModelHandle model_handle, + AOTInductorConstantMapHandle constant_map_handle) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto input_map = + reinterpret_cast*>( + constant_map_handle); + + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + model->update_constants_map(std::move(constant_map)); + }) +} + +} // extern "C" + + +#define CUDA_DRIVER_CHECK(EXPR) \ +do { \ + CUresult code = EXPR; \ + const char *msg; \ + CUresult code_get_error = cuGetErrorString(code, &msg); \ + if (code_get_error != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string("invalid error code!")); \ + } \ + if (code != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string(msg)); \ + } \ +} while (0); + +static inline CUfunction loadKernel( + std::string filePath, + const std::string &funcName, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline void launchKernel( + CUfunction func, + uint32_t gridX, + uint32_t gridY, + uint32_t gridZ, + uint32_t numWarps, + uint32_t sharedMemBytes, + void* args[], + cudaStream_t stream) { + CUDA_DRIVER_CHECK(cuLaunchKernel( + func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr + )); +} +CACHE_TORCH_DTYPE(float32); +CACHE_TORCH_DEVICE(cuda); +CACHE_TORCH_LAYOUT(strided); +namespace torch::aot_inductor { +namespace { +class AOTInductorModelKernels : public AOTInductorModelKernelsBase { + public: + CUfunction triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_10{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_14{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_17{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_21{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_24{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_3{nullptr}; + CUfunction 
+
+
+AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
+                                   std::shared_ptr<std::vector<ConstantHandle>> constants_array,
+                                   const std::string& device_str,
+                                   std::optional<std::string> cubin_dir)
+    : AOTInductorModelBase(1,
+                           1,
+                           262,
+                           device_str,
+                           std::move(cubin_dir),
+                           true) {
+    inputs_info_[0].name = "arg262_1";
+    constants_info_[0].name = "mv2_features_0_0_weight";
+    constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[0].offset = 0;
+    constants_info_[0].data_size = 3456;
+    constants_info_[0].from_folded = false;
+    constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[0].shape = {32, 3, 3, 3};
+    constants_info_[0].stride = {27, 9, 3, 1};
+    constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[0].original_fqn = "mv2.features.0.0.weight";
+    constants_info_[1].name = "mv2_features_0_1_weight";
+    constants_info_[1].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[1].offset = 0;
+    constants_info_[1].data_size = 128;
+    constants_info_[1].from_folded = false;
+    constants_info_[1].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[1].shape = {32};
+    constants_info_[1].stride = {1};
+    constants_info_[1].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[1].original_fqn = "mv2.features.0.1.weight";
+    constants_info_[2].name = "mv2_features_0_1_bias";
+    constants_info_[2].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[2].offset = 0;
+    constants_info_[2].data_size = 128;
+    constants_info_[2].from_folded = false;
+    constants_info_[2].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[2].shape = {32};
+    constants_info_[2].stride = {1};
+    constants_info_[2].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[2].original_fqn = "mv2.features.0.1.bias";
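+    // Editorial note (not generated output): each constants_info_ entry
+    // describes one contiguous float32 parameter, so data_size is
+    // numel * sizeof(float). Checking entry 0 above:
+    //   numel  = 32 * 3 * 3 * 3 = 864
+    //   bytes  = 864 * 4 = 3456                       // matches data_size
+    //   stride = {3*3*3, 3*3, 3, 1} = {27, 9, 3, 1}   // row-major contiguous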
+    constants_info_[3].name = "mv2_features_1_conv_0_0_weight";
+    constants_info_[3].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[3].offset = 0;
+    constants_info_[3].data_size = 1152;
+    constants_info_[3].from_folded = false;
+    constants_info_[3].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[3].shape = {32, 1, 3, 3};
+    constants_info_[3].stride = {9, 9, 3, 1};
+    constants_info_[3].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[3].original_fqn = "mv2.features.1.conv.0.0.weight";
+    constants_info_[4].name = "mv2_features_1_conv_0_1_weight";
+    constants_info_[4].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[4].offset = 0;
+    constants_info_[4].data_size = 128;
+    constants_info_[4].from_folded = false;
+    constants_info_[4].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[4].shape = {32};
+    constants_info_[4].stride = {1};
+    constants_info_[4].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[4].original_fqn = "mv2.features.1.conv.0.1.weight";
+    constants_info_[5].name = "mv2_features_1_conv_0_1_bias";
+    constants_info_[5].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[5].offset = 0;
+    constants_info_[5].data_size = 128;
+    constants_info_[5].from_folded = false;
+    constants_info_[5].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[5].shape = {32};
+    constants_info_[5].stride = {1};
+    constants_info_[5].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[5].original_fqn = "mv2.features.1.conv.0.1.bias";
+    constants_info_[6].name = "mv2_features_1_conv_1_weight";
+    constants_info_[6].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[6].offset = 0;
+    constants_info_[6].data_size = 2048;
+    constants_info_[6].from_folded = false;
+    constants_info_[6].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[6].shape = {16, 32, 1, 1};
+    constants_info_[6].stride = {32, 1, 1, 1};
+    constants_info_[6].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[6].original_fqn = "mv2.features.1.conv.1.weight";
+    constants_info_[7].name = "mv2_features_1_conv_2_weight";
+    constants_info_[7].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[7].offset = 0;
+    constants_info_[7].data_size = 64;
+    constants_info_[7].from_folded = false;
+    constants_info_[7].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[7].shape = {16};
+    constants_info_[7].stride = {1};
+    constants_info_[7].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[7].original_fqn = "mv2.features.1.conv.2.weight";
+    constants_info_[8].name = "mv2_features_1_conv_2_bias";
+    constants_info_[8].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[8].offset = 0;
+    constants_info_[8].data_size = 64;
+    constants_info_[8].from_folded = false;
+    constants_info_[8].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[8].shape = {16};
+    constants_info_[8].stride = {1};
+    constants_info_[8].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[8].original_fqn = "mv2.features.1.conv.2.bias";
+    constants_info_[9].name = "mv2_features_2_conv_0_0_weight";
+    constants_info_[9].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[9].offset = 0;
+    constants_info_[9].data_size = 6144;
+    constants_info_[9].from_folded = false;
+    constants_info_[9].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[9].shape = {96, 16, 1, 1};
+    constants_info_[9].stride = {16, 1, 1, 1};
+    constants_info_[9].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[9].original_fqn = "mv2.features.2.conv.0.0.weight";
+    constants_info_[10].name = "mv2_features_2_conv_0_1_weight";
+    constants_info_[10].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[10].offset = 0;
+    constants_info_[10].data_size = 384;
+    constants_info_[10].from_folded = false;
+    constants_info_[10].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[10].shape = {96};
+    constants_info_[10].stride = {1};
+    constants_info_[10].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[10].original_fqn = "mv2.features.2.conv.0.1.weight";
+    constants_info_[11].name = "mv2_features_2_conv_0_1_bias";
+    constants_info_[11].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[11].offset = 0;
+    constants_info_[11].data_size = 384;
+    constants_info_[11].from_folded = false;
+    constants_info_[11].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[11].shape = {96};
+    constants_info_[11].stride = {1};
+    constants_info_[11].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[11].original_fqn = "mv2.features.2.conv.0.1.bias";
+    constants_info_[12].name = "mv2_features_2_conv_1_0_weight";
+    constants_info_[12].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[12].offset = 0;
+    constants_info_[12].data_size = 3456;
+    constants_info_[12].from_folded = false;
+    constants_info_[12].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[12].shape = {96, 1, 3, 3};
+    constants_info_[12].stride = {9, 9, 3, 1};
+    constants_info_[12].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[12].original_fqn = "mv2.features.2.conv.1.0.weight";
+    constants_info_[13].name = "mv2_features_2_conv_1_1_weight";
+    constants_info_[13].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[13].offset = 0;
+    constants_info_[13].data_size = 384;
+    constants_info_[13].from_folded = false;
+    constants_info_[13].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[13].shape = {96};
+    constants_info_[13].stride = {1};
+    constants_info_[13].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[13].original_fqn = "mv2.features.2.conv.1.1.weight";
+    constants_info_[14].name = "mv2_features_2_conv_1_1_bias";
+    constants_info_[14].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[14].offset = 0;
+    constants_info_[14].data_size = 384;
+    constants_info_[14].from_folded = false;
+    constants_info_[14].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[14].shape = {96};
+    constants_info_[14].stride = {1};
+    constants_info_[14].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[14].original_fqn = "mv2.features.2.conv.1.1.bias";
+    constants_info_[15].name = "mv2_features_2_conv_2_weight";
+    constants_info_[15].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[15].offset = 0;
+    constants_info_[15].data_size = 9216;
+    constants_info_[15].from_folded = false;
+    constants_info_[15].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[15].shape = {24, 96, 1, 1};
+    constants_info_[15].stride = {96, 1, 1, 1};
+    constants_info_[15].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[15].original_fqn = "mv2.features.2.conv.2.weight";
+    constants_info_[16].name = "mv2_features_2_conv_3_weight";
+    constants_info_[16].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[16].offset = 0;
+    constants_info_[16].data_size = 96;
+    constants_info_[16].from_folded = false;
+    constants_info_[16].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[16].shape = {24};
+    constants_info_[16].stride = {1};
+    constants_info_[16].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[16].original_fqn = "mv2.features.2.conv.3.weight";
+    constants_info_[17].name = "mv2_features_2_conv_3_bias";
+    constants_info_[17].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[17].offset = 0;
+    constants_info_[17].data_size = 96;
+    constants_info_[17].from_folded = false;
+    constants_info_[17].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[17].shape = {24};
+    constants_info_[17].stride = {1};
+    constants_info_[17].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[17].original_fqn = "mv2.features.2.conv.3.bias";
+    constants_info_[18].name = "mv2_features_3_conv_0_0_weight";
+    constants_info_[18].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[18].offset = 0;
+    constants_info_[18].data_size = 13824;
+    constants_info_[18].from_folded = false;
+    constants_info_[18].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[18].shape = {144, 24, 1, 1};
+    constants_info_[18].stride = {24, 1, 1, 1};
+    constants_info_[18].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[18].original_fqn = "mv2.features.3.conv.0.0.weight";
+    constants_info_[19].name = "mv2_features_3_conv_0_1_weight";
+    constants_info_[19].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[19].offset = 0;
+    constants_info_[19].data_size = 576;
+    constants_info_[19].from_folded = false;
+    constants_info_[19].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[19].shape = {144};
+    constants_info_[19].stride = {1};
+    constants_info_[19].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[19].original_fqn = "mv2.features.3.conv.0.1.weight";
+    constants_info_[20].name = "mv2_features_3_conv_0_1_bias";
+    constants_info_[20].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[20].offset = 0;
+    constants_info_[20].data_size = 576;
+    constants_info_[20].from_folded = false;
+    constants_info_[20].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[20].shape = {144};
+    constants_info_[20].stride = {1};
+    constants_info_[20].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[20].original_fqn = "mv2.features.3.conv.0.1.bias";
+    constants_info_[21].name = "mv2_features_3_conv_1_0_weight";
+    constants_info_[21].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[21].offset = 0;
+    constants_info_[21].data_size = 5184;
+    constants_info_[21].from_folded = false;
+    constants_info_[21].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[21].shape = {144, 1, 3, 3};
+    constants_info_[21].stride = {9, 9, 3, 1};
+    constants_info_[21].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[21].original_fqn = "mv2.features.3.conv.1.0.weight";
+    constants_info_[22].name = "mv2_features_3_conv_1_1_weight";
+    constants_info_[22].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[22].offset = 0;
+    constants_info_[22].data_size = 576;
+    constants_info_[22].from_folded = false;
+    constants_info_[22].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[22].shape = {144};
+    constants_info_[22].stride = {1};
+    constants_info_[22].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[22].original_fqn = "mv2.features.3.conv.1.1.weight";
+    constants_info_[23].name = "mv2_features_3_conv_1_1_bias";
+    constants_info_[23].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[23].offset = 0;
+    constants_info_[23].data_size = 576;
+    constants_info_[23].from_folded = false;
+    constants_info_[23].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[23].shape = {144};
+    constants_info_[23].stride = {1};
+    constants_info_[23].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[23].original_fqn = "mv2.features.3.conv.1.1.bias";
+    constants_info_[24].name = "mv2_features_3_conv_2_weight";
+    constants_info_[24].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[24].offset = 0;
+    constants_info_[24].data_size = 13824;
+    constants_info_[24].from_folded = false;
+    constants_info_[24].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[24].shape = {24, 144, 1, 1};
+    constants_info_[24].stride = {144, 1, 1, 1};
+    constants_info_[24].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[24].original_fqn = "mv2.features.3.conv.2.weight";
+    constants_info_[25].name = "mv2_features_3_conv_3_weight";
+    constants_info_[25].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[25].offset = 0;
+    constants_info_[25].data_size = 96;
+    constants_info_[25].from_folded = false;
+    constants_info_[25].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[25].shape = {24};
+    constants_info_[25].stride = {1};
+    constants_info_[25].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[25].original_fqn = "mv2.features.3.conv.3.weight";
+    constants_info_[26].name = "mv2_features_3_conv_3_bias";
+    constants_info_[26].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[26].offset = 0;
+    constants_info_[26].data_size = 96;
+    constants_info_[26].from_folded = false;
+    constants_info_[26].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[26].shape = {24};
+    constants_info_[26].stride = {1};
+    constants_info_[26].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[26].original_fqn = "mv2.features.3.conv.3.bias";
+    constants_info_[27].name = "mv2_features_4_conv_0_0_weight";
+    constants_info_[27].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[27].offset = 0;
+    constants_info_[27].data_size = 13824;
+    constants_info_[27].from_folded = false;
+    constants_info_[27].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[27].shape = {144, 24, 1, 1};
+    constants_info_[27].stride = {24, 1, 1, 1};
+    constants_info_[27].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[27].original_fqn = "mv2.features.4.conv.0.0.weight";
+    constants_info_[28].name = "mv2_features_4_conv_0_1_weight";
+    constants_info_[28].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[28].offset = 0;
+    constants_info_[28].data_size = 576;
+    constants_info_[28].from_folded = false;
+    constants_info_[28].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[28].shape = {144};
+    constants_info_[28].stride = {1};
+    constants_info_[28].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[28].original_fqn = "mv2.features.4.conv.0.1.weight";
+    constants_info_[29].name = "mv2_features_4_conv_0_1_bias";
+    constants_info_[29].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[29].offset = 0;
+    constants_info_[29].data_size = 576;
+    constants_info_[29].from_folded = false;
+    constants_info_[29].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[29].shape = {144};
+    constants_info_[29].stride = {1};
+    constants_info_[29].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[29].original_fqn = "mv2.features.4.conv.0.1.bias";
+    constants_info_[30].name = "mv2_features_4_conv_1_0_weight";
+    constants_info_[30].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[30].offset = 0;
+    constants_info_[30].data_size = 5184;
+    constants_info_[30].from_folded = false;
+    constants_info_[30].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[30].shape = {144, 1, 3, 3};
+    constants_info_[30].stride = {9, 9, 3, 1};
+    constants_info_[30].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[30].original_fqn = "mv2.features.4.conv.1.0.weight";
+    constants_info_[31].name = "mv2_features_4_conv_1_1_weight";
+    constants_info_[31].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[31].offset = 0;
+    constants_info_[31].data_size = 576;
+    constants_info_[31].from_folded = false;
+    constants_info_[31].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[31].shape = {144};
+    constants_info_[31].stride = {1};
+    constants_info_[31].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[31].original_fqn = "mv2.features.4.conv.1.1.weight";
+    constants_info_[32].name = "mv2_features_4_conv_1_1_bias";
+    constants_info_[32].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[32].offset = 0;
+    constants_info_[32].data_size = 576;
+    constants_info_[32].from_folded = false;
+    constants_info_[32].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[32].shape = {144};
+    constants_info_[32].stride = {1};
+    constants_info_[32].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[32].original_fqn = "mv2.features.4.conv.1.1.bias";
+    constants_info_[33].name = "mv2_features_4_conv_2_weight";
+    constants_info_[33].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[33].offset = 0;
+    constants_info_[33].data_size = 18432;
+    constants_info_[33].from_folded = false;
+    constants_info_[33].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[33].shape = {32, 144, 1, 1};
+    constants_info_[33].stride = {144, 1, 1, 1};
+    constants_info_[33].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[33].original_fqn = "mv2.features.4.conv.2.weight";
+    constants_info_[34].name = "mv2_features_4_conv_3_weight";
+    constants_info_[34].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[34].offset = 0;
+    constants_info_[34].data_size = 128;
+    constants_info_[34].from_folded = false;
+    constants_info_[34].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[34].shape = {32};
+    constants_info_[34].stride = {1};
+    constants_info_[34].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[34].original_fqn = "mv2.features.4.conv.3.weight";
+    constants_info_[35].name = "mv2_features_4_conv_3_bias";
+    constants_info_[35].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[35].offset = 0;
+    constants_info_[35].data_size = 128;
+    constants_info_[35].from_folded = false;
+    constants_info_[35].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[35].shape = {32};
+    constants_info_[35].stride = {1};
+    constants_info_[35].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[35].original_fqn = "mv2.features.4.conv.3.bias";
+    constants_info_[36].name = "mv2_features_5_conv_0_0_weight";
+    constants_info_[36].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[36].offset = 0;
+    constants_info_[36].data_size = 24576;
+    constants_info_[36].from_folded = false;
+    constants_info_[36].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[36].shape = {192, 32, 1, 1};
+    constants_info_[36].stride = {32, 1, 1, 1};
+    constants_info_[36].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[36].original_fqn = "mv2.features.5.conv.0.0.weight";
+    constants_info_[37].name = "mv2_features_5_conv_0_1_weight";
+    constants_info_[37].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[37].offset = 0;
+    constants_info_[37].data_size = 768;
+    constants_info_[37].from_folded = false;
+    constants_info_[37].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[37].shape = {192};
+    constants_info_[37].stride = {1};
+    constants_info_[37].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[37].original_fqn = "mv2.features.5.conv.0.1.weight";
+    constants_info_[38].name = "mv2_features_5_conv_0_1_bias";
+    constants_info_[38].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[38].offset = 0;
+    constants_info_[38].data_size = 768;
+    constants_info_[38].from_folded = false;
+    constants_info_[38].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[38].shape = {192};
+    constants_info_[38].stride = {1};
+    constants_info_[38].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[38].original_fqn = "mv2.features.5.conv.0.1.bias";
+    constants_info_[39].name = "mv2_features_5_conv_1_0_weight";
+    constants_info_[39].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[39].offset = 0;
+    constants_info_[39].data_size = 6912;
+    constants_info_[39].from_folded = false;
+    constants_info_[39].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[39].shape = {192, 1, 3, 3};
+    constants_info_[39].stride = {9, 9, 3, 1};
+    constants_info_[39].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[39].original_fqn = "mv2.features.5.conv.1.0.weight";
+    constants_info_[40].name = "mv2_features_5_conv_1_1_weight";
+    constants_info_[40].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[40].offset = 0;
+    constants_info_[40].data_size = 768;
+    constants_info_[40].from_folded = false;
+    constants_info_[40].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[40].shape = {192};
+    constants_info_[40].stride = {1};
+    constants_info_[40].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[40].original_fqn = "mv2.features.5.conv.1.1.weight";
+    constants_info_[41].name = "mv2_features_5_conv_1_1_bias";
+    constants_info_[41].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[41].offset = 0;
+    constants_info_[41].data_size = 768;
+    constants_info_[41].from_folded = false;
+    constants_info_[41].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[41].shape = {192};
+    constants_info_[41].stride = {1};
+    constants_info_[41].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[41].original_fqn = "mv2.features.5.conv.1.1.bias";
+    constants_info_[42].name = "mv2_features_5_conv_2_weight";
+    constants_info_[42].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[42].offset = 0;
+    constants_info_[42].data_size = 24576;
+    constants_info_[42].from_folded = false;
+    constants_info_[42].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[42].shape = {32, 192, 1, 1};
+    constants_info_[42].stride = {192, 1, 1, 1};
+    constants_info_[42].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[42].original_fqn = "mv2.features.5.conv.2.weight";
+    constants_info_[43].name = "mv2_features_5_conv_3_weight";
+    constants_info_[43].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[43].offset = 0;
+    constants_info_[43].data_size = 128;
+    constants_info_[43].from_folded = false;
+    constants_info_[43].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[43].shape = {32};
+    constants_info_[43].stride = {1};
+    constants_info_[43].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[43].original_fqn = "mv2.features.5.conv.3.weight";
+    constants_info_[44].name = "mv2_features_5_conv_3_bias";
+    constants_info_[44].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[44].offset = 0;
+    constants_info_[44].data_size = 128;
+    constants_info_[44].from_folded = false;
+    constants_info_[44].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[44].shape = {32};
+    constants_info_[44].stride = {1};
+    constants_info_[44].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[44].original_fqn = "mv2.features.5.conv.3.bias";
+    constants_info_[45].name = "mv2_features_6_conv_0_0_weight";
+    constants_info_[45].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[45].offset = 0;
+    constants_info_[45].data_size = 24576;
+    constants_info_[45].from_folded = false;
+    constants_info_[45].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[45].shape = {192, 32, 1, 1};
+    constants_info_[45].stride = {32, 1, 1, 1};
+    constants_info_[45].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[45].original_fqn = "mv2.features.6.conv.0.0.weight";
+    constants_info_[46].name = "mv2_features_6_conv_0_1_weight";
+    constants_info_[46].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[46].offset = 0;
+    constants_info_[46].data_size = 768;
+    constants_info_[46].from_folded = false;
+    constants_info_[46].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[46].shape = {192};
+    constants_info_[46].stride = {1};
+    constants_info_[46].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[46].original_fqn = "mv2.features.6.conv.0.1.weight";
+    constants_info_[47].name = "mv2_features_6_conv_0_1_bias";
+    constants_info_[47].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[47].offset = 0;
+    constants_info_[47].data_size = 768;
+    constants_info_[47].from_folded = false;
+    constants_info_[47].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[47].shape = {192};
+    constants_info_[47].stride = {1};
+    constants_info_[47].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[47].original_fqn = "mv2.features.6.conv.0.1.bias";
+    constants_info_[48].name = "mv2_features_6_conv_1_0_weight";
+    constants_info_[48].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[48].offset = 0;
+    constants_info_[48].data_size = 6912;
+    constants_info_[48].from_folded = false;
+    constants_info_[48].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[48].shape = {192, 1, 3, 3};
+    constants_info_[48].stride = {9, 9, 3, 1};
+    constants_info_[48].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[48].original_fqn = "mv2.features.6.conv.1.0.weight";
+    constants_info_[49].name = "mv2_features_6_conv_1_1_weight";
+    constants_info_[49].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[49].offset = 0;
+    constants_info_[49].data_size = 768;
+    constants_info_[49].from_folded = false;
+    constants_info_[49].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[49].shape = {192};
+    constants_info_[49].stride = {1};
+    constants_info_[49].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[49].original_fqn = "mv2.features.6.conv.1.1.weight";
+    constants_info_[50].name = "mv2_features_6_conv_1_1_bias";
+    constants_info_[50].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[50].offset = 0;
+    constants_info_[50].data_size = 768;
+    constants_info_[50].from_folded = false;
+    constants_info_[50].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[50].shape = {192};
+    constants_info_[50].stride = {1};
+    constants_info_[50].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[50].original_fqn = "mv2.features.6.conv.1.1.bias";
+    constants_info_[51].name = "mv2_features_6_conv_2_weight";
+    constants_info_[51].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[51].offset = 0;
+    constants_info_[51].data_size = 24576;
+    constants_info_[51].from_folded = false;
+    constants_info_[51].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[51].shape = {32, 192, 1, 1};
+    constants_info_[51].stride = {192, 1, 1, 1};
+    constants_info_[51].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[51].original_fqn = "mv2.features.6.conv.2.weight";
+    constants_info_[52].name = "mv2_features_6_conv_3_weight";
+    constants_info_[52].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[52].offset = 0;
+    constants_info_[52].data_size = 128;
+    constants_info_[52].from_folded = false;
+    constants_info_[52].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[52].shape = {32};
+    constants_info_[52].stride = {1};
+    constants_info_[52].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[52].original_fqn = "mv2.features.6.conv.3.weight";
+    constants_info_[53].name = "mv2_features_6_conv_3_bias";
+    constants_info_[53].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[53].offset = 0;
+    constants_info_[53].data_size = 128;
+    constants_info_[53].from_folded = false;
+    constants_info_[53].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[53].shape = {32};
+    constants_info_[53].stride = {1};
+    constants_info_[53].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[53].original_fqn = "mv2.features.6.conv.3.bias";
+    constants_info_[54].name = "mv2_features_7_conv_0_0_weight";
+    constants_info_[54].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[54].offset = 0;
+    constants_info_[54].data_size = 24576;
+    constants_info_[54].from_folded = false;
+    constants_info_[54].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[54].shape = {192, 32, 1, 1};
+    constants_info_[54].stride = {32, 1, 1, 1};
+    constants_info_[54].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[54].original_fqn = "mv2.features.7.conv.0.0.weight";
+    constants_info_[55].name = "mv2_features_7_conv_0_1_weight";
+    constants_info_[55].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[55].offset = 0;
+    constants_info_[55].data_size = 768;
+    constants_info_[55].from_folded = false;
+    constants_info_[55].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[55].shape = {192};
+    constants_info_[55].stride = {1};
+    constants_info_[55].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[55].original_fqn = "mv2.features.7.conv.0.1.weight";
+    constants_info_[56].name = "mv2_features_7_conv_0_1_bias";
+    constants_info_[56].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[56].offset = 0;
+    constants_info_[56].data_size = 768;
+    constants_info_[56].from_folded = false;
+    constants_info_[56].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[56].shape = {192};
+    constants_info_[56].stride = {1};
+    constants_info_[56].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[56].original_fqn = "mv2.features.7.conv.0.1.bias";
+    constants_info_[57].name = "mv2_features_7_conv_1_0_weight";
+    constants_info_[57].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[57].offset = 0;
+    constants_info_[57].data_size = 6912;
+    constants_info_[57].from_folded = false;
+    constants_info_[57].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[57].shape = {192, 1, 3, 3};
+    constants_info_[57].stride = {9, 9, 3, 1};
+    constants_info_[57].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[57].original_fqn = "mv2.features.7.conv.1.0.weight";
+    constants_info_[58].name = "mv2_features_7_conv_1_1_weight";
+    constants_info_[58].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[58].offset = 0;
+    constants_info_[58].data_size = 768;
+    constants_info_[58].from_folded = false;
+    constants_info_[58].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[58].shape = {192};
+    constants_info_[58].stride = {1};
+    constants_info_[58].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[58].original_fqn = "mv2.features.7.conv.1.1.weight";
+    constants_info_[59].name = "mv2_features_7_conv_1_1_bias";
+    constants_info_[59].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[59].offset = 0;
+    constants_info_[59].data_size = 768;
+    constants_info_[59].from_folded = false;
+    constants_info_[59].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[59].shape = {192};
+    constants_info_[59].stride = {1};
+    constants_info_[59].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[59].original_fqn = "mv2.features.7.conv.1.1.bias";
+    constants_info_[60].name = "mv2_features_7_conv_2_weight";
+    constants_info_[60].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[60].offset = 0;
+    constants_info_[60].data_size = 49152;
+    constants_info_[60].from_folded = false;
+    constants_info_[60].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[60].shape = {64, 192, 1, 1};
+    constants_info_[60].stride = {192, 1, 1, 1};
+    constants_info_[60].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[60].original_fqn = "mv2.features.7.conv.2.weight";
+    constants_info_[61].name = "mv2_features_7_conv_3_weight";
+    constants_info_[61].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[61].offset = 0;
+    constants_info_[61].data_size = 256;
+    constants_info_[61].from_folded = false;
+    constants_info_[61].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[61].shape = {64};
+    constants_info_[61].stride = {1};
+    constants_info_[61].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[61].original_fqn = "mv2.features.7.conv.3.weight";
+    constants_info_[62].name = "mv2_features_7_conv_3_bias";
+    constants_info_[62].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[62].offset = 0;
+    constants_info_[62].data_size = 256;
+    constants_info_[62].from_folded = false;
+    constants_info_[62].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[62].shape = {64};
+    constants_info_[62].stride = {1};
+    constants_info_[62].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[62].original_fqn = "mv2.features.7.conv.3.bias";
+    constants_info_[63].name = "mv2_features_8_conv_0_0_weight";
+    constants_info_[63].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[63].offset = 0;
+    constants_info_[63].data_size = 98304;
+    constants_info_[63].from_folded = false;
+    constants_info_[63].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[63].shape = {384, 64, 1, 1};
+    constants_info_[63].stride = {64, 1, 1, 1};
+    constants_info_[63].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[63].original_fqn = "mv2.features.8.conv.0.0.weight";
+    constants_info_[64].name = "mv2_features_8_conv_0_1_weight";
+    constants_info_[64].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[64].offset = 0;
+    constants_info_[64].data_size = 1536;
+    constants_info_[64].from_folded = false;
+    constants_info_[64].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[64].shape = {384};
+    constants_info_[64].stride = {1};
+    constants_info_[64].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[64].original_fqn = "mv2.features.8.conv.0.1.weight";
+    constants_info_[65].name = "mv2_features_8_conv_0_1_bias";
+    constants_info_[65].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[65].offset = 0;
+    constants_info_[65].data_size = 1536;
+    constants_info_[65].from_folded = false;
+    constants_info_[65].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[65].shape = {384};
+    constants_info_[65].stride = {1};
+    constants_info_[65].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[65].original_fqn = "mv2.features.8.conv.0.1.bias";
+    constants_info_[66].name = "mv2_features_8_conv_1_0_weight";
+    constants_info_[66].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[66].offset = 0;
+    constants_info_[66].data_size = 13824;
+    constants_info_[66].from_folded = false;
+    constants_info_[66].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[66].shape = {384, 1, 3, 3};
+    constants_info_[66].stride = {9, 9, 3, 1};
+    constants_info_[66].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[66].original_fqn = "mv2.features.8.conv.1.0.weight";
+    constants_info_[67].name = "mv2_features_8_conv_1_1_weight";
+    constants_info_[67].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[67].offset = 0;
+    constants_info_[67].data_size = 1536;
+    constants_info_[67].from_folded = false;
+    constants_info_[67].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[67].shape = {384};
+    constants_info_[67].stride = {1};
+    constants_info_[67].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[67].original_fqn = "mv2.features.8.conv.1.1.weight";
+    constants_info_[68].name = "mv2_features_8_conv_1_1_bias";
+    constants_info_[68].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[68].offset = 0;
+    constants_info_[68].data_size = 1536;
+    constants_info_[68].from_folded = false;
+    constants_info_[68].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[68].shape = {384};
+    constants_info_[68].stride = {1};
+    constants_info_[68].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[68].original_fqn = "mv2.features.8.conv.1.1.bias";
+    constants_info_[69].name = "mv2_features_8_conv_2_weight";
+    constants_info_[69].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[69].offset = 0;
+    constants_info_[69].data_size = 98304;
+    constants_info_[69].from_folded = false;
+    constants_info_[69].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[69].shape = {64, 384, 1, 1};
+    constants_info_[69].stride = {384, 1, 1, 1};
+    constants_info_[69].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[69].original_fqn = "mv2.features.8.conv.2.weight";
+    constants_info_[70].name = "mv2_features_8_conv_3_weight";
+    constants_info_[70].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[70].offset = 0;
+    constants_info_[70].data_size = 256;
+    constants_info_[70].from_folded = false;
+    constants_info_[70].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[70].shape = {64};
+    constants_info_[70].stride = {1};
+    constants_info_[70].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[70].original_fqn = "mv2.features.8.conv.3.weight";
+    constants_info_[71].name = "mv2_features_8_conv_3_bias";
+    constants_info_[71].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[71].offset = 0;
+    constants_info_[71].data_size = 256;
+    constants_info_[71].from_folded = false;
+    constants_info_[71].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[71].shape = {64};
+    constants_info_[71].stride = {1};
+    constants_info_[71].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[71].original_fqn = "mv2.features.8.conv.3.bias";
+    constants_info_[72].name = "mv2_features_9_conv_0_0_weight";
+    constants_info_[72].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[72].offset = 0;
+    constants_info_[72].data_size = 98304;
+    constants_info_[72].from_folded = false;
+    constants_info_[72].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[72].shape = {384, 64, 1, 1};
+    constants_info_[72].stride = {64, 1, 1, 1};
+    constants_info_[72].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[72].original_fqn = "mv2.features.9.conv.0.0.weight";
+    constants_info_[73].name = "mv2_features_9_conv_0_1_weight";
+    constants_info_[73].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[73].offset = 0;
+    constants_info_[73].data_size = 1536;
+    constants_info_[73].from_folded = false;
+    constants_info_[73].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[73].shape = {384};
+    constants_info_[73].stride = {1};
+    constants_info_[73].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[73].original_fqn = "mv2.features.9.conv.0.1.weight";
+    constants_info_[74].name = "mv2_features_9_conv_0_1_bias";
+    constants_info_[74].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[74].offset = 0;
+    constants_info_[74].data_size = 1536;
+    constants_info_[74].from_folded = false;
+    constants_info_[74].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[74].shape = {384};
+    constants_info_[74].stride = {1};
+    constants_info_[74].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[74].original_fqn = "mv2.features.9.conv.0.1.bias";
+    constants_info_[75].name = "mv2_features_9_conv_1_0_weight";
+    constants_info_[75].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[75].offset = 0;
+    constants_info_[75].data_size = 13824;
+    constants_info_[75].from_folded = false;
+    constants_info_[75].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[75].shape = {384, 1, 3, 3};
+    constants_info_[75].stride = {9, 9, 3, 1};
+    constants_info_[75].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[75].original_fqn = "mv2.features.9.conv.1.0.weight";
+    constants_info_[76].name = "mv2_features_9_conv_1_1_weight";
+    constants_info_[76].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[76].offset = 0;
+    constants_info_[76].data_size = 1536;
+    constants_info_[76].from_folded = false;
+    constants_info_[76].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[76].shape = {384};
+    constants_info_[76].stride = {1};
+    constants_info_[76].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[76].original_fqn = "mv2.features.9.conv.1.1.weight";
+    constants_info_[77].name = "mv2_features_9_conv_1_1_bias";
+    constants_info_[77].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[77].offset = 0;
+    constants_info_[77].data_size = 1536;
+    constants_info_[77].from_folded = false;
+    constants_info_[77].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[77].shape = {384};
+    constants_info_[77].stride = {1};
+    constants_info_[77].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[77].original_fqn = "mv2.features.9.conv.1.1.bias";
+    constants_info_[78].name = "mv2_features_9_conv_2_weight";
+    constants_info_[78].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[78].offset = 0;
+    constants_info_[78].data_size = 98304;
+    constants_info_[78].from_folded = false;
+    constants_info_[78].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[78].shape = {64, 384, 1, 1};
+    constants_info_[78].stride = {384, 1, 1, 1};
+    constants_info_[78].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[78].original_fqn = "mv2.features.9.conv.2.weight";
+    constants_info_[79].name = "mv2_features_9_conv_3_weight";
+    constants_info_[79].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[79].offset = 0;
+    constants_info_[79].data_size = 256;
+    constants_info_[79].from_folded = false;
+    constants_info_[79].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[79].shape = {64};
+    constants_info_[79].stride = {1};
+    constants_info_[79].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[79].original_fqn = "mv2.features.9.conv.3.weight";
+    constants_info_[80].name = "mv2_features_9_conv_3_bias";
+    constants_info_[80].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[80].offset = 0;
+    constants_info_[80].data_size = 256;
+    constants_info_[80].from_folded = false;
+    constants_info_[80].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[80].shape = {64};
+    constants_info_[80].stride = {1};
+    constants_info_[80].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[80].original_fqn = "mv2.features.9.conv.3.bias";
+    constants_info_[81].name = "mv2_features_10_conv_0_0_weight";
+    constants_info_[81].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[81].offset = 0;
+    constants_info_[81].data_size = 98304;
+    constants_info_[81].from_folded = false;
+    constants_info_[81].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[81].shape = {384, 64, 1, 1};
+    constants_info_[81].stride = {64, 1, 1, 1};
+    constants_info_[81].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[81].original_fqn = "mv2.features.10.conv.0.0.weight";
+    constants_info_[82].name = "mv2_features_10_conv_0_1_weight";
+    constants_info_[82].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[82].offset = 0;
+    constants_info_[82].data_size = 1536;
+    constants_info_[82].from_folded = false;
+    constants_info_[82].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[82].shape = {384};
+    constants_info_[82].stride = {1};
+    constants_info_[82].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[82].original_fqn = "mv2.features.10.conv.0.1.weight";
+    constants_info_[83].name = "mv2_features_10_conv_0_1_bias";
+    constants_info_[83].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[83].offset = 0;
+    constants_info_[83].data_size = 1536;
+    constants_info_[83].from_folded = false;
+    constants_info_[83].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[83].shape = {384};
+    constants_info_[83].stride = {1};
+    constants_info_[83].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[83].original_fqn = "mv2.features.10.conv.0.1.bias";
+    constants_info_[84].name = "mv2_features_10_conv_1_0_weight";
+    constants_info_[84].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[84].offset = 0;
+    constants_info_[84].data_size = 13824;
+    constants_info_[84].from_folded = false;
+    constants_info_[84].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[84].shape = {384, 1, 3, 3};
+    constants_info_[84].stride = {9, 9, 3, 1};
+    constants_info_[84].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[84].original_fqn = "mv2.features.10.conv.1.0.weight";
+    constants_info_[85].name = "mv2_features_10_conv_1_1_weight";
+    constants_info_[85].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[85].offset = 0;
+    constants_info_[85].data_size = 1536;
+    constants_info_[85].from_folded = false;
+    constants_info_[85].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[85].shape = {384};
+    constants_info_[85].stride = {1};
+    constants_info_[85].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[85].original_fqn = "mv2.features.10.conv.1.1.weight";
+    constants_info_[86].name = "mv2_features_10_conv_1_1_bias";
+    constants_info_[86].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[86].offset = 0;
+    constants_info_[86].data_size = 1536;
+    constants_info_[86].from_folded = false;
+    constants_info_[86].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[86].shape = {384};
+    constants_info_[86].stride = {1};
+    constants_info_[86].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[86].original_fqn = "mv2.features.10.conv.1.1.bias";
+    constants_info_[87].name = "mv2_features_10_conv_2_weight";
+    constants_info_[87].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[87].offset = 0;
+    constants_info_[87].data_size = 98304;
+    constants_info_[87].from_folded = false;
+    constants_info_[87].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[87].shape = {64, 384, 1, 1};
+    constants_info_[87].stride = {384, 1, 1, 1};
+    constants_info_[87].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[87].original_fqn = "mv2.features.10.conv.2.weight";
+    constants_info_[88].name = "mv2_features_10_conv_3_weight";
+    constants_info_[88].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[88].offset = 0;
+    constants_info_[88].data_size = 256;
+    constants_info_[88].from_folded = false;
+    constants_info_[88].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[88].shape = {64};
+    constants_info_[88].stride = {1};
+    constants_info_[88].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[88].original_fqn = "mv2.features.10.conv.3.weight";
+    constants_info_[89].name = "mv2_features_10_conv_3_bias";
+    constants_info_[89].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[89].offset = 0;
+    constants_info_[89].data_size = 256;
+    constants_info_[89].from_folded = false;
+    constants_info_[89].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[89].shape = {64};
+    constants_info_[89].stride = {1};
+    constants_info_[89].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[89].original_fqn = "mv2.features.10.conv.3.bias";
+    constants_info_[90].name = "mv2_features_11_conv_0_0_weight";
+    constants_info_[90].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[90].offset = 0;
+    constants_info_[90].data_size = 98304;
+    constants_info_[90].from_folded = false;
+    constants_info_[90].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[90].shape = {384, 64, 1, 1};
+    constants_info_[90].stride = {64, 1, 1, 1};
+    constants_info_[90].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[90].original_fqn = "mv2.features.11.conv.0.0.weight";
+    constants_info_[91].name = "mv2_features_11_conv_0_1_weight";
+    constants_info_[91].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[91].offset = 0;
+    constants_info_[91].data_size = 1536;
+    constants_info_[91].from_folded = false;
+    constants_info_[91].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[91].shape = {384};
+    constants_info_[91].stride = {1};
+    constants_info_[91].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[91].original_fqn = "mv2.features.11.conv.0.1.weight";
+    constants_info_[92].name = "mv2_features_11_conv_0_1_bias";
+    constants_info_[92].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[92].offset = 0;
+    constants_info_[92].data_size = 1536;
+    constants_info_[92].from_folded = false;
+    constants_info_[92].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[92].shape = {384};
+    constants_info_[92].stride = {1};
+    constants_info_[92].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[92].original_fqn = "mv2.features.11.conv.0.1.bias";
+    constants_info_[93].name = "mv2_features_11_conv_1_0_weight";
+    constants_info_[93].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[93].offset = 0;
+    constants_info_[93].data_size = 13824;
+    constants_info_[93].from_folded = false;
+    constants_info_[93].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[93].shape = {384, 1, 3, 3};
+    constants_info_[93].stride = {9, 9, 3, 1};
+    constants_info_[93].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[93].original_fqn = "mv2.features.11.conv.1.0.weight";
+    constants_info_[94].name = "mv2_features_11_conv_1_1_weight";
+    constants_info_[94].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[94].offset = 0;
+    constants_info_[94].data_size = 1536;
+    constants_info_[94].from_folded = false;
+    constants_info_[94].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[94].shape = {384};
+    constants_info_[94].stride = {1};
+    constants_info_[94].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[94].original_fqn = "mv2.features.11.conv.1.1.weight";
+    constants_info_[95].name = "mv2_features_11_conv_1_1_bias";
+    constants_info_[95].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[95].offset = 0;
+    constants_info_[95].data_size = 1536;
+    constants_info_[95].from_folded = false;
+    constants_info_[95].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[95].shape = {384};
+    constants_info_[95].stride = {1};
+    constants_info_[95].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[95].original_fqn = "mv2.features.11.conv.1.1.bias";
+    constants_info_[96].name = "mv2_features_11_conv_2_weight";
+    constants_info_[96].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[96].offset = 0;
+    constants_info_[96].data_size = 147456;
+    constants_info_[96].from_folded = false;
+    constants_info_[96].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[96].shape = {96, 384, 1, 1};
+    constants_info_[96].stride = {384, 1, 1, 1};
+    constants_info_[96].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[96].original_fqn = "mv2.features.11.conv.2.weight";
+    constants_info_[97].name = "mv2_features_11_conv_3_weight";
+    constants_info_[97].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[97].offset = 0;
+    constants_info_[97].data_size = 384;
+    constants_info_[97].from_folded = false;
+    constants_info_[97].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[97].shape = {96};
+    constants_info_[97].stride = {1};
+    constants_info_[97].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[97].original_fqn = "mv2.features.11.conv.3.weight";
+    constants_info_[98].name = "mv2_features_11_conv_3_bias";
+    constants_info_[98].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[98].offset = 0;
+    constants_info_[98].data_size = 384;
+    constants_info_[98].from_folded = false;
+    constants_info_[98].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[98].shape = {96};
+    constants_info_[98].stride = {1};
+    constants_info_[98].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[98].original_fqn = "mv2.features.11.conv.3.bias";
+    constants_info_[99].name = "mv2_features_12_conv_0_0_weight";
+    constants_info_[99].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[99].offset = 0;
+    constants_info_[99].data_size = 221184;
+    constants_info_[99].from_folded = false;
+    constants_info_[99].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[99].shape = {576, 96, 1, 1};
+    constants_info_[99].stride = {96, 1, 1, 1};
+    constants_info_[99].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[99].original_fqn = "mv2.features.12.conv.0.0.weight";
+    constants_info_[100].name = "mv2_features_12_conv_0_1_weight";
+    constants_info_[100].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[100].offset = 0;
+    constants_info_[100].data_size = 2304;
+    constants_info_[100].from_folded = false;
+    constants_info_[100].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[100].shape = {576};
+    constants_info_[100].stride = {1};
+    constants_info_[100].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[100].original_fqn = "mv2.features.12.conv.0.1.weight";
+    constants_info_[101].name = "mv2_features_12_conv_0_1_bias";
+    constants_info_[101].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[101].offset = 0;
+    constants_info_[101].data_size = 2304;
+    constants_info_[101].from_folded = false;
+    constants_info_[101].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[101].shape = {576};
+    constants_info_[101].stride = {1};
+    constants_info_[101].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[101].original_fqn = "mv2.features.12.conv.0.1.bias";
+    constants_info_[102].name = "mv2_features_12_conv_1_0_weight";
+    constants_info_[102].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[102].offset = 0;
+    constants_info_[102].data_size = 20736;
+    constants_info_[102].from_folded = false;
+    constants_info_[102].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[102].shape = {576, 1, 3, 3};
+    constants_info_[102].stride = {9, 9, 3, 1};
+    constants_info_[102].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[102].original_fqn = "mv2.features.12.conv.1.0.weight";
+    constants_info_[103].name = "mv2_features_12_conv_1_1_weight";
+    constants_info_[103].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[103].offset = 0;
+    constants_info_[103].data_size = 2304;
+    constants_info_[103].from_folded = false;
+    constants_info_[103].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[103].shape = {576};
+    constants_info_[103].stride = {1};
+    constants_info_[103].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[103].original_fqn = "mv2.features.12.conv.1.1.weight";
+    constants_info_[104].name = "mv2_features_12_conv_1_1_bias";
+    constants_info_[104].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[104].offset = 0;
+    constants_info_[104].data_size = 2304;
+    constants_info_[104].from_folded = false;
+    constants_info_[104].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[104].shape = {576};
+    constants_info_[104].stride = {1};
+    constants_info_[104].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[104].original_fqn = "mv2.features.12.conv.1.1.bias";
+    constants_info_[105].name = "mv2_features_12_conv_2_weight";
+    constants_info_[105].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[105].offset = 0;
+    constants_info_[105].data_size = 221184;
+    constants_info_[105].from_folded = false;
+    constants_info_[105].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[105].shape = {96, 576, 1, 1};
+    constants_info_[105].stride = {576, 1, 1, 1};
+    constants_info_[105].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[105].original_fqn = "mv2.features.12.conv.2.weight";
+    constants_info_[106].name = "mv2_features_12_conv_3_weight";
+    constants_info_[106].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[106].offset = 0;
+    constants_info_[106].data_size = 384;
+    constants_info_[106].from_folded = false;
+    constants_info_[106].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[106].shape = {96};
+    constants_info_[106].stride = {1};
+    constants_info_[106].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[106].original_fqn = "mv2.features.12.conv.3.weight";
+    constants_info_[107].name = "mv2_features_12_conv_3_bias";
+    constants_info_[107].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[107].offset = 0;
+    constants_info_[107].data_size = 384;
+    constants_info_[107].from_folded = false;
+    constants_info_[107].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[107].shape = {96};
+    constants_info_[107].stride = {1};
+    constants_info_[107].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[107].original_fqn = "mv2.features.12.conv.3.bias";
+    constants_info_[108].name = "mv2_features_13_conv_0_0_weight";
+    constants_info_[108].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[108].offset = 0;
+    constants_info_[108].data_size = 221184;
+    constants_info_[108].from_folded = false;
+    constants_info_[108].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[108].shape = {576, 96, 1, 1};
+    constants_info_[108].stride = {96, 1, 1, 1};
+    constants_info_[108].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[108].original_fqn = "mv2.features.13.conv.0.0.weight";
+    constants_info_[109].name = "mv2_features_13_conv_0_1_weight";
+    constants_info_[109].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[109].offset = 0;
+    constants_info_[109].data_size = 2304;
+    constants_info_[109].from_folded = false;
+    constants_info_[109].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[109].shape = {576};
+    constants_info_[109].stride = {1};
+    constants_info_[109].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[109].original_fqn = "mv2.features.13.conv.0.1.weight";
+    constants_info_[110].name = "mv2_features_13_conv_0_1_bias";
+    constants_info_[110].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[110].offset = 0;
+    constants_info_[110].data_size = 2304;
+    constants_info_[110].from_folded = false;
+    constants_info_[110].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[110].shape = {576};
+    constants_info_[110].stride = {1};
+    constants_info_[110].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[110].original_fqn = "mv2.features.13.conv.0.1.bias";
+    constants_info_[111].name = "mv2_features_13_conv_1_0_weight";
+    constants_info_[111].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[111].offset = 0;
+    constants_info_[111].data_size = 20736;
+    constants_info_[111].from_folded = false;
+    constants_info_[111].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[111].shape = {576, 1, 3, 3};
+    constants_info_[111].stride = {9, 9, 3, 1};
+    constants_info_[111].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[111].original_fqn = "mv2.features.13.conv.1.0.weight";
+    constants_info_[112].name = "mv2_features_13_conv_1_1_weight";
+    constants_info_[112].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[112].offset = 0;
+    constants_info_[112].data_size = 2304;
+    constants_info_[112].from_folded = false;
+    constants_info_[112].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[112].shape = {576};
+    constants_info_[112].stride = {1};
+    constants_info_[112].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[112].original_fqn = "mv2.features.13.conv.1.1.weight";
+    constants_info_[113].name = "mv2_features_13_conv_1_1_bias";
+    constants_info_[113].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[113].offset = 0;
+    constants_info_[113].data_size = 2304;
+    constants_info_[113].from_folded = false;
+    constants_info_[113].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[113].shape = {576};
+    constants_info_[113].stride = {1};
+    constants_info_[113].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[113].original_fqn = "mv2.features.13.conv.1.1.bias";
+    constants_info_[114].name = "mv2_features_13_conv_2_weight";
+    constants_info_[114].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[114].offset = 0;
+    constants_info_[114].data_size = 221184;
+    constants_info_[114].from_folded = false;
+    constants_info_[114].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[114].shape = {96, 576, 1, 1};
+    constants_info_[114].stride = {576, 1, 1, 1};
+    constants_info_[114].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[114].original_fqn = "mv2.features.13.conv.2.weight";
+    constants_info_[115].name = "mv2_features_13_conv_3_weight";
+    constants_info_[115].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[115].offset = 0;
+    constants_info_[115].data_size = 384;
+    constants_info_[115].from_folded = false;
+    constants_info_[115].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[115].shape = {96};
+    constants_info_[115].stride = {1};
+    constants_info_[115].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[115].original_fqn = "mv2.features.13.conv.3.weight";
+    constants_info_[116].name = "mv2_features_13_conv_3_bias";
+    constants_info_[116].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[116].offset = 0;
+    constants_info_[116].data_size = 384;
+    constants_info_[116].from_folded = false;
+    constants_info_[116].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[116].shape = {96};
+
constants_info_[116].stride = {1}; + constants_info_[116].layout = static_cast(cached_torch_layout_strided); + constants_info_[116].original_fqn = "mv2.features.13.conv.3.bias"; + constants_info_[117].name = "mv2_features_14_conv_0_0_weight"; + constants_info_[117].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[117].offset = 0; + constants_info_[117].data_size = 221184; + constants_info_[117].from_folded = false; + constants_info_[117].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[117].shape = {576, 96, 1, 1}; + constants_info_[117].stride = {96, 1, 1, 1}; + constants_info_[117].layout = static_cast(cached_torch_layout_strided); + constants_info_[117].original_fqn = "mv2.features.14.conv.0.0.weight"; + constants_info_[118].name = "mv2_features_14_conv_0_1_weight"; + constants_info_[118].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[118].offset = 0; + constants_info_[118].data_size = 2304; + constants_info_[118].from_folded = false; + constants_info_[118].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[118].shape = {576}; + constants_info_[118].stride = {1}; + constants_info_[118].layout = static_cast(cached_torch_layout_strided); + constants_info_[118].original_fqn = "mv2.features.14.conv.0.1.weight"; + constants_info_[119].name = "mv2_features_14_conv_0_1_bias"; + constants_info_[119].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[119].offset = 0; + constants_info_[119].data_size = 2304; + constants_info_[119].from_folded = false; + constants_info_[119].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[119].shape = {576}; + constants_info_[119].stride = {1}; + constants_info_[119].layout = static_cast(cached_torch_layout_strided); + constants_info_[119].original_fqn = "mv2.features.14.conv.0.1.bias"; + constants_info_[120].name = "mv2_features_14_conv_1_0_weight"; + constants_info_[120].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[120].offset = 0; + constants_info_[120].data_size = 20736; + constants_info_[120].from_folded = false; + constants_info_[120].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[120].shape = {576, 1, 3, 3}; + constants_info_[120].stride = {9, 9, 3, 1}; + constants_info_[120].layout = static_cast(cached_torch_layout_strided); + constants_info_[120].original_fqn = "mv2.features.14.conv.1.0.weight"; + constants_info_[121].name = "mv2_features_14_conv_1_1_weight"; + constants_info_[121].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[121].offset = 0; + constants_info_[121].data_size = 2304; + constants_info_[121].from_folded = false; + constants_info_[121].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[121].shape = {576}; + constants_info_[121].stride = {1}; + constants_info_[121].layout = static_cast(cached_torch_layout_strided); + constants_info_[121].original_fqn = "mv2.features.14.conv.1.1.weight"; + constants_info_[122].name = "mv2_features_14_conv_1_1_bias"; + constants_info_[122].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[122].offset = 0; + constants_info_[122].data_size = 2304; + constants_info_[122].from_folded = false; + constants_info_[122].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[122].shape = {576}; + constants_info_[122].stride = {1}; + constants_info_[122].layout = static_cast(cached_torch_layout_strided); + 
constants_info_[122].original_fqn = "mv2.features.14.conv.1.1.bias"; + constants_info_[123].name = "mv2_features_14_conv_2_weight"; + constants_info_[123].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[123].offset = 0; + constants_info_[123].data_size = 368640; + constants_info_[123].from_folded = false; + constants_info_[123].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[123].shape = {160, 576, 1, 1}; + constants_info_[123].stride = {576, 1, 1, 1}; + constants_info_[123].layout = static_cast(cached_torch_layout_strided); + constants_info_[123].original_fqn = "mv2.features.14.conv.2.weight"; + constants_info_[124].name = "mv2_features_14_conv_3_weight"; + constants_info_[124].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[124].offset = 0; + constants_info_[124].data_size = 640; + constants_info_[124].from_folded = false; + constants_info_[124].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[124].shape = {160}; + constants_info_[124].stride = {1}; + constants_info_[124].layout = static_cast(cached_torch_layout_strided); + constants_info_[124].original_fqn = "mv2.features.14.conv.3.weight"; + constants_info_[125].name = "mv2_features_14_conv_3_bias"; + constants_info_[125].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[125].offset = 0; + constants_info_[125].data_size = 640; + constants_info_[125].from_folded = false; + constants_info_[125].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[125].shape = {160}; + constants_info_[125].stride = {1}; + constants_info_[125].layout = static_cast(cached_torch_layout_strided); + constants_info_[125].original_fqn = "mv2.features.14.conv.3.bias"; + constants_info_[126].name = "mv2_features_15_conv_0_0_weight"; + constants_info_[126].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[126].offset = 0; + constants_info_[126].data_size = 614400; + constants_info_[126].from_folded = false; + constants_info_[126].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[126].shape = {960, 160, 1, 1}; + constants_info_[126].stride = {160, 1, 1, 1}; + constants_info_[126].layout = static_cast(cached_torch_layout_strided); + constants_info_[126].original_fqn = "mv2.features.15.conv.0.0.weight"; + constants_info_[127].name = "mv2_features_15_conv_0_1_weight"; + constants_info_[127].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[127].offset = 0; + constants_info_[127].data_size = 3840; + constants_info_[127].from_folded = false; + constants_info_[127].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[127].shape = {960}; + constants_info_[127].stride = {1}; + constants_info_[127].layout = static_cast(cached_torch_layout_strided); + constants_info_[127].original_fqn = "mv2.features.15.conv.0.1.weight"; + constants_info_[128].name = "mv2_features_15_conv_0_1_bias"; + constants_info_[128].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[128].offset = 0; + constants_info_[128].data_size = 3840; + constants_info_[128].from_folded = false; + constants_info_[128].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[128].shape = {960}; + constants_info_[128].stride = {1}; + constants_info_[128].layout = static_cast(cached_torch_layout_strided); + constants_info_[128].original_fqn = "mv2.features.15.conv.0.1.bias"; + constants_info_[129].name = 
"mv2_features_15_conv_1_0_weight"; + constants_info_[129].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[129].offset = 0; + constants_info_[129].data_size = 34560; + constants_info_[129].from_folded = false; + constants_info_[129].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[129].shape = {960, 1, 3, 3}; + constants_info_[129].stride = {9, 9, 3, 1}; + constants_info_[129].layout = static_cast(cached_torch_layout_strided); + constants_info_[129].original_fqn = "mv2.features.15.conv.1.0.weight"; + constants_info_[130].name = "mv2_features_15_conv_1_1_weight"; + constants_info_[130].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[130].offset = 0; + constants_info_[130].data_size = 3840; + constants_info_[130].from_folded = false; + constants_info_[130].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[130].shape = {960}; + constants_info_[130].stride = {1}; + constants_info_[130].layout = static_cast(cached_torch_layout_strided); + constants_info_[130].original_fqn = "mv2.features.15.conv.1.1.weight"; + constants_info_[131].name = "mv2_features_15_conv_1_1_bias"; + constants_info_[131].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[131].offset = 0; + constants_info_[131].data_size = 3840; + constants_info_[131].from_folded = false; + constants_info_[131].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[131].shape = {960}; + constants_info_[131].stride = {1}; + constants_info_[131].layout = static_cast(cached_torch_layout_strided); + constants_info_[131].original_fqn = "mv2.features.15.conv.1.1.bias"; + constants_info_[132].name = "mv2_features_15_conv_2_weight"; + constants_info_[132].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[132].offset = 0; + constants_info_[132].data_size = 614400; + constants_info_[132].from_folded = false; + constants_info_[132].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[132].shape = {160, 960, 1, 1}; + constants_info_[132].stride = {960, 1, 1, 1}; + constants_info_[132].layout = static_cast(cached_torch_layout_strided); + constants_info_[132].original_fqn = "mv2.features.15.conv.2.weight"; + constants_info_[133].name = "mv2_features_15_conv_3_weight"; + constants_info_[133].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[133].offset = 0; + constants_info_[133].data_size = 640; + constants_info_[133].from_folded = false; + constants_info_[133].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[133].shape = {160}; + constants_info_[133].stride = {1}; + constants_info_[133].layout = static_cast(cached_torch_layout_strided); + constants_info_[133].original_fqn = "mv2.features.15.conv.3.weight"; + constants_info_[134].name = "mv2_features_15_conv_3_bias"; + constants_info_[134].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[134].offset = 0; + constants_info_[134].data_size = 640; + constants_info_[134].from_folded = false; + constants_info_[134].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[134].shape = {160}; + constants_info_[134].stride = {1}; + constants_info_[134].layout = static_cast(cached_torch_layout_strided); + constants_info_[134].original_fqn = "mv2.features.15.conv.3.bias"; + constants_info_[135].name = "mv2_features_16_conv_0_0_weight"; + constants_info_[135].dtype = static_cast(cached_torch_dtype_float32); + 
constants_info_[135].offset = 0; + constants_info_[135].data_size = 614400; + constants_info_[135].from_folded = false; + constants_info_[135].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[135].shape = {960, 160, 1, 1}; + constants_info_[135].stride = {160, 1, 1, 1}; + constants_info_[135].layout = static_cast(cached_torch_layout_strided); + constants_info_[135].original_fqn = "mv2.features.16.conv.0.0.weight"; + constants_info_[136].name = "mv2_features_16_conv_0_1_weight"; + constants_info_[136].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[136].offset = 0; + constants_info_[136].data_size = 3840; + constants_info_[136].from_folded = false; + constants_info_[136].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[136].shape = {960}; + constants_info_[136].stride = {1}; + constants_info_[136].layout = static_cast(cached_torch_layout_strided); + constants_info_[136].original_fqn = "mv2.features.16.conv.0.1.weight"; + constants_info_[137].name = "mv2_features_16_conv_0_1_bias"; + constants_info_[137].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[137].offset = 0; + constants_info_[137].data_size = 3840; + constants_info_[137].from_folded = false; + constants_info_[137].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[137].shape = {960}; + constants_info_[137].stride = {1}; + constants_info_[137].layout = static_cast(cached_torch_layout_strided); + constants_info_[137].original_fqn = "mv2.features.16.conv.0.1.bias"; + constants_info_[138].name = "mv2_features_16_conv_1_0_weight"; + constants_info_[138].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[138].offset = 0; + constants_info_[138].data_size = 34560; + constants_info_[138].from_folded = false; + constants_info_[138].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[138].shape = {960, 1, 3, 3}; + constants_info_[138].stride = {9, 9, 3, 1}; + constants_info_[138].layout = static_cast(cached_torch_layout_strided); + constants_info_[138].original_fqn = "mv2.features.16.conv.1.0.weight"; + constants_info_[139].name = "mv2_features_16_conv_1_1_weight"; + constants_info_[139].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[139].offset = 0; + constants_info_[139].data_size = 3840; + constants_info_[139].from_folded = false; + constants_info_[139].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[139].shape = {960}; + constants_info_[139].stride = {1}; + constants_info_[139].layout = static_cast(cached_torch_layout_strided); + constants_info_[139].original_fqn = "mv2.features.16.conv.1.1.weight"; + constants_info_[140].name = "mv2_features_16_conv_1_1_bias"; + constants_info_[140].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[140].offset = 0; + constants_info_[140].data_size = 3840; + constants_info_[140].from_folded = false; + constants_info_[140].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[140].shape = {960}; + constants_info_[140].stride = {1}; + constants_info_[140].layout = static_cast(cached_torch_layout_strided); + constants_info_[140].original_fqn = "mv2.features.16.conv.1.1.bias"; + constants_info_[141].name = "mv2_features_16_conv_2_weight"; + constants_info_[141].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[141].offset = 0; + constants_info_[141].data_size = 614400; + constants_info_[141].from_folded = 
false; + constants_info_[141].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[141].shape = {160, 960, 1, 1}; + constants_info_[141].stride = {960, 1, 1, 1}; + constants_info_[141].layout = static_cast(cached_torch_layout_strided); + constants_info_[141].original_fqn = "mv2.features.16.conv.2.weight"; + constants_info_[142].name = "mv2_features_16_conv_3_weight"; + constants_info_[142].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[142].offset = 0; + constants_info_[142].data_size = 640; + constants_info_[142].from_folded = false; + constants_info_[142].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[142].shape = {160}; + constants_info_[142].stride = {1}; + constants_info_[142].layout = static_cast(cached_torch_layout_strided); + constants_info_[142].original_fqn = "mv2.features.16.conv.3.weight"; + constants_info_[143].name = "mv2_features_16_conv_3_bias"; + constants_info_[143].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[143].offset = 0; + constants_info_[143].data_size = 640; + constants_info_[143].from_folded = false; + constants_info_[143].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[143].shape = {160}; + constants_info_[143].stride = {1}; + constants_info_[143].layout = static_cast(cached_torch_layout_strided); + constants_info_[143].original_fqn = "mv2.features.16.conv.3.bias"; + constants_info_[144].name = "mv2_features_17_conv_0_0_weight"; + constants_info_[144].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[144].offset = 0; + constants_info_[144].data_size = 614400; + constants_info_[144].from_folded = false; + constants_info_[144].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[144].shape = {960, 160, 1, 1}; + constants_info_[144].stride = {160, 1, 1, 1}; + constants_info_[144].layout = static_cast(cached_torch_layout_strided); + constants_info_[144].original_fqn = "mv2.features.17.conv.0.0.weight"; + constants_info_[145].name = "mv2_features_17_conv_0_1_weight"; + constants_info_[145].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[145].offset = 0; + constants_info_[145].data_size = 3840; + constants_info_[145].from_folded = false; + constants_info_[145].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[145].shape = {960}; + constants_info_[145].stride = {1}; + constants_info_[145].layout = static_cast(cached_torch_layout_strided); + constants_info_[145].original_fqn = "mv2.features.17.conv.0.1.weight"; + constants_info_[146].name = "mv2_features_17_conv_0_1_bias"; + constants_info_[146].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[146].offset = 0; + constants_info_[146].data_size = 3840; + constants_info_[146].from_folded = false; + constants_info_[146].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[146].shape = {960}; + constants_info_[146].stride = {1}; + constants_info_[146].layout = static_cast(cached_torch_layout_strided); + constants_info_[146].original_fqn = "mv2.features.17.conv.0.1.bias"; + constants_info_[147].name = "mv2_features_17_conv_1_0_weight"; + constants_info_[147].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[147].offset = 0; + constants_info_[147].data_size = 34560; + constants_info_[147].from_folded = false; + constants_info_[147].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + 
constants_info_[147].shape = {960, 1, 3, 3}; + constants_info_[147].stride = {9, 9, 3, 1}; + constants_info_[147].layout = static_cast(cached_torch_layout_strided); + constants_info_[147].original_fqn = "mv2.features.17.conv.1.0.weight"; + constants_info_[148].name = "mv2_features_17_conv_1_1_weight"; + constants_info_[148].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[148].offset = 0; + constants_info_[148].data_size = 3840; + constants_info_[148].from_folded = false; + constants_info_[148].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[148].shape = {960}; + constants_info_[148].stride = {1}; + constants_info_[148].layout = static_cast(cached_torch_layout_strided); + constants_info_[148].original_fqn = "mv2.features.17.conv.1.1.weight"; + constants_info_[149].name = "mv2_features_17_conv_1_1_bias"; + constants_info_[149].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[149].offset = 0; + constants_info_[149].data_size = 3840; + constants_info_[149].from_folded = false; + constants_info_[149].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[149].shape = {960}; + constants_info_[149].stride = {1}; + constants_info_[149].layout = static_cast(cached_torch_layout_strided); + constants_info_[149].original_fqn = "mv2.features.17.conv.1.1.bias"; + constants_info_[150].name = "mv2_features_17_conv_2_weight"; + constants_info_[150].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[150].offset = 0; + constants_info_[150].data_size = 1228800; + constants_info_[150].from_folded = false; + constants_info_[150].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[150].shape = {320, 960, 1, 1}; + constants_info_[150].stride = {960, 1, 1, 1}; + constants_info_[150].layout = static_cast(cached_torch_layout_strided); + constants_info_[150].original_fqn = "mv2.features.17.conv.2.weight"; + constants_info_[151].name = "mv2_features_17_conv_3_weight"; + constants_info_[151].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[151].offset = 0; + constants_info_[151].data_size = 1280; + constants_info_[151].from_folded = false; + constants_info_[151].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[151].shape = {320}; + constants_info_[151].stride = {1}; + constants_info_[151].layout = static_cast(cached_torch_layout_strided); + constants_info_[151].original_fqn = "mv2.features.17.conv.3.weight"; + constants_info_[152].name = "mv2_features_17_conv_3_bias"; + constants_info_[152].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[152].offset = 0; + constants_info_[152].data_size = 1280; + constants_info_[152].from_folded = false; + constants_info_[152].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[152].shape = {320}; + constants_info_[152].stride = {1}; + constants_info_[152].layout = static_cast(cached_torch_layout_strided); + constants_info_[152].original_fqn = "mv2.features.17.conv.3.bias"; + constants_info_[153].name = "mv2_features_18_0_weight"; + constants_info_[153].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[153].offset = 0; + constants_info_[153].data_size = 1638400; + constants_info_[153].from_folded = false; + constants_info_[153].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[153].shape = {1280, 320, 1, 1}; + constants_info_[153].stride = {320, 1, 1, 1}; + constants_info_[153].layout = 
static_cast(cached_torch_layout_strided); + constants_info_[153].original_fqn = "mv2.features.18.0.weight"; + constants_info_[154].name = "mv2_features_18_1_weight"; + constants_info_[154].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[154].offset = 0; + constants_info_[154].data_size = 5120; + constants_info_[154].from_folded = false; + constants_info_[154].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[154].shape = {1280}; + constants_info_[154].stride = {1}; + constants_info_[154].layout = static_cast(cached_torch_layout_strided); + constants_info_[154].original_fqn = "mv2.features.18.1.weight"; + constants_info_[155].name = "mv2_features_18_1_bias"; + constants_info_[155].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[155].offset = 0; + constants_info_[155].data_size = 5120; + constants_info_[155].from_folded = false; + constants_info_[155].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[155].shape = {1280}; + constants_info_[155].stride = {1}; + constants_info_[155].layout = static_cast(cached_torch_layout_strided); + constants_info_[155].original_fqn = "mv2.features.18.1.bias"; + constants_info_[156].name = "mv2_classifier_1_weight"; + constants_info_[156].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[156].offset = 0; + constants_info_[156].data_size = 5120000; + constants_info_[156].from_folded = false; + constants_info_[156].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[156].shape = {1000, 1280}; + constants_info_[156].stride = {1280, 1}; + constants_info_[156].layout = static_cast(cached_torch_layout_strided); + constants_info_[156].original_fqn = "mv2.classifier.1.weight"; + constants_info_[157].name = "mv2_classifier_1_bias"; + constants_info_[157].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[157].offset = 0; + constants_info_[157].data_size = 4000; + constants_info_[157].from_folded = false; + constants_info_[157].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[157].shape = {1000}; + constants_info_[157].stride = {1}; + constants_info_[157].layout = static_cast(cached_torch_layout_strided); + constants_info_[157].original_fqn = "mv2.classifier.1.bias"; + constants_info_[158].name = "mv2_features_0_1_running_mean"; + constants_info_[158].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[158].offset = 0; + constants_info_[158].data_size = 128; + constants_info_[158].from_folded = false; + constants_info_[158].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[158].shape = {32}; + constants_info_[158].stride = {1}; + constants_info_[158].layout = static_cast(cached_torch_layout_strided); + constants_info_[158].original_fqn = "mv2.features.0.1.running_mean"; + constants_info_[159].name = "mv2_features_0_1_running_var"; + constants_info_[159].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[159].offset = 0; + constants_info_[159].data_size = 128; + constants_info_[159].from_folded = false; + constants_info_[159].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[159].shape = {32}; + constants_info_[159].stride = {1}; + constants_info_[159].layout = static_cast(cached_torch_layout_strided); + constants_info_[159].original_fqn = "mv2.features.0.1.running_var"; + constants_info_[160].name = "mv2_features_1_conv_0_1_running_mean"; + constants_info_[160].dtype = 
static_cast(cached_torch_dtype_float32); + constants_info_[160].offset = 0; + constants_info_[160].data_size = 128; + constants_info_[160].from_folded = false; + constants_info_[160].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[160].shape = {32}; + constants_info_[160].stride = {1}; + constants_info_[160].layout = static_cast(cached_torch_layout_strided); + constants_info_[160].original_fqn = "mv2.features.1.conv.0.1.running_mean"; + constants_info_[161].name = "mv2_features_1_conv_0_1_running_var"; + constants_info_[161].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[161].offset = 0; + constants_info_[161].data_size = 128; + constants_info_[161].from_folded = false; + constants_info_[161].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[161].shape = {32}; + constants_info_[161].stride = {1}; + constants_info_[161].layout = static_cast(cached_torch_layout_strided); + constants_info_[161].original_fqn = "mv2.features.1.conv.0.1.running_var"; + constants_info_[162].name = "mv2_features_1_conv_2_running_mean"; + constants_info_[162].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[162].offset = 0; + constants_info_[162].data_size = 64; + constants_info_[162].from_folded = false; + constants_info_[162].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[162].shape = {16}; + constants_info_[162].stride = {1}; + constants_info_[162].layout = static_cast(cached_torch_layout_strided); + constants_info_[162].original_fqn = "mv2.features.1.conv.2.running_mean"; + constants_info_[163].name = "mv2_features_1_conv_2_running_var"; + constants_info_[163].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[163].offset = 0; + constants_info_[163].data_size = 64; + constants_info_[163].from_folded = false; + constants_info_[163].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[163].shape = {16}; + constants_info_[163].stride = {1}; + constants_info_[163].layout = static_cast(cached_torch_layout_strided); + constants_info_[163].original_fqn = "mv2.features.1.conv.2.running_var"; + constants_info_[164].name = "mv2_features_2_conv_0_1_running_mean"; + constants_info_[164].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[164].offset = 0; + constants_info_[164].data_size = 384; + constants_info_[164].from_folded = false; + constants_info_[164].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[164].shape = {96}; + constants_info_[164].stride = {1}; + constants_info_[164].layout = static_cast(cached_torch_layout_strided); + constants_info_[164].original_fqn = "mv2.features.2.conv.0.1.running_mean"; + constants_info_[165].name = "mv2_features_2_conv_0_1_running_var"; + constants_info_[165].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[165].offset = 0; + constants_info_[165].data_size = 384; + constants_info_[165].from_folded = false; + constants_info_[165].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[165].shape = {96}; + constants_info_[165].stride = {1}; + constants_info_[165].layout = static_cast(cached_torch_layout_strided); + constants_info_[165].original_fqn = "mv2.features.2.conv.0.1.running_var"; + constants_info_[166].name = "mv2_features_2_conv_1_1_running_mean"; + constants_info_[166].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[166].offset = 0; + constants_info_[166].data_size = 384; + 
constants_info_[166].from_folded = false; + constants_info_[166].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[166].shape = {96}; + constants_info_[166].stride = {1}; + constants_info_[166].layout = static_cast(cached_torch_layout_strided); + constants_info_[166].original_fqn = "mv2.features.2.conv.1.1.running_mean"; + constants_info_[167].name = "mv2_features_2_conv_1_1_running_var"; + constants_info_[167].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[167].offset = 0; + constants_info_[167].data_size = 384; + constants_info_[167].from_folded = false; + constants_info_[167].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[167].shape = {96}; + constants_info_[167].stride = {1}; + constants_info_[167].layout = static_cast(cached_torch_layout_strided); + constants_info_[167].original_fqn = "mv2.features.2.conv.1.1.running_var"; + constants_info_[168].name = "mv2_features_2_conv_3_running_mean"; + constants_info_[168].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[168].offset = 0; + constants_info_[168].data_size = 96; + constants_info_[168].from_folded = false; + constants_info_[168].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[168].shape = {24}; + constants_info_[168].stride = {1}; + constants_info_[168].layout = static_cast(cached_torch_layout_strided); + constants_info_[168].original_fqn = "mv2.features.2.conv.3.running_mean"; + constants_info_[169].name = "mv2_features_2_conv_3_running_var"; + constants_info_[169].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[169].offset = 0; + constants_info_[169].data_size = 96; + constants_info_[169].from_folded = false; + constants_info_[169].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[169].shape = {24}; + constants_info_[169].stride = {1}; + constants_info_[169].layout = static_cast(cached_torch_layout_strided); + constants_info_[169].original_fqn = "mv2.features.2.conv.3.running_var"; + constants_info_[170].name = "mv2_features_3_conv_0_1_running_mean"; + constants_info_[170].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[170].offset = 0; + constants_info_[170].data_size = 576; + constants_info_[170].from_folded = false; + constants_info_[170].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[170].shape = {144}; + constants_info_[170].stride = {1}; + constants_info_[170].layout = static_cast(cached_torch_layout_strided); + constants_info_[170].original_fqn = "mv2.features.3.conv.0.1.running_mean"; + constants_info_[171].name = "mv2_features_3_conv_0_1_running_var"; + constants_info_[171].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[171].offset = 0; + constants_info_[171].data_size = 576; + constants_info_[171].from_folded = false; + constants_info_[171].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[171].shape = {144}; + constants_info_[171].stride = {1}; + constants_info_[171].layout = static_cast(cached_torch_layout_strided); + constants_info_[171].original_fqn = "mv2.features.3.conv.0.1.running_var"; + constants_info_[172].name = "mv2_features_3_conv_1_1_running_mean"; + constants_info_[172].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[172].offset = 0; + constants_info_[172].data_size = 576; + constants_info_[172].from_folded = false; + constants_info_[172].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + 
constants_info_[172].shape = {144}; + constants_info_[172].stride = {1}; + constants_info_[172].layout = static_cast(cached_torch_layout_strided); + constants_info_[172].original_fqn = "mv2.features.3.conv.1.1.running_mean"; + constants_info_[173].name = "mv2_features_3_conv_1_1_running_var"; + constants_info_[173].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[173].offset = 0; + constants_info_[173].data_size = 576; + constants_info_[173].from_folded = false; + constants_info_[173].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[173].shape = {144}; + constants_info_[173].stride = {1}; + constants_info_[173].layout = static_cast(cached_torch_layout_strided); + constants_info_[173].original_fqn = "mv2.features.3.conv.1.1.running_var"; + constants_info_[174].name = "mv2_features_3_conv_3_running_mean"; + constants_info_[174].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[174].offset = 0; + constants_info_[174].data_size = 96; + constants_info_[174].from_folded = false; + constants_info_[174].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[174].shape = {24}; + constants_info_[174].stride = {1}; + constants_info_[174].layout = static_cast(cached_torch_layout_strided); + constants_info_[174].original_fqn = "mv2.features.3.conv.3.running_mean"; + constants_info_[175].name = "mv2_features_3_conv_3_running_var"; + constants_info_[175].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[175].offset = 0; + constants_info_[175].data_size = 96; + constants_info_[175].from_folded = false; + constants_info_[175].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[175].shape = {24}; + constants_info_[175].stride = {1}; + constants_info_[175].layout = static_cast(cached_torch_layout_strided); + constants_info_[175].original_fqn = "mv2.features.3.conv.3.running_var"; + constants_info_[176].name = "mv2_features_4_conv_0_1_running_mean"; + constants_info_[176].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[176].offset = 0; + constants_info_[176].data_size = 576; + constants_info_[176].from_folded = false; + constants_info_[176].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[176].shape = {144}; + constants_info_[176].stride = {1}; + constants_info_[176].layout = static_cast(cached_torch_layout_strided); + constants_info_[176].original_fqn = "mv2.features.4.conv.0.1.running_mean"; + constants_info_[177].name = "mv2_features_4_conv_0_1_running_var"; + constants_info_[177].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[177].offset = 0; + constants_info_[177].data_size = 576; + constants_info_[177].from_folded = false; + constants_info_[177].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[177].shape = {144}; + constants_info_[177].stride = {1}; + constants_info_[177].layout = static_cast(cached_torch_layout_strided); + constants_info_[177].original_fqn = "mv2.features.4.conv.0.1.running_var"; + constants_info_[178].name = "mv2_features_4_conv_1_1_running_mean"; + constants_info_[178].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[178].offset = 0; + constants_info_[178].data_size = 576; + constants_info_[178].from_folded = false; + constants_info_[178].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[178].shape = {144}; + constants_info_[178].stride = {1}; + constants_info_[178].layout = 
static_cast(cached_torch_layout_strided); + constants_info_[178].original_fqn = "mv2.features.4.conv.1.1.running_mean"; + constants_info_[179].name = "mv2_features_4_conv_1_1_running_var"; + constants_info_[179].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[179].offset = 0; + constants_info_[179].data_size = 576; + constants_info_[179].from_folded = false; + constants_info_[179].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[179].shape = {144}; + constants_info_[179].stride = {1}; + constants_info_[179].layout = static_cast(cached_torch_layout_strided); + constants_info_[179].original_fqn = "mv2.features.4.conv.1.1.running_var"; + constants_info_[180].name = "mv2_features_4_conv_3_running_mean"; + constants_info_[180].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[180].offset = 0; + constants_info_[180].data_size = 128; + constants_info_[180].from_folded = false; + constants_info_[180].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[180].shape = {32}; + constants_info_[180].stride = {1}; + constants_info_[180].layout = static_cast(cached_torch_layout_strided); + constants_info_[180].original_fqn = "mv2.features.4.conv.3.running_mean"; + constants_info_[181].name = "mv2_features_4_conv_3_running_var"; + constants_info_[181].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[181].offset = 0; + constants_info_[181].data_size = 128; + constants_info_[181].from_folded = false; + constants_info_[181].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[181].shape = {32}; + constants_info_[181].stride = {1}; + constants_info_[181].layout = static_cast(cached_torch_layout_strided); + constants_info_[181].original_fqn = "mv2.features.4.conv.3.running_var"; + constants_info_[182].name = "mv2_features_5_conv_0_1_running_mean"; + constants_info_[182].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[182].offset = 0; + constants_info_[182].data_size = 768; + constants_info_[182].from_folded = false; + constants_info_[182].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[182].shape = {192}; + constants_info_[182].stride = {1}; + constants_info_[182].layout = static_cast(cached_torch_layout_strided); + constants_info_[182].original_fqn = "mv2.features.5.conv.0.1.running_mean"; + constants_info_[183].name = "mv2_features_5_conv_0_1_running_var"; + constants_info_[183].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[183].offset = 0; + constants_info_[183].data_size = 768; + constants_info_[183].from_folded = false; + constants_info_[183].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[183].shape = {192}; + constants_info_[183].stride = {1}; + constants_info_[183].layout = static_cast(cached_torch_layout_strided); + constants_info_[183].original_fqn = "mv2.features.5.conv.0.1.running_var"; + constants_info_[184].name = "mv2_features_5_conv_1_1_running_mean"; + constants_info_[184].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[184].offset = 0; + constants_info_[184].data_size = 768; + constants_info_[184].from_folded = false; + constants_info_[184].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[184].shape = {192}; + constants_info_[184].stride = {1}; + constants_info_[184].layout = static_cast(cached_torch_layout_strided); + constants_info_[184].original_fqn = "mv2.features.5.conv.1.1.running_mean"; + 
constants_info_[185].name = "mv2_features_5_conv_1_1_running_var"; + constants_info_[185].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[185].offset = 0; + constants_info_[185].data_size = 768; + constants_info_[185].from_folded = false; + constants_info_[185].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[185].shape = {192}; + constants_info_[185].stride = {1}; + constants_info_[185].layout = static_cast(cached_torch_layout_strided); + constants_info_[185].original_fqn = "mv2.features.5.conv.1.1.running_var"; + constants_info_[186].name = "mv2_features_5_conv_3_running_mean"; + constants_info_[186].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[186].offset = 0; + constants_info_[186].data_size = 128; + constants_info_[186].from_folded = false; + constants_info_[186].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[186].shape = {32}; + constants_info_[186].stride = {1}; + constants_info_[186].layout = static_cast(cached_torch_layout_strided); + constants_info_[186].original_fqn = "mv2.features.5.conv.3.running_mean"; + constants_info_[187].name = "mv2_features_5_conv_3_running_var"; + constants_info_[187].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[187].offset = 0; + constants_info_[187].data_size = 128; + constants_info_[187].from_folded = false; + constants_info_[187].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[187].shape = {32}; + constants_info_[187].stride = {1}; + constants_info_[187].layout = static_cast(cached_torch_layout_strided); + constants_info_[187].original_fqn = "mv2.features.5.conv.3.running_var"; + constants_info_[188].name = "mv2_features_6_conv_0_1_running_mean"; + constants_info_[188].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[188].offset = 0; + constants_info_[188].data_size = 768; + constants_info_[188].from_folded = false; + constants_info_[188].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[188].shape = {192}; + constants_info_[188].stride = {1}; + constants_info_[188].layout = static_cast(cached_torch_layout_strided); + constants_info_[188].original_fqn = "mv2.features.6.conv.0.1.running_mean"; + constants_info_[189].name = "mv2_features_6_conv_0_1_running_var"; + constants_info_[189].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[189].offset = 0; + constants_info_[189].data_size = 768; + constants_info_[189].from_folded = false; + constants_info_[189].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[189].shape = {192}; + constants_info_[189].stride = {1}; + constants_info_[189].layout = static_cast(cached_torch_layout_strided); + constants_info_[189].original_fqn = "mv2.features.6.conv.0.1.running_var"; + constants_info_[190].name = "mv2_features_6_conv_1_1_running_mean"; + constants_info_[190].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[190].offset = 0; + constants_info_[190].data_size = 768; + constants_info_[190].from_folded = false; + constants_info_[190].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[190].shape = {192}; + constants_info_[190].stride = {1}; + constants_info_[190].layout = static_cast(cached_torch_layout_strided); + constants_info_[190].original_fqn = "mv2.features.6.conv.1.1.running_mean"; + constants_info_[191].name = "mv2_features_6_conv_1_1_running_var"; + constants_info_[191].dtype = 
static_cast(cached_torch_dtype_float32); + constants_info_[191].offset = 0; + constants_info_[191].data_size = 768; + constants_info_[191].from_folded = false; + constants_info_[191].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[191].shape = {192}; + constants_info_[191].stride = {1}; + constants_info_[191].layout = static_cast(cached_torch_layout_strided); + constants_info_[191].original_fqn = "mv2.features.6.conv.1.1.running_var"; + constants_info_[192].name = "mv2_features_6_conv_3_running_mean"; + constants_info_[192].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[192].offset = 0; + constants_info_[192].data_size = 128; + constants_info_[192].from_folded = false; + constants_info_[192].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[192].shape = {32}; + constants_info_[192].stride = {1}; + constants_info_[192].layout = static_cast(cached_torch_layout_strided); + constants_info_[192].original_fqn = "mv2.features.6.conv.3.running_mean"; + constants_info_[193].name = "mv2_features_6_conv_3_running_var"; + constants_info_[193].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[193].offset = 0; + constants_info_[193].data_size = 128; + constants_info_[193].from_folded = false; + constants_info_[193].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[193].shape = {32}; + constants_info_[193].stride = {1}; + constants_info_[193].layout = static_cast(cached_torch_layout_strided); + constants_info_[193].original_fqn = "mv2.features.6.conv.3.running_var"; + constants_info_[194].name = "mv2_features_7_conv_0_1_running_mean"; + constants_info_[194].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[194].offset = 0; + constants_info_[194].data_size = 768; + constants_info_[194].from_folded = false; + constants_info_[194].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[194].shape = {192}; + constants_info_[194].stride = {1}; + constants_info_[194].layout = static_cast(cached_torch_layout_strided); + constants_info_[194].original_fqn = "mv2.features.7.conv.0.1.running_mean"; + constants_info_[195].name = "mv2_features_7_conv_0_1_running_var"; + constants_info_[195].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[195].offset = 0; + constants_info_[195].data_size = 768; + constants_info_[195].from_folded = false; + constants_info_[195].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[195].shape = {192}; + constants_info_[195].stride = {1}; + constants_info_[195].layout = static_cast(cached_torch_layout_strided); + constants_info_[195].original_fqn = "mv2.features.7.conv.0.1.running_var"; + constants_info_[196].name = "mv2_features_7_conv_1_1_running_mean"; + constants_info_[196].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[196].offset = 0; + constants_info_[196].data_size = 768; + constants_info_[196].from_folded = false; + constants_info_[196].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[196].shape = {192}; + constants_info_[196].stride = {1}; + constants_info_[196].layout = static_cast(cached_torch_layout_strided); + constants_info_[196].original_fqn = "mv2.features.7.conv.1.1.running_mean"; + constants_info_[197].name = "mv2_features_7_conv_1_1_running_var"; + constants_info_[197].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[197].offset = 0; + constants_info_[197].data_size = 768; + 
constants_info_[197].from_folded = false; + constants_info_[197].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[197].shape = {192}; + constants_info_[197].stride = {1}; + constants_info_[197].layout = static_cast(cached_torch_layout_strided); + constants_info_[197].original_fqn = "mv2.features.7.conv.1.1.running_var"; + constants_info_[198].name = "mv2_features_7_conv_3_running_mean"; + constants_info_[198].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[198].offset = 0; + constants_info_[198].data_size = 256; + constants_info_[198].from_folded = false; + constants_info_[198].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[198].shape = {64}; + constants_info_[198].stride = {1}; + constants_info_[198].layout = static_cast(cached_torch_layout_strided); + constants_info_[198].original_fqn = "mv2.features.7.conv.3.running_mean"; + constants_info_[199].name = "mv2_features_7_conv_3_running_var"; + constants_info_[199].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[199].offset = 0; + constants_info_[199].data_size = 256; + constants_info_[199].from_folded = false; + constants_info_[199].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[199].shape = {64}; + constants_info_[199].stride = {1}; + constants_info_[199].layout = static_cast(cached_torch_layout_strided); + constants_info_[199].original_fqn = "mv2.features.7.conv.3.running_var"; + constants_info_[200].name = "mv2_features_8_conv_0_1_running_mean"; + constants_info_[200].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[200].offset = 0; + constants_info_[200].data_size = 1536; + constants_info_[200].from_folded = false; + constants_info_[200].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[200].shape = {384}; + constants_info_[200].stride = {1}; + constants_info_[200].layout = static_cast(cached_torch_layout_strided); + constants_info_[200].original_fqn = "mv2.features.8.conv.0.1.running_mean"; + constants_info_[201].name = "mv2_features_8_conv_0_1_running_var"; + constants_info_[201].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[201].offset = 0; + constants_info_[201].data_size = 1536; + constants_info_[201].from_folded = false; + constants_info_[201].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[201].shape = {384}; + constants_info_[201].stride = {1}; + constants_info_[201].layout = static_cast(cached_torch_layout_strided); + constants_info_[201].original_fqn = "mv2.features.8.conv.0.1.running_var"; + constants_info_[202].name = "mv2_features_8_conv_1_1_running_mean"; + constants_info_[202].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[202].offset = 0; + constants_info_[202].data_size = 1536; + constants_info_[202].from_folded = false; + constants_info_[202].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[202].shape = {384}; + constants_info_[202].stride = {1}; + constants_info_[202].layout = static_cast(cached_torch_layout_strided); + constants_info_[202].original_fqn = "mv2.features.8.conv.1.1.running_mean"; + constants_info_[203].name = "mv2_features_8_conv_1_1_running_var"; + constants_info_[203].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[203].offset = 0; + constants_info_[203].data_size = 1536; + constants_info_[203].from_folded = false; + constants_info_[203].type = 
static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[203].shape = {384}; + constants_info_[203].stride = {1}; + constants_info_[203].layout = static_cast(cached_torch_layout_strided); + constants_info_[203].original_fqn = "mv2.features.8.conv.1.1.running_var"; + constants_info_[204].name = "mv2_features_8_conv_3_running_mean"; + constants_info_[204].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[204].offset = 0; + constants_info_[204].data_size = 256; + constants_info_[204].from_folded = false; + constants_info_[204].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[204].shape = {64}; + constants_info_[204].stride = {1}; + constants_info_[204].layout = static_cast(cached_torch_layout_strided); + constants_info_[204].original_fqn = "mv2.features.8.conv.3.running_mean"; + constants_info_[205].name = "mv2_features_8_conv_3_running_var"; + constants_info_[205].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[205].offset = 0; + constants_info_[205].data_size = 256; + constants_info_[205].from_folded = false; + constants_info_[205].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[205].shape = {64}; + constants_info_[205].stride = {1}; + constants_info_[205].layout = static_cast(cached_torch_layout_strided); + constants_info_[205].original_fqn = "mv2.features.8.conv.3.running_var"; + constants_info_[206].name = "mv2_features_9_conv_0_1_running_mean"; + constants_info_[206].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[206].offset = 0; + constants_info_[206].data_size = 1536; + constants_info_[206].from_folded = false; + constants_info_[206].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[206].shape = {384}; + constants_info_[206].stride = {1}; + constants_info_[206].layout = static_cast(cached_torch_layout_strided); + constants_info_[206].original_fqn = "mv2.features.9.conv.0.1.running_mean"; + constants_info_[207].name = "mv2_features_9_conv_0_1_running_var"; + constants_info_[207].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[207].offset = 0; + constants_info_[207].data_size = 1536; + constants_info_[207].from_folded = false; + constants_info_[207].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[207].shape = {384}; + constants_info_[207].stride = {1}; + constants_info_[207].layout = static_cast(cached_torch_layout_strided); + constants_info_[207].original_fqn = "mv2.features.9.conv.0.1.running_var"; + constants_info_[208].name = "mv2_features_9_conv_1_1_running_mean"; + constants_info_[208].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[208].offset = 0; + constants_info_[208].data_size = 1536; + constants_info_[208].from_folded = false; + constants_info_[208].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[208].shape = {384}; + constants_info_[208].stride = {1}; + constants_info_[208].layout = static_cast(cached_torch_layout_strided); + constants_info_[208].original_fqn = "mv2.features.9.conv.1.1.running_mean"; + constants_info_[209].name = "mv2_features_9_conv_1_1_running_var"; + constants_info_[209].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[209].offset = 0; + constants_info_[209].data_size = 1536; + constants_info_[209].from_folded = false; + constants_info_[209].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[209].shape = {384}; + 
constants_info_[209].stride = {1}; + constants_info_[209].layout = static_cast(cached_torch_layout_strided); + constants_info_[209].original_fqn = "mv2.features.9.conv.1.1.running_var"; + constants_info_[210].name = "mv2_features_9_conv_3_running_mean"; + constants_info_[210].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[210].offset = 0; + constants_info_[210].data_size = 256; + constants_info_[210].from_folded = false; + constants_info_[210].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[210].shape = {64}; + constants_info_[210].stride = {1}; + constants_info_[210].layout = static_cast(cached_torch_layout_strided); + constants_info_[210].original_fqn = "mv2.features.9.conv.3.running_mean"; + constants_info_[211].name = "mv2_features_9_conv_3_running_var"; + constants_info_[211].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[211].offset = 0; + constants_info_[211].data_size = 256; + constants_info_[211].from_folded = false; + constants_info_[211].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[211].shape = {64}; + constants_info_[211].stride = {1}; + constants_info_[211].layout = static_cast(cached_torch_layout_strided); + constants_info_[211].original_fqn = "mv2.features.9.conv.3.running_var"; + constants_info_[212].name = "mv2_features_10_conv_0_1_running_mean"; + constants_info_[212].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[212].offset = 0; + constants_info_[212].data_size = 1536; + constants_info_[212].from_folded = false; + constants_info_[212].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[212].shape = {384}; + constants_info_[212].stride = {1}; + constants_info_[212].layout = static_cast(cached_torch_layout_strided); + constants_info_[212].original_fqn = "mv2.features.10.conv.0.1.running_mean"; + constants_info_[213].name = "mv2_features_10_conv_0_1_running_var"; + constants_info_[213].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[213].offset = 0; + constants_info_[213].data_size = 1536; + constants_info_[213].from_folded = false; + constants_info_[213].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[213].shape = {384}; + constants_info_[213].stride = {1}; + constants_info_[213].layout = static_cast(cached_torch_layout_strided); + constants_info_[213].original_fqn = "mv2.features.10.conv.0.1.running_var"; + constants_info_[214].name = "mv2_features_10_conv_1_1_running_mean"; + constants_info_[214].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[214].offset = 0; + constants_info_[214].data_size = 1536; + constants_info_[214].from_folded = false; + constants_info_[214].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[214].shape = {384}; + constants_info_[214].stride = {1}; + constants_info_[214].layout = static_cast(cached_torch_layout_strided); + constants_info_[214].original_fqn = "mv2.features.10.conv.1.1.running_mean"; + constants_info_[215].name = "mv2_features_10_conv_1_1_running_var"; + constants_info_[215].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[215].offset = 0; + constants_info_[215].data_size = 1536; + constants_info_[215].from_folded = false; + constants_info_[215].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[215].shape = {384}; + constants_info_[215].stride = {1}; + constants_info_[215].layout = static_cast(cached_torch_layout_strided); + 
constants_info_[215].original_fqn = "mv2.features.10.conv.1.1.running_var"; + constants_info_[216].name = "mv2_features_10_conv_3_running_mean"; + constants_info_[216].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[216].offset = 0; + constants_info_[216].data_size = 256; + constants_info_[216].from_folded = false; + constants_info_[216].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[216].shape = {64}; + constants_info_[216].stride = {1}; + constants_info_[216].layout = static_cast(cached_torch_layout_strided); + constants_info_[216].original_fqn = "mv2.features.10.conv.3.running_mean"; + constants_info_[217].name = "mv2_features_10_conv_3_running_var"; + constants_info_[217].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[217].offset = 0; + constants_info_[217].data_size = 256; + constants_info_[217].from_folded = false; + constants_info_[217].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[217].shape = {64}; + constants_info_[217].stride = {1}; + constants_info_[217].layout = static_cast(cached_torch_layout_strided); + constants_info_[217].original_fqn = "mv2.features.10.conv.3.running_var"; + constants_info_[218].name = "mv2_features_11_conv_0_1_running_mean"; + constants_info_[218].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[218].offset = 0; + constants_info_[218].data_size = 1536; + constants_info_[218].from_folded = false; + constants_info_[218].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[218].shape = {384}; + constants_info_[218].stride = {1}; + constants_info_[218].layout = static_cast(cached_torch_layout_strided); + constants_info_[218].original_fqn = "mv2.features.11.conv.0.1.running_mean"; + constants_info_[219].name = "mv2_features_11_conv_0_1_running_var"; + constants_info_[219].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[219].offset = 0; + constants_info_[219].data_size = 1536; + constants_info_[219].from_folded = false; + constants_info_[219].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[219].shape = {384}; + constants_info_[219].stride = {1}; + constants_info_[219].layout = static_cast(cached_torch_layout_strided); + constants_info_[219].original_fqn = "mv2.features.11.conv.0.1.running_var"; + constants_info_[220].name = "mv2_features_11_conv_1_1_running_mean"; + constants_info_[220].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[220].offset = 0; + constants_info_[220].data_size = 1536; + constants_info_[220].from_folded = false; + constants_info_[220].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[220].shape = {384}; + constants_info_[220].stride = {1}; + constants_info_[220].layout = static_cast(cached_torch_layout_strided); + constants_info_[220].original_fqn = "mv2.features.11.conv.1.1.running_mean"; + constants_info_[221].name = "mv2_features_11_conv_1_1_running_var"; + constants_info_[221].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[221].offset = 0; + constants_info_[221].data_size = 1536; + constants_info_[221].from_folded = false; + constants_info_[221].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[221].shape = {384}; + constants_info_[221].stride = {1}; + constants_info_[221].layout = static_cast(cached_torch_layout_strided); + constants_info_[221].original_fqn = "mv2.features.11.conv.1.1.running_var"; + constants_info_[222].name = 
"mv2_features_11_conv_3_running_mean"; + constants_info_[222].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[222].offset = 0; + constants_info_[222].data_size = 384; + constants_info_[222].from_folded = false; + constants_info_[222].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[222].shape = {96}; + constants_info_[222].stride = {1}; + constants_info_[222].layout = static_cast(cached_torch_layout_strided); + constants_info_[222].original_fqn = "mv2.features.11.conv.3.running_mean"; + constants_info_[223].name = "mv2_features_11_conv_3_running_var"; + constants_info_[223].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[223].offset = 0; + constants_info_[223].data_size = 384; + constants_info_[223].from_folded = false; + constants_info_[223].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[223].shape = {96}; + constants_info_[223].stride = {1}; + constants_info_[223].layout = static_cast(cached_torch_layout_strided); + constants_info_[223].original_fqn = "mv2.features.11.conv.3.running_var"; + constants_info_[224].name = "mv2_features_12_conv_0_1_running_mean"; + constants_info_[224].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[224].offset = 0; + constants_info_[224].data_size = 2304; + constants_info_[224].from_folded = false; + constants_info_[224].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[224].shape = {576}; + constants_info_[224].stride = {1}; + constants_info_[224].layout = static_cast(cached_torch_layout_strided); + constants_info_[224].original_fqn = "mv2.features.12.conv.0.1.running_mean"; + constants_info_[225].name = "mv2_features_12_conv_0_1_running_var"; + constants_info_[225].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[225].offset = 0; + constants_info_[225].data_size = 2304; + constants_info_[225].from_folded = false; + constants_info_[225].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[225].shape = {576}; + constants_info_[225].stride = {1}; + constants_info_[225].layout = static_cast(cached_torch_layout_strided); + constants_info_[225].original_fqn = "mv2.features.12.conv.0.1.running_var"; + constants_info_[226].name = "mv2_features_12_conv_1_1_running_mean"; + constants_info_[226].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[226].offset = 0; + constants_info_[226].data_size = 2304; + constants_info_[226].from_folded = false; + constants_info_[226].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[226].shape = {576}; + constants_info_[226].stride = {1}; + constants_info_[226].layout = static_cast(cached_torch_layout_strided); + constants_info_[226].original_fqn = "mv2.features.12.conv.1.1.running_mean"; + constants_info_[227].name = "mv2_features_12_conv_1_1_running_var"; + constants_info_[227].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[227].offset = 0; + constants_info_[227].data_size = 2304; + constants_info_[227].from_folded = false; + constants_info_[227].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[227].shape = {576}; + constants_info_[227].stride = {1}; + constants_info_[227].layout = static_cast(cached_torch_layout_strided); + constants_info_[227].original_fqn = "mv2.features.12.conv.1.1.running_var"; + constants_info_[228].name = "mv2_features_12_conv_3_running_mean"; + constants_info_[228].dtype = static_cast(cached_torch_dtype_float32); 
+ constants_info_[228].offset = 0; + constants_info_[228].data_size = 384; + constants_info_[228].from_folded = false; + constants_info_[228].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[228].shape = {96}; + constants_info_[228].stride = {1}; + constants_info_[228].layout = static_cast(cached_torch_layout_strided); + constants_info_[228].original_fqn = "mv2.features.12.conv.3.running_mean"; + constants_info_[229].name = "mv2_features_12_conv_3_running_var"; + constants_info_[229].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[229].offset = 0; + constants_info_[229].data_size = 384; + constants_info_[229].from_folded = false; + constants_info_[229].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[229].shape = {96}; + constants_info_[229].stride = {1}; + constants_info_[229].layout = static_cast(cached_torch_layout_strided); + constants_info_[229].original_fqn = "mv2.features.12.conv.3.running_var"; + constants_info_[230].name = "mv2_features_13_conv_0_1_running_mean"; + constants_info_[230].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[230].offset = 0; + constants_info_[230].data_size = 2304; + constants_info_[230].from_folded = false; + constants_info_[230].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[230].shape = {576}; + constants_info_[230].stride = {1}; + constants_info_[230].layout = static_cast(cached_torch_layout_strided); + constants_info_[230].original_fqn = "mv2.features.13.conv.0.1.running_mean"; + constants_info_[231].name = "mv2_features_13_conv_0_1_running_var"; + constants_info_[231].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[231].offset = 0; + constants_info_[231].data_size = 2304; + constants_info_[231].from_folded = false; + constants_info_[231].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[231].shape = {576}; + constants_info_[231].stride = {1}; + constants_info_[231].layout = static_cast(cached_torch_layout_strided); + constants_info_[231].original_fqn = "mv2.features.13.conv.0.1.running_var"; + constants_info_[232].name = "mv2_features_13_conv_1_1_running_mean"; + constants_info_[232].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[232].offset = 0; + constants_info_[232].data_size = 2304; + constants_info_[232].from_folded = false; + constants_info_[232].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[232].shape = {576}; + constants_info_[232].stride = {1}; + constants_info_[232].layout = static_cast(cached_torch_layout_strided); + constants_info_[232].original_fqn = "mv2.features.13.conv.1.1.running_mean"; + constants_info_[233].name = "mv2_features_13_conv_1_1_running_var"; + constants_info_[233].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[233].offset = 0; + constants_info_[233].data_size = 2304; + constants_info_[233].from_folded = false; + constants_info_[233].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[233].shape = {576}; + constants_info_[233].stride = {1}; + constants_info_[233].layout = static_cast(cached_torch_layout_strided); + constants_info_[233].original_fqn = "mv2.features.13.conv.1.1.running_var"; + constants_info_[234].name = "mv2_features_13_conv_3_running_mean"; + constants_info_[234].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[234].offset = 0; + constants_info_[234].data_size = 384; + constants_info_[234].from_folded 
= false; + constants_info_[234].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[234].shape = {96}; + constants_info_[234].stride = {1}; + constants_info_[234].layout = static_cast(cached_torch_layout_strided); + constants_info_[234].original_fqn = "mv2.features.13.conv.3.running_mean"; + constants_info_[235].name = "mv2_features_13_conv_3_running_var"; + constants_info_[235].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[235].offset = 0; + constants_info_[235].data_size = 384; + constants_info_[235].from_folded = false; + constants_info_[235].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[235].shape = {96}; + constants_info_[235].stride = {1}; + constants_info_[235].layout = static_cast(cached_torch_layout_strided); + constants_info_[235].original_fqn = "mv2.features.13.conv.3.running_var"; + constants_info_[236].name = "mv2_features_14_conv_0_1_running_mean"; + constants_info_[236].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[236].offset = 0; + constants_info_[236].data_size = 2304; + constants_info_[236].from_folded = false; + constants_info_[236].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[236].shape = {576}; + constants_info_[236].stride = {1}; + constants_info_[236].layout = static_cast(cached_torch_layout_strided); + constants_info_[236].original_fqn = "mv2.features.14.conv.0.1.running_mean"; + constants_info_[237].name = "mv2_features_14_conv_0_1_running_var"; + constants_info_[237].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[237].offset = 0; + constants_info_[237].data_size = 2304; + constants_info_[237].from_folded = false; + constants_info_[237].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[237].shape = {576}; + constants_info_[237].stride = {1}; + constants_info_[237].layout = static_cast(cached_torch_layout_strided); + constants_info_[237].original_fqn = "mv2.features.14.conv.0.1.running_var"; + constants_info_[238].name = "mv2_features_14_conv_1_1_running_mean"; + constants_info_[238].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[238].offset = 0; + constants_info_[238].data_size = 2304; + constants_info_[238].from_folded = false; + constants_info_[238].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[238].shape = {576}; + constants_info_[238].stride = {1}; + constants_info_[238].layout = static_cast(cached_torch_layout_strided); + constants_info_[238].original_fqn = "mv2.features.14.conv.1.1.running_mean"; + constants_info_[239].name = "mv2_features_14_conv_1_1_running_var"; + constants_info_[239].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[239].offset = 0; + constants_info_[239].data_size = 2304; + constants_info_[239].from_folded = false; + constants_info_[239].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[239].shape = {576}; + constants_info_[239].stride = {1}; + constants_info_[239].layout = static_cast(cached_torch_layout_strided); + constants_info_[239].original_fqn = "mv2.features.14.conv.1.1.running_var"; + constants_info_[240].name = "mv2_features_14_conv_3_running_mean"; + constants_info_[240].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[240].offset = 0; + constants_info_[240].data_size = 640; + constants_info_[240].from_folded = false; + constants_info_[240].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + 
constants_info_[240].shape = {160}; + constants_info_[240].stride = {1}; + constants_info_[240].layout = static_cast(cached_torch_layout_strided); + constants_info_[240].original_fqn = "mv2.features.14.conv.3.running_mean"; + constants_info_[241].name = "mv2_features_14_conv_3_running_var"; + constants_info_[241].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[241].offset = 0; + constants_info_[241].data_size = 640; + constants_info_[241].from_folded = false; + constants_info_[241].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[241].shape = {160}; + constants_info_[241].stride = {1}; + constants_info_[241].layout = static_cast(cached_torch_layout_strided); + constants_info_[241].original_fqn = "mv2.features.14.conv.3.running_var"; + constants_info_[242].name = "mv2_features_15_conv_0_1_running_mean"; + constants_info_[242].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[242].offset = 0; + constants_info_[242].data_size = 3840; + constants_info_[242].from_folded = false; + constants_info_[242].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[242].shape = {960}; + constants_info_[242].stride = {1}; + constants_info_[242].layout = static_cast(cached_torch_layout_strided); + constants_info_[242].original_fqn = "mv2.features.15.conv.0.1.running_mean"; + constants_info_[243].name = "mv2_features_15_conv_0_1_running_var"; + constants_info_[243].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[243].offset = 0; + constants_info_[243].data_size = 3840; + constants_info_[243].from_folded = false; + constants_info_[243].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[243].shape = {960}; + constants_info_[243].stride = {1}; + constants_info_[243].layout = static_cast(cached_torch_layout_strided); + constants_info_[243].original_fqn = "mv2.features.15.conv.0.1.running_var"; + constants_info_[244].name = "mv2_features_15_conv_1_1_running_mean"; + constants_info_[244].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[244].offset = 0; + constants_info_[244].data_size = 3840; + constants_info_[244].from_folded = false; + constants_info_[244].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[244].shape = {960}; + constants_info_[244].stride = {1}; + constants_info_[244].layout = static_cast(cached_torch_layout_strided); + constants_info_[244].original_fqn = "mv2.features.15.conv.1.1.running_mean"; + constants_info_[245].name = "mv2_features_15_conv_1_1_running_var"; + constants_info_[245].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[245].offset = 0; + constants_info_[245].data_size = 3840; + constants_info_[245].from_folded = false; + constants_info_[245].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[245].shape = {960}; + constants_info_[245].stride = {1}; + constants_info_[245].layout = static_cast(cached_torch_layout_strided); + constants_info_[245].original_fqn = "mv2.features.15.conv.1.1.running_var"; + constants_info_[246].name = "mv2_features_15_conv_3_running_mean"; + constants_info_[246].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[246].offset = 0; + constants_info_[246].data_size = 640; + constants_info_[246].from_folded = false; + constants_info_[246].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[246].shape = {160}; + constants_info_[246].stride = {1}; + constants_info_[246].layout = 
static_cast(cached_torch_layout_strided); + constants_info_[246].original_fqn = "mv2.features.15.conv.3.running_mean"; + constants_info_[247].name = "mv2_features_15_conv_3_running_var"; + constants_info_[247].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[247].offset = 0; + constants_info_[247].data_size = 640; + constants_info_[247].from_folded = false; + constants_info_[247].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[247].shape = {160}; + constants_info_[247].stride = {1}; + constants_info_[247].layout = static_cast(cached_torch_layout_strided); + constants_info_[247].original_fqn = "mv2.features.15.conv.3.running_var"; + constants_info_[248].name = "mv2_features_16_conv_0_1_running_mean"; + constants_info_[248].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[248].offset = 0; + constants_info_[248].data_size = 3840; + constants_info_[248].from_folded = false; + constants_info_[248].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[248].shape = {960}; + constants_info_[248].stride = {1}; + constants_info_[248].layout = static_cast(cached_torch_layout_strided); + constants_info_[248].original_fqn = "mv2.features.16.conv.0.1.running_mean"; + constants_info_[249].name = "mv2_features_16_conv_0_1_running_var"; + constants_info_[249].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[249].offset = 0; + constants_info_[249].data_size = 3840; + constants_info_[249].from_folded = false; + constants_info_[249].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[249].shape = {960}; + constants_info_[249].stride = {1}; + constants_info_[249].layout = static_cast(cached_torch_layout_strided); + constants_info_[249].original_fqn = "mv2.features.16.conv.0.1.running_var"; + constants_info_[250].name = "mv2_features_16_conv_1_1_running_mean"; + constants_info_[250].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[250].offset = 0; + constants_info_[250].data_size = 3840; + constants_info_[250].from_folded = false; + constants_info_[250].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[250].shape = {960}; + constants_info_[250].stride = {1}; + constants_info_[250].layout = static_cast(cached_torch_layout_strided); + constants_info_[250].original_fqn = "mv2.features.16.conv.1.1.running_mean"; + constants_info_[251].name = "mv2_features_16_conv_1_1_running_var"; + constants_info_[251].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[251].offset = 0; + constants_info_[251].data_size = 3840; + constants_info_[251].from_folded = false; + constants_info_[251].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[251].shape = {960}; + constants_info_[251].stride = {1}; + constants_info_[251].layout = static_cast(cached_torch_layout_strided); + constants_info_[251].original_fqn = "mv2.features.16.conv.1.1.running_var"; + constants_info_[252].name = "mv2_features_16_conv_3_running_mean"; + constants_info_[252].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[252].offset = 0; + constants_info_[252].data_size = 640; + constants_info_[252].from_folded = false; + constants_info_[252].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[252].shape = {160}; + constants_info_[252].stride = {1}; + constants_info_[252].layout = static_cast(cached_torch_layout_strided); + constants_info_[252].original_fqn = 
"mv2.features.16.conv.3.running_mean"; + constants_info_[253].name = "mv2_features_16_conv_3_running_var"; + constants_info_[253].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[253].offset = 0; + constants_info_[253].data_size = 640; + constants_info_[253].from_folded = false; + constants_info_[253].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[253].shape = {160}; + constants_info_[253].stride = {1}; + constants_info_[253].layout = static_cast(cached_torch_layout_strided); + constants_info_[253].original_fqn = "mv2.features.16.conv.3.running_var"; + constants_info_[254].name = "mv2_features_17_conv_0_1_running_mean"; + constants_info_[254].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[254].offset = 0; + constants_info_[254].data_size = 3840; + constants_info_[254].from_folded = false; + constants_info_[254].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[254].shape = {960}; + constants_info_[254].stride = {1}; + constants_info_[254].layout = static_cast(cached_torch_layout_strided); + constants_info_[254].original_fqn = "mv2.features.17.conv.0.1.running_mean"; + constants_info_[255].name = "mv2_features_17_conv_0_1_running_var"; + constants_info_[255].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[255].offset = 0; + constants_info_[255].data_size = 3840; + constants_info_[255].from_folded = false; + constants_info_[255].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[255].shape = {960}; + constants_info_[255].stride = {1}; + constants_info_[255].layout = static_cast(cached_torch_layout_strided); + constants_info_[255].original_fqn = "mv2.features.17.conv.0.1.running_var"; + constants_info_[256].name = "mv2_features_17_conv_1_1_running_mean"; + constants_info_[256].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[256].offset = 0; + constants_info_[256].data_size = 3840; + constants_info_[256].from_folded = false; + constants_info_[256].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[256].shape = {960}; + constants_info_[256].stride = {1}; + constants_info_[256].layout = static_cast(cached_torch_layout_strided); + constants_info_[256].original_fqn = "mv2.features.17.conv.1.1.running_mean"; + constants_info_[257].name = "mv2_features_17_conv_1_1_running_var"; + constants_info_[257].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[257].offset = 0; + constants_info_[257].data_size = 3840; + constants_info_[257].from_folded = false; + constants_info_[257].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[257].shape = {960}; + constants_info_[257].stride = {1}; + constants_info_[257].layout = static_cast(cached_torch_layout_strided); + constants_info_[257].original_fqn = "mv2.features.17.conv.1.1.running_var"; + constants_info_[258].name = "mv2_features_17_conv_3_running_mean"; + constants_info_[258].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[258].offset = 0; + constants_info_[258].data_size = 1280; + constants_info_[258].from_folded = false; + constants_info_[258].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[258].shape = {320}; + constants_info_[258].stride = {1}; + constants_info_[258].layout = static_cast(cached_torch_layout_strided); + constants_info_[258].original_fqn = "mv2.features.17.conv.3.running_mean"; + constants_info_[259].name = "mv2_features_17_conv_3_running_var"; 
+    constants_info_[259].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[259].offset = 0;
+    constants_info_[259].data_size = 1280;
+    constants_info_[259].from_folded = false;
+    constants_info_[259].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
+    constants_info_[259].shape = {320};
+    constants_info_[259].stride = {1};
+    constants_info_[259].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[259].original_fqn = "mv2.features.17.conv.3.running_var";
+    constants_info_[260].name = "mv2_features_18_1_running_mean";
+    constants_info_[260].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[260].offset = 0;
+    constants_info_[260].data_size = 5120;
+    constants_info_[260].from_folded = false;
+    constants_info_[260].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
+    constants_info_[260].shape = {1280};
+    constants_info_[260].stride = {1};
+    constants_info_[260].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[260].original_fqn = "mv2.features.18.1.running_mean";
+    constants_info_[261].name = "mv2_features_18_1_running_var";
+    constants_info_[261].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[261].offset = 0;
+    constants_info_[261].data_size = 5120;
+    constants_info_[261].from_folded = false;
+    constants_info_[261].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
+    constants_info_[261].shape = {1280};
+    constants_info_[261].stride = {1};
+    constants_info_[261].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[261].original_fqn = "mv2.features.18.1.running_var";
+    update_constants_map(std::move(constants_map));
+    update_constants_array(std::move(constants_array));
+    in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
+    out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])";
+    outputs_info_[0].name = "output0";
+    this->kernels_ = std::make_unique<AOTInductorModelKernels>();
+}
+
+std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
+    DeviceStreamType stream,
+    AOTIProxyExecutorHandle proxy_executor,
+    bool initialization
+) {
+
+    if (!initialization) {
+        std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
+                  << "aot_inductor.use_runtime_constant_folding=False\n";
+    }
+    return {};
+}
+} // namespace torch::aot_inductor
+using namespace torch::aot_inductor;
+
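Reviewer note: the `in_spec_`/`out_spec_` strings above are JSON-serialized pytree treespecs; they tell the AOTI runtime how to flatten the caller's `((x,), {})` arguments into the flat tensor list the compiled graph consumes. A minimal sketch of that correspondence, assuming `torch.utils._pytree`'s serialization helpers (which is what produced these strings):

```python
import torch
from torch.utils import _pytree as pytree

# One positional tensor, no kwargs -- the ((args,), kwargs) convention above.
example = ((torch.randn(1, 3, 224, 224),), {})
flat, spec = pytree.tree_flatten(example)

print(len(flat))                    # 1 flattened input tensor
print(pytree.treespec_dumps(spec))  # JSON shaped like the in_spec_ string
```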
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_0(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_0', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 4, 'x': 65536}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 451584, 'x': 602112}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 3
+        xnumel = 50176
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x1 = xindex
+        y0 = yindex
+        tmp0 = tl.load(in_ptr0 + (x1 + 50176*y0), xmask & ymask, eviction_policy='evict_last')
+        tl.store(out_ptr0 + (y0 + 3*x1), tmp0, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = ((ynumel + (4 - 1)) / (4));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_0 == nullptr) {
+        kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cxzopurug2u2kff3zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin", "triton_poi_fused_convolution_0", 4160, cubin_dir_);
+    }
+    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_2 = ynumel;
+    int var_3 = xnumel;
+    CUdeviceptr global_scratch_4 = 0;
+    void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4};
+    launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4160, kernel_args_, stream_);
+}
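The launch arithmetic in these wrappers is plain ceil-division of the problem size by block sizes frozen into the cubin at compile time (XBLOCK=256 and YBLOCK=4 here are read off the generated grid expressions, matching the `launchKernel(..., 4, 4160, ...)` warp/shared-memory constants). A sketch of the same computation:

```python
# Assumed tile sizes, taken from the generated grid_0/grid_1 expressions above.
def cdiv(a: int, b: int) -> int:
    return (a + b - 1) // b

xnumel, ynumel = 50176, 3            # 224*224 pixels, 3 channels
grid = (cdiv(xnumel, 256), cdiv(ynumel, 4), 1)
print(grid)                           # (196, 1, 1) -> grid_0, grid_1, grid_2
```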
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_1(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_1', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 128, 'x': 16}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6912, 'x': 3456}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 96
+        xnumel = 9
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x2 = xindex
+        y3 = yindex
+        y0 = (yindex % 3)
+        y1 = yindex // 3
+        tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last')
+        tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (16 - 1)) / (16));
+    uint32_t grid_1 = ((ynumel + (64 - 1)) / (64));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_1 == nullptr) {
+        kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cwvumepeeo7fjwjgwncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin", "triton_poi_fused_convolution_1", 4352, cubin_dir_);
+    }
+    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_6 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_7 = ynumel;
+    int var_8 = xnumel;
+    CUdeviceptr global_scratch_9 = 0;
+    void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9};
+    launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_);
+}
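Despite their names, neither `convolution` kernel convolves anything: both are layout shuffles that move channels innermost so the real convolutions can run on channels-last data. The index math above (`out[y0 + 3*x1] = in[x1 + 50176*y0]` for the 1x3x224x224 activation, `out[y0 + 3*x2 + 27*y1] = in[x2 + 9*y3]` for the 32x3x3x3 weight) matches PyTorch's `channels_last` conversion; a sketch of the equivalent transformation:

```python
import torch

x = torch.randn(1, 3, 224, 224)   # activation handled by ..._convolution_0
w = torch.randn(32, 3, 3, 3)      # first conv weight handled by ..._convolution_1

x_cl = x.to(memory_format=torch.channels_last)
w_cl = w.to(memory_format=torch.channels_last)

# Channel stride becomes 1, i.e. channels end up innermost in memory.
print(x_cl.stride())  # (150528, 1, 672, 3)
print(w_cl.stride())  # (27, 1, 9, 3)
```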
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 524288},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 4817408}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 401408
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = tl.full([XBLOCK], True, tl.int1)
+        x2 = xindex
+        x0 = (xindex % 32)
+        tmp0 = tl.load(in_out_ptr0 + (x2), None)
+        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, None)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2 = loadKernel("/home/gasoonjia/executorch/c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2", 0, cubin_dir_);
+    }
+    CUdeviceptr var_10 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_11 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_12 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_13 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_14 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_15 = xnumel;
+    CUdeviceptr global_scratch_16 = 0;
+    void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &var_14, &var_15, &global_scratch_16};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
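The tmp0..tmp19 chain above is inference-mode batch norm followed by ReLU6 (hardtanh clamped to [0, 6]), applied per channel `x0 = xindex % 32`. Judging by the constant FQNs earlier in the file, in_ptr0/in_ptr1 carry running_mean/running_var and in_ptr2/in_ptr3 the affine weight/bias (an inference, not stated in the generated code). Equivalent eager-mode math:

```python
import torch

def bn_hardtanh(x, mean, var, gamma, beta, eps=1e-5):
    y = (x - mean) / torch.sqrt(var + eps) * gamma + beta  # tmp2 .. tmp15
    return torch.clamp(y, 0.0, 6.0)                        # tmp16 .. tmp19

C = 32
bn = torch.nn.BatchNorm2d(C).eval()
x = torch.randn(1, C, 112, 112)
view = lambda t: t.view(1, C, 1, 1)
out = bn_hardtanh(x, view(bn.running_mean), view(bn.running_var),
                  view(bn.weight), view(bn.bias))
ref = torch.nn.functional.hardtanh(bn(x), 0.0, 6.0)
print(torch.allclose(out, ref, atol=1e-6))  # True
```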
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_3(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_3', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 262144},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_3', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 2408704}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_3(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 200704
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = tl.full([XBLOCK], True, tl.int1)
+        x2 = xindex
+        x0 = (xindex % 16)
+        tmp0 = tl.load(in_out_ptr0 + (x2), None)
+        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, None)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3 = loadKernel("/home/gasoonjia/executorch/cgpouheql4rpwtcaretoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin",
"triton_poi_fused__native_batch_norm_legit_no_training_3", 0, cubin_dir_); + } + CUdeviceptr var_17 = reinterpret_cast(in_out_ptr0.data_ptr()); + CUdeviceptr var_18 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_19 = reinterpret_cast(in_ptr1.data_ptr()); + CUdeviceptr var_20 = reinterpret_cast(in_ptr2.data_ptr()); + CUdeviceptr var_21 = reinterpret_cast(in_ptr3.data_ptr()); + int var_22 = xnumel; + CUdeviceptr global_scratch_23 = 0; + void* kernel_args_[] = {&var_17, &var_18, &var_19, &var_20, &var_21, &var_22, &global_scratch_23}; + launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4( + const in_out_ptr0_type_& in_out_ptr0, + const in_ptr0_type_& in_ptr0, + const in_ptr1_type_& in_ptr1, + const in_ptr2_type_& in_ptr2, + const in_ptr3_type_& in_ptr3, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'x': 2097152}, + filename=__file__, + triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 14452224}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): + xnumel = 1204224 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x2 = xindex + x0 = (xindex % 96) + tmp0 = tl.load(in_out_ptr0 + (x2), None) + tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last') + tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last') + tmp12 = tl.load(in_ptr2 + (x0), None, 
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 2097152},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 14452224}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 1204224
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = tl.full([XBLOCK], True, tl.int1)
+        x2 = xindex
+        x0 = (xindex % 96)
+        tmp0 = tl.load(in_out_ptr0 + (x2), None)
+        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, None)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4 = loadKernel("/home/gasoonjia/executorch/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4", 0, cubin_dir_);
+    }
+    CUdeviceptr var_24 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_25 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_26 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_27 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_28 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_29 = xnumel;
+    CUdeviceptr global_scratch_30 = 0;
+    void* kernel_args_[] = {&var_24, &var_25, &var_26, &var_27, &var_28, &var_29, &global_scratch_30};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
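Note that `..._hardtanh_2` and `..._hardtanh_4` drop the bounds check entirely (`xmask = tl.full([XBLOCK], True, tl.int1)`), while `..._hardtanh_5` below keeps a real `xmask`. The pattern is consistent with Inductor eliding the mask only when it can prove xnumel divisible by every block size the autotuner might pick; the divisibility it appears to rely on here (an inference from the emitted kernels, not a documented contract):

```python
# (xnumel, kernel keeps a bounds mask?) read off the generated kernels above/below.
for xnumel, masked in [(401408, False), (1204224, False), (301056, True)]:
    print(xnumel, "divisible by 8192:", xnumel % 8192 == 0,
          "-> mask" if masked else "-> no mask")
```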
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 524288},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 3614208}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 301056
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 96)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5 = loadKernel("/home/gasoonjia/executorch/c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5", 0, cubin_dir_);
+    }
+    CUdeviceptr var_31 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_32 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_33 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_34 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_35 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_36 = xnumel;
+    CUdeviceptr global_scratch_37 = 0;
+    void* kernel_args_[] = {&var_31, &var_32, &var_33, &var_34, &var_35, &var_36, &global_scratch_37};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
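Each hard-coded `xnumel` is simply N*C*H*W of the activation being normalized, so the kernel sequence doubles as a trace of MobileNetV2's feature-map shapes (shapes below assume the canonical 1x3x224x224 input):

```python
shapes = {
    "hardtanh_4":    (1, 96, 112, 112),  # xnumel = 1204224 (expansion, before stride-2)
    "hardtanh_5":    (1, 96, 56, 56),    # xnumel = 301056  (after depthwise stride 2)
    "no_training_6": (1, 24, 56, 56),    # xnumel = 75264   (linear projection output)
    "hardtanh_7":    (1, 144, 56, 56),   # xnumel = 451584  (next block's expansion)
}
for name, (n, c, h, w) in shapes.items():
    print(name, n * c * h * w)
```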
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_6(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_6', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_6', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 903552}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_6(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 75264
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 24)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6 = loadKernel("/home/gasoonjia/executorch/ckneyyhrfy6dkwkb6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_6", 0, cubin_dir_);
+    }
+    CUdeviceptr var_38 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_39 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_40 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_41 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_42 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_43 = xnumel;
+    CUdeviceptr global_scratch_44 = 0;
+    void* kernel_args_[] = {&var_38, &var_39, &var_40, &var_41, &var_42, &var_43, &global_scratch_44};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
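`'mutated_arg_names': ['in_out_ptr0']` in all of these batch-norm kernels means the convolution's output buffer is renormalized in place rather than copied into a fresh allocation. Roughly the difference between the two variants below (illustrative only):

```python
import torch

x = torch.randn(1, 24, 56, 56)
mean, var = torch.zeros(24), torch.ones(24)
v = lambda t: t.view(1, -1, 1, 1)

y = (x - v(mean)) / torch.sqrt(v(var) + 1e-5)    # out-of-place: fresh output buffer
x.sub_(v(mean)).div_(torch.sqrt(v(var) + 1e-5))  # in place, like in_out_ptr0
print(torch.allclose(x, y))                      # True
```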
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 524288},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 5421312}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 451584
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 144)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7 = loadKernel("/home/gasoonjia/executorch/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7", 0, cubin_dir_);
+    }
+    CUdeviceptr var_45 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_46 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_47 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_48 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_49 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_50 = xnumel;
+    CUdeviceptr global_scratch_51 = 0;
+    void* kernel_args_[] = {&var_45, &var_46, &var_47, &var_48, &var_49, &var_50, &global_scratch_51};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_8(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    const in_ptr4_type_& in_ptr4,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_8', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_8', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1204608}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_add_8(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 75264
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 24)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x2), xmask)
+        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tmp1 - tmp2
+        tmp5 = 1e-05
+        tmp6 = tmp4 + tmp5
+        tmp7 = libdevice.sqrt(tmp6)
+        tmp8 = tl.full([1], 1, tl.int32)
+        tmp9 = (tmp8 / tmp7)
+        tmp10 = 1.0
+        tmp11 = tmp9 * tmp10
+        tmp12 = tmp3 * tmp11
+        tmp14 = tmp12 * tmp13
+        tmp16 = tmp14 + tmp15
+        tmp17 = tmp0 + tmp16
+        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8 = loadKernel("/home/gasoonjia/executorch/cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_8", 0, cubin_dir_);
+    }
+    CUdeviceptr var_52 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_53 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_54 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_55 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_56 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    CUdeviceptr var_57 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+    int var_58 = xnumel;
+    CUdeviceptr global_scratch_59 = 0;
+    void* kernel_args_[] = {&var_52, &var_53, &var_54, &var_55, &var_56, &var_57, &var_58, &global_scratch_59};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1357056}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 112896
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 144)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9 = loadKernel("/home/gasoonjia/executorch/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9", 0, cubin_dir_);
+    }
+    CUdeviceptr var_60 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_61 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_62 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_63 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_64 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_65 = xnumel;
+    CUdeviceptr global_scratch_66 = 0;
+    void* kernel_args_[] = {&var_60, &var_61, &var_62, &var_63, &var_64, &var_65, &global_scratch_66};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_10(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_10', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 32768},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 301568}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 25088
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 32)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 = loadKernel("/home/gasoonjia/executorch/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_10", 0, cubin_dir_);
+    }
+    CUdeviceptr var_67 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_68 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_69 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_70 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_71 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_72 = xnumel;
+    CUdeviceptr global_scratch_73 = 0;
+    void* kernel_args_[] = {&var_67, &var_68, &var_69, &var_70, &var_71, &var_72, &global_scratch_73};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 262144},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1809408}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 150528
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 192)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
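+    // The Triton source above implements inference-mode batch norm,
+    // y = (x - mean) * 1/sqrt(var + 1e-05) * gamma + beta; the *_hardtanh_*
+    // variants additionally clamp the result to [0.0, 6.0] (ReLU6), and the
+    // *_add_* variants fuse a residual add instead. The kernels differ only
+    // in xnumel and the channel modulus (here 192).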
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11 = loadKernel("/home/gasoonjia/executorch/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11", 0, cubin_dir_);
+    }
+    CUdeviceptr var_74 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_75 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_76 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_77 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_78 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_79 = xnumel;
+    CUdeviceptr global_scratch_80 = 0;
+    void* kernel_args_[] = {&var_74, &var_75, &var_76, &var_77, &var_78, &var_79, &global_scratch_80};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    const in_ptr4_type_& in_ptr4,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_12', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 32768},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_12', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 401920}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_add_12(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 25088
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 32)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x2), xmask)
+        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tmp1 - tmp2
+        tmp5 = 1e-05
+        tmp6 = tmp4 + tmp5
+        tmp7 = libdevice.sqrt(tmp6)
+        tmp8 = tl.full([1], 1, tl.int32)
+        tmp9 = (tmp8 / tmp7)
+        tmp10 = 1.0
+        tmp11 = tmp9 * tmp10
+        tmp12 = tmp3 * tmp11
+        tmp14 = tmp12 * tmp13
+        tmp16 = tmp14 + tmp15
+        tmp17 = tmp0 + tmp16
+        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12 = loadKernel("/home/gasoonjia/executorch/c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_12", 0, cubin_dir_);
+    }
+    CUdeviceptr var_81 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_82 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_83 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_84 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_85 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    CUdeviceptr var_86 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+    int var_87 = xnumel;
+    CUdeviceptr global_scratch_88 = 0;
+    void* kernel_args_[] = {&var_81, &var_82, &var_83, &var_84, &var_85, &var_86, &var_87, &global_scratch_88};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 65536},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 454656}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 37632
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 192)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13 = loadKernel("/home/gasoonjia/executorch/cxn357cdpjzfyhgfzkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13", 0, cubin_dir_);
+    }
+    CUdeviceptr var_89 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_90 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_91 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_92 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_93 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_94 = xnumel;
+    CUdeviceptr global_scratch_95 = 0;
+    void* kernel_args_[] = {&var_89, &var_90, &var_91, &var_92, &var_93, &var_94, &global_scratch_95};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_14(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_14', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 16384},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_14', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 151552}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_14(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 12544
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 64)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14 = loadKernel("/home/gasoonjia/executorch/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_14", 0, cubin_dir_);
+    }
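+    // Kernel handles start out nullptr and are filled in on first call. The
+    // absolute /home/gasoonjia/executorch/*.cubin paths were recorded at
+    // export time (store_cubin=True above); cubin_dir_ presumably lets a
+    // caller redirect the lookup when the artifacts are relocated.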
+    CUdeviceptr var_96 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_97 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_98 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_99 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_100 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_101 = xnumel;
+    CUdeviceptr global_scratch_102 = 0;
+    void* kernel_args_[] = {&var_96, &var_97, &var_98, &var_99, &var_100, &var_101, &global_scratch_102};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 909312}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 75264
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 384)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15 = loadKernel("/home/gasoonjia/executorch/caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15", 0, cubin_dir_);
+    }
+    CUdeviceptr var_103 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_104 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_105 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_106 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_107 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_108 = xnumel;
+    CUdeviceptr global_scratch_109 = 0;
+    void* kernel_args_[] = {&var_103, &var_104, &var_105, &var_106, &var_107, &var_108, &global_scratch_109};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    const in_ptr4_type_& in_ptr4,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_16', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 16384},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_16', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 201728}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_add_16(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 12544
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 64)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x2), xmask)
+        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tmp1 - tmp2
+        tmp5 = 1e-05
+        tmp6 = tmp4 + tmp5
+        tmp7 = libdevice.sqrt(tmp6)
+        tmp8 = tl.full([1], 1, tl.int32)
+        tmp9 = (tmp8 / tmp7)
+        tmp10 = 1.0
+        tmp11 = tmp9 * tmp10
+        tmp12 = tmp3 * tmp11
+        tmp14 = tmp12 * tmp13
+        tmp16 = tmp14 + tmp15
+        tmp17 = tmp0 + tmp16
+        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16 = loadKernel("/home/gasoonjia/executorch/cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_16", 0, cubin_dir_);
+    }
+    CUdeviceptr var_110 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_111 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_112 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_113 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_114 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    CUdeviceptr var_115 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+    int var_116 = xnumel;
+    CUdeviceptr global_scratch_117 = 0;
+    void* kernel_args_[] = {&var_110, &var_111, &var_112, &var_113, &var_114, &var_115, &var_116, &global_scratch_117};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_17(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_17', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 32768},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_17', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 227328}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_17(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 18816
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 96)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17 = loadKernel("/home/gasoonjia/executorch/ctc4njxfwewhkkjkreaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_17", 0, cubin_dir_);
+    }
+    CUdeviceptr var_118 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_119 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_120 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_121 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_122 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_123 = xnumel;
+    CUdeviceptr global_scratch_124 = 0;
+    void* kernel_args_[] = {&var_118, &var_119, &var_120, &var_121, &var_122, &var_123, &global_scratch_124};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1363968}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 112896
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 576)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18 == nullptr) {
loadKernel("/home/gasoonjia/executorch/cklg2ezqvtkbhlekhvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18", 0, cubin_dir_); + } + CUdeviceptr var_125 = reinterpret_cast(in_out_ptr0.data_ptr()); + CUdeviceptr var_126 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_127 = reinterpret_cast(in_ptr1.data_ptr()); + CUdeviceptr var_128 = reinterpret_cast(in_ptr2.data_ptr()); + CUdeviceptr var_129 = reinterpret_cast(in_ptr3.data_ptr()); + int var_130 = xnumel; + CUdeviceptr global_scratch_131 = 0; + void* kernel_args_[] = {&var_125, &var_126, &var_127, &var_128, &var_129, &var_130, &global_scratch_131}; + launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_19( + const in_out_ptr0_type_& in_out_ptr0, + const in_ptr0_type_& in_ptr0, + const in_ptr1_type_& in_ptr1, + const in_ptr2_type_& in_ptr2, + const in_ptr3_type_& in_ptr3, + const in_ptr4_type_& in_ptr4, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_19', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'x': 32768}, + filename=__file__, + triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_19', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 302592}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused__native_batch_norm_legit_no_training_add_19(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr): + xnumel = 18816 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = xindex < xnumel + x2 = xindex + x0 = (xindex % 96) + tmp0 = tl.load(in_out_ptr0 + (x2), xmask) 
+        tmp1 = tl.load(in_ptr0 + (x2), xmask)
+        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tmp1 - tmp2
+        tmp5 = 1e-05
+        tmp6 = tmp4 + tmp5
+        tmp7 = libdevice.sqrt(tmp6)
+        tmp8 = tl.full([1], 1, tl.int32)
+        tmp9 = (tmp8 / tmp7)
+        tmp10 = 1.0
+        tmp11 = tmp9 * tmp10
+        tmp12 = tmp3 * tmp11
+        tmp14 = tmp12 * tmp13
+        tmp16 = tmp14 + tmp15
+        tmp17 = tmp0 + tmp16
+        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19 = loadKernel("/home/gasoonjia/executorch/c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_19", 0, cubin_dir_);
+    }
+    CUdeviceptr var_132 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_133 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_134 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_135 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_136 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    CUdeviceptr var_137 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+    int var_138 = xnumel;
+    CUdeviceptr global_scratch_139 = 0;
+    void* kernel_args_[] = {&var_132, &var_133, &var_134, &var_135, &var_136, &var_137, &var_138, &global_scratch_139};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 32768},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 28224
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 576)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 - tmp1
+ tmp4 = 1e-05
+ tmp5 = tmp3 + tmp4
+ tmp6 = libdevice.sqrt(tmp5)
+ tmp7 = tl.full([1], 1, tl.int32)
+ tmp8 = (tmp7 / tmp6)
+ tmp9 = 1.0
+ tmp10 = tmp8 * tmp9
+ tmp11 = tmp2 * tmp10
+ tmp13 = tmp11 * tmp12
+ tmp15 = tmp13 + tmp14
+ tmp16 = 0.0
+ tmp17 = triton_helpers.maximum(tmp15, tmp16)
+ tmp18 = 6.0
+ tmp19 = triton_helpers.minimum(tmp17, tmp18)
+ tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20 = loadKernel("/home/gasoonjia/executorch/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20", 0, cubin_dir_);
+ }
+ CUdeviceptr var_140 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_141 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_142 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_143 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_144 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ int var_145 = xnumel;
+ CUdeviceptr global_scratch_146 = 0;
+ void* kernel_args_[] = {&var_140, &var_141, &var_142, &var_143, &var_144, &var_145, &global_scratch_146};
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_21(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_21', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 8192},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_21', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 96640}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_21(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 7840
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 160)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 - tmp1
+ tmp4 = 1e-05
+ tmp5 = tmp3 + tmp4
+ tmp6 = libdevice.sqrt(tmp5)
+ tmp7 = tl.full([1], 1, tl.int32)
+ tmp8 = (tmp7 / tmp6)
+ tmp9 = 1.0
+ tmp10 = tmp8 * tmp9
+ tmp11 = tmp2 * tmp10
+ tmp13 = tmp11 * tmp12
+ tmp15 = tmp13 + tmp14
+ tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21 = loadKernel("/home/gasoonjia/executorch/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_21", 0, cubin_dir_);
+ }
+ CUdeviceptr var_147 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_148 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_149 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_150 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_151 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ int var_152 = xnumel;
+ CUdeviceptr global_scratch_153 = 0;
+ void* kernel_args_[] = {&var_147, &var_148, &var_149, &var_150, &var_151, &var_152, &global_scratch_153};
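+ // Arguments reach the CUDA driver as an array of pointers to each value; the
+ // trailing global-scratch pointer is unused (0) for this kernel. launchKernel
+ // then takes the 1-D grid computed above plus the warp count and dynamic
+ // shared-memory size recorded at compile time (4 warps, 0 bytes here).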
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 65536},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 579840}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 47040
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 960)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 - tmp1
+ tmp4 = 1e-05
+ tmp5 = tmp3 + tmp4
+ tmp6 = libdevice.sqrt(tmp5)
+ tmp7 = tl.full([1], 1, tl.int32)
+ tmp8 = (tmp7 / tmp6)
+ tmp9 = 1.0
+ tmp10 = tmp8 * tmp9
+ tmp11 = tmp2 * tmp10
+ tmp13 = tmp11 * tmp12
+ tmp15 = tmp13 + tmp14
+ tmp16 = 0.0
+ tmp17 = triton_helpers.maximum(tmp15, tmp16)
+ tmp18 = 6.0
+ tmp19 = triton_helpers.minimum(tmp17, tmp18)
+ tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
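+ // Ceil-division grid: one Triton program per XBLOCK-sized slice of the
+ // flattened iteration space (a block size of 512 was baked in for this kernel
+ // at compile time).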
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22 = loadKernel("/home/gasoonjia/executorch/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22", 0, cubin_dir_);
+ }
+ CUdeviceptr var_154 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_155 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_156 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_157 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_158 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ int var_159 = xnumel;
+ CUdeviceptr global_scratch_160 = 0;
+ void* kernel_args_[] = {&var_154, &var_155, &var_156, &var_157, &var_158, &var_159, &global_scratch_160};
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ const in_ptr4_type_& in_ptr4,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_23', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 8192},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_23', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 128000}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_add_23(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 7840
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 160)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x2), xmask)
+ tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tmp1 - tmp2
+ tmp5 = 1e-05
+ tmp6 = tmp4 + tmp5
+ tmp7 = libdevice.sqrt(tmp6)
+ tmp8 = tl.full([1], 1, tl.int32)
+ tmp9 = (tmp8 / tmp7)
+ tmp10 = 1.0
+ tmp11 = tmp9 * tmp10
+ tmp12 = tmp3 * tmp11
+ tmp14 = tmp12 * tmp13
+ tmp16 = tmp14 + tmp15
+ tmp17 = tmp0 + tmp16
+ tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23 = loadKernel("/home/gasoonjia/executorch/c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_23", 0, cubin_dir_);
+ }
+ CUdeviceptr var_161 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_162 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_163 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_164 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_165 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ CUdeviceptr var_166 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+ int var_167 = xnumel;
+ CUdeviceptr global_scratch_168 = 0;
+ void* kernel_args_[] = {&var_161, &var_162, &var_163, &var_164, &var_165, &var_166, &var_167, &global_scratch_168};
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_24(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_24', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 16384},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_24', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 193280}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_24(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 15680
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 320)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 - tmp1
+ tmp4 = 1e-05
+ tmp5 = tmp3 + tmp4
+ tmp6 = libdevice.sqrt(tmp5)
+ tmp7 = tl.full([1], 1, tl.int32)
+ tmp8 = (tmp7 / tmp6)
+ tmp9 = 1.0
+ tmp10 = tmp8 * tmp9
+ tmp11 = tmp2 * tmp10
+ tmp13 = tmp11 * tmp12
+ tmp15 = tmp13 + tmp14
+ tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24 = loadKernel("/home/gasoonjia/executorch/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_24", 0, cubin_dir_);
+ }
+ CUdeviceptr var_169 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_170 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_171 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_172 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_173 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ int var_174 = xnumel;
+ CUdeviceptr global_scratch_175 = 0;
+ void* kernel_args_[] = {&var_169, &var_170, &var_171, &var_172, &var_173, &var_174, &global_scratch_175};
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ const in_ptr4_type_& in_ptr4,
+ int64_t xnumel,
+ int64_t r0_numel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
async_compile.triton('triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.persistent_reduction( + size_hints={'x': 2048, 'r0_': 64}, + reduction_hint=ReductionHint.OUTER, + filename=__file__, + triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 1, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 281600, 'r0_': 0}} + ) + @triton.jit + def triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr): + xnumel = 1280 + r0_numel = 49 + R0_BLOCK: tl.constexpr = 64 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_index = tl.arange(0, R0_BLOCK)[None, :] + r0_offset = 0 + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + x0 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 1280*r0_1), r0_mask & xmask, other=0.0) + tmp1 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') + tmp3 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') + tmp12 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') + tmp14 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last') + tmp2 = tmp0 - tmp1 + tmp4 = 1e-05 + tmp5 = tmp3 + tmp4 + tmp6 = libdevice.sqrt(tmp5) + tmp7 = tl.full([1, 1], 1, tl.int32) + tmp8 = (tmp7 / tmp6) + tmp9 = 1.0 + tmp10 = tmp8 * tmp9 + tmp11 = tmp2 * tmp10 + tmp13 = tmp11 * tmp12 + tmp15 = tmp13 + tmp14 + tmp16 = 0.0 + tmp17 = triton_helpers.maximum(tmp15, tmp16) + tmp18 = 6.0 + tmp19 = triton_helpers.minimum(tmp17, tmp18) + tmp20 = tl.broadcast_to(tmp19, [XBLOCK, R0_BLOCK]) + tmp22 = tl.where(r0_mask & xmask, tmp20, 0) + tmp23 = tl.sum(tmp22, 1)[:, None] + tmp24 = 49.0 + tmp25 = (tmp23 / tmp24) + tl.debug_barrier() + tl.store(in_out_ptr0 + (x0), tmp25, xmask) + 
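+ # In one pass this persistent reduction applies the inference-mode batch-norm
+ # affine transform, clamps to [0, 6] (hardtanh, i.e. ReLU6), and averages over
+ # the 49 (= 7x7) spatial positions of each of the 1280 channels: the model's
+ # global average pool.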
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25 == nullptr) {
+ kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25 = loadKernel("/home/gasoonjia/executorch/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin", "triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25", 1024, cubin_dir_);
+ }
+ CUdeviceptr var_176 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_177 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_178 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_179 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_180 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ CUdeviceptr var_181 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+ int var_182 = xnumel;
+ int var_183 = r0_numel;
+ CUdeviceptr global_scratch_184 = 0;
+ void* kernel_args_[] = {&var_176, &var_177, &var_178, &var_179, &var_180, &var_181, &var_182, &var_183, &global_scratch_184};
+ launchKernel(kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25, grid_0, grid_1, grid_2, 8, 1024, kernel_args_, stream_);
+}
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_permute_copy_26(
+ const in_ptr0_type_& in_ptr0,
+ const out_ptr0_type_& out_ptr0,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused_permute_copy_26', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 2097152},
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_permute_copy_26', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 15360000}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_permute_copy_26(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 1280000
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (x0), xmask)
+ tl.store(out_ptr0 + (x0), tmp0, xmask)
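+ # Plain elementwise copy that materializes a permuted weight into a contiguous
+ # buffer; 1280000 elements = 1000 x 1280, presumably the mv2 classifier weight.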
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused_permute_copy_26 == nullptr) {
+ kernels_.triton_poi_fused_permute_copy_26 = loadKernel("/home/gasoonjia/executorch/czj7vvfy745m4rwqvkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin", "triton_poi_fused_permute_copy_26", 0, cubin_dir_);
+ }
+ CUdeviceptr var_185 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_186 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+ int var_187 = xnumel;
+ CUdeviceptr global_scratch_188 = 0;
+ void* kernel_args_[] = {&var_185, &var_186, &var_187, &global_scratch_188};
+ launchKernel(kernels_.triton_poi_fused_permute_copy_26, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+namespace torch::aot_inductor {
+
+void AOTInductorModel::_const_run_impl(
+ std::vector<AtenTensorHandle>& output_handles,
+ DeviceStreamType stream,
+ AOTIProxyExecutorHandle proxy_executor
+) {}
+
+AOTI_NOINLINE static void check_input_0(
+ AtenTensorHandle* input_handles
+) {
+ ConstantHandle arg262_1 = ConstantHandle(input_handles[0]);
+ int32_t arg262_1_dtype;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg262_1, &arg262_1_dtype));
+
+ int32_t arg262_1_expected_dtype = aoti_torch_dtype_float32();
+ if (arg262_1_expected_dtype != arg262_1_dtype) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dtype, "
+ << "expected: " << arg262_1_expected_dtype << "(at::kFloat), "
+ << "but got: " << arg262_1_dtype << "\n";
+ throw std::runtime_error(ss.str());
+ }
+ auto arg262_1_size = arg262_1.sizes();
+
+ if (1 != arg262_1_size[0]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dim value at 0, "
+ << "expected: 1, " << "but got: " << arg262_1_size[0]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (3 != arg262_1_size[1]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dim value at 1, "
+ << "expected: 3, " << "but got: " << arg262_1_size[1]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (224 != arg262_1_size[2]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dim value at 2, "
+ << "expected: 224, " << "but got: " << arg262_1_size[2]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (224 != arg262_1_size[3]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dim value at 3, "
+ << "expected: 224, " << "but got: " << arg262_1_size[3]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+ auto arg262_1_stride = arg262_1.strides();
+
+ if (150528 != arg262_1_stride[0]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched stride value at 0, "
+ << "expected: 150528, " << "but got: " << arg262_1_stride[0]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (50176 != arg262_1_stride[1]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched stride value at 1, "
+ << "expected: 50176, " << "but got: " << arg262_1_stride[1]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (224 != arg262_1_stride[2]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched stride value at 2, "
+ << "expected: 224, " << "but got: " << arg262_1_stride[2]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (1 != arg262_1_stride[3]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched stride value at 3, "
+ << "expected: 1, " << "but got: " << arg262_1_stride[3]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+ int32_t arg262_1_device_type;
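+ // Device-type check: 1 is the device-type code the generated code expects for
+ // CUDA (cf. the "(cuda)" note in the message below); a CPU tensor fails here
+ // with a clear error instead of crashing in a kernel launch.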
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg262_1, &arg262_1_device_type));
+
+ int32_t arg262_1_expected_device_type = 1;
+ if (arg262_1_expected_device_type != arg262_1_device_type) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched device type, "
+ << "expected: " << arg262_1_expected_device_type << " (cuda), "
+ << "but got: " << arg262_1_device_type << "\n";
+ throw std::runtime_error(ss.str());
+ }
+}
+
+static bool _check_aoti_runtime_check_inputs_env() {
+ const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS");
+ const static bool result = env_var_value != nullptr && env_var_value[0] != '0';
+ return result;
+}
+
+AOTI_NOINLINE static void __check_inputs_outputs(
+ AtenTensorHandle* input_handles,
+ AtenTensorHandle* output_handles) {
+ if (!_check_aoti_runtime_check_inputs_env()){
+ return;
+ }
+ check_input_0(input_handles);
+}
+
+void AOTInductorModel::run_impl(
+ AtenTensorHandle*
+ input_handles, // array of input AtenTensorHandle; handles
+ // are stolen; the array itself is borrowed
+ AtenTensorHandle*
+ output_handles, // array for writing output AtenTensorHandle; handles
+ // will be stolen by the caller; the array itself is
+ // borrowed
+ DeviceStreamType stream,
+ AOTIProxyExecutorHandle proxy_executor
+) {
+ __check_inputs_outputs(input_handles, output_handles);
+
+ auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1);
+ auto arg262_1 = std::move(inputs[0]);
+ [[maybe_unused]] auto& mv2_features_0_0_weight = constants_->at(0);
+ [[maybe_unused]] auto& mv2_features_0_1_weight = constants_->at(1);
+ [[maybe_unused]] auto& mv2_features_0_1_bias = constants_->at(2);
+ [[maybe_unused]] auto& mv2_features_1_conv_0_0_weight = constants_->at(3);
+ [[maybe_unused]] auto& mv2_features_1_conv_0_1_weight = constants_->at(4);
+ [[maybe_unused]] auto& mv2_features_1_conv_0_1_bias = constants_->at(5);
+ [[maybe_unused]] auto& mv2_features_1_conv_1_weight = constants_->at(6);
+ [[maybe_unused]] auto& mv2_features_1_conv_2_weight = constants_->at(7);
+ [[maybe_unused]] auto& mv2_features_1_conv_2_bias = constants_->at(8);
+ [[maybe_unused]] auto& mv2_features_2_conv_0_0_weight = constants_->at(9);
+ [[maybe_unused]] auto& mv2_features_2_conv_0_1_weight = constants_->at(10);
+ [[maybe_unused]] auto& mv2_features_2_conv_0_1_bias = constants_->at(11);
+ [[maybe_unused]] auto& mv2_features_2_conv_1_0_weight = constants_->at(12);
+ [[maybe_unused]] auto& mv2_features_2_conv_1_1_weight = constants_->at(13);
+ [[maybe_unused]] auto& mv2_features_2_conv_1_1_bias = constants_->at(14);
+ [[maybe_unused]] auto& mv2_features_2_conv_2_weight = constants_->at(15);
+ [[maybe_unused]] auto& mv2_features_2_conv_3_weight = constants_->at(16);
+ [[maybe_unused]] auto& mv2_features_2_conv_3_bias = constants_->at(17);
+ [[maybe_unused]] auto& mv2_features_3_conv_0_0_weight = constants_->at(18);
+ [[maybe_unused]] auto& mv2_features_3_conv_0_1_weight = constants_->at(19);
+ [[maybe_unused]] auto& mv2_features_3_conv_0_1_bias = constants_->at(20);
+ [[maybe_unused]] auto& mv2_features_3_conv_1_0_weight = constants_->at(21);
+ [[maybe_unused]] auto& mv2_features_3_conv_1_1_weight = constants_->at(22);
+ [[maybe_unused]] auto& mv2_features_3_conv_1_1_bias = constants_->at(23);
+ [[maybe_unused]] auto& mv2_features_3_conv_2_weight = constants_->at(24);
+ [[maybe_unused]] auto& mv2_features_3_conv_3_weight = constants_->at(25);
+ [[maybe_unused]] auto& mv2_features_3_conv_3_bias = constants_->at(26);
+ [[maybe_unused]] auto&
mv2_features_4_conv_0_0_weight = constants_->at(27); + [[maybe_unused]] auto& mv2_features_4_conv_0_1_weight = constants_->at(28); + [[maybe_unused]] auto& mv2_features_4_conv_0_1_bias = constants_->at(29); + [[maybe_unused]] auto& mv2_features_4_conv_1_0_weight = constants_->at(30); + [[maybe_unused]] auto& mv2_features_4_conv_1_1_weight = constants_->at(31); + [[maybe_unused]] auto& mv2_features_4_conv_1_1_bias = constants_->at(32); + [[maybe_unused]] auto& mv2_features_4_conv_2_weight = constants_->at(33); + [[maybe_unused]] auto& mv2_features_4_conv_3_weight = constants_->at(34); + [[maybe_unused]] auto& mv2_features_4_conv_3_bias = constants_->at(35); + [[maybe_unused]] auto& mv2_features_5_conv_0_0_weight = constants_->at(36); + [[maybe_unused]] auto& mv2_features_5_conv_0_1_weight = constants_->at(37); + [[maybe_unused]] auto& mv2_features_5_conv_0_1_bias = constants_->at(38); + [[maybe_unused]] auto& mv2_features_5_conv_1_0_weight = constants_->at(39); + [[maybe_unused]] auto& mv2_features_5_conv_1_1_weight = constants_->at(40); + [[maybe_unused]] auto& mv2_features_5_conv_1_1_bias = constants_->at(41); + [[maybe_unused]] auto& mv2_features_5_conv_2_weight = constants_->at(42); + [[maybe_unused]] auto& mv2_features_5_conv_3_weight = constants_->at(43); + [[maybe_unused]] auto& mv2_features_5_conv_3_bias = constants_->at(44); + [[maybe_unused]] auto& mv2_features_6_conv_0_0_weight = constants_->at(45); + [[maybe_unused]] auto& mv2_features_6_conv_0_1_weight = constants_->at(46); + [[maybe_unused]] auto& mv2_features_6_conv_0_1_bias = constants_->at(47); + [[maybe_unused]] auto& mv2_features_6_conv_1_0_weight = constants_->at(48); + [[maybe_unused]] auto& mv2_features_6_conv_1_1_weight = constants_->at(49); + [[maybe_unused]] auto& mv2_features_6_conv_1_1_bias = constants_->at(50); + [[maybe_unused]] auto& mv2_features_6_conv_2_weight = constants_->at(51); + [[maybe_unused]] auto& mv2_features_6_conv_3_weight = constants_->at(52); + [[maybe_unused]] auto& mv2_features_6_conv_3_bias = constants_->at(53); + [[maybe_unused]] auto& mv2_features_7_conv_0_0_weight = constants_->at(54); + [[maybe_unused]] auto& mv2_features_7_conv_0_1_weight = constants_->at(55); + [[maybe_unused]] auto& mv2_features_7_conv_0_1_bias = constants_->at(56); + [[maybe_unused]] auto& mv2_features_7_conv_1_0_weight = constants_->at(57); + [[maybe_unused]] auto& mv2_features_7_conv_1_1_weight = constants_->at(58); + [[maybe_unused]] auto& mv2_features_7_conv_1_1_bias = constants_->at(59); + [[maybe_unused]] auto& mv2_features_7_conv_2_weight = constants_->at(60); + [[maybe_unused]] auto& mv2_features_7_conv_3_weight = constants_->at(61); + [[maybe_unused]] auto& mv2_features_7_conv_3_bias = constants_->at(62); + [[maybe_unused]] auto& mv2_features_8_conv_0_0_weight = constants_->at(63); + [[maybe_unused]] auto& mv2_features_8_conv_0_1_weight = constants_->at(64); + [[maybe_unused]] auto& mv2_features_8_conv_0_1_bias = constants_->at(65); + [[maybe_unused]] auto& mv2_features_8_conv_1_0_weight = constants_->at(66); + [[maybe_unused]] auto& mv2_features_8_conv_1_1_weight = constants_->at(67); + [[maybe_unused]] auto& mv2_features_8_conv_1_1_bias = constants_->at(68); + [[maybe_unused]] auto& mv2_features_8_conv_2_weight = constants_->at(69); + [[maybe_unused]] auto& mv2_features_8_conv_3_weight = constants_->at(70); + [[maybe_unused]] auto& mv2_features_8_conv_3_bias = constants_->at(71); + [[maybe_unused]] auto& mv2_features_9_conv_0_0_weight = constants_->at(72); + [[maybe_unused]] auto& 
mv2_features_9_conv_0_1_weight = constants_->at(73); + [[maybe_unused]] auto& mv2_features_9_conv_0_1_bias = constants_->at(74); + [[maybe_unused]] auto& mv2_features_9_conv_1_0_weight = constants_->at(75); + [[maybe_unused]] auto& mv2_features_9_conv_1_1_weight = constants_->at(76); + [[maybe_unused]] auto& mv2_features_9_conv_1_1_bias = constants_->at(77); + [[maybe_unused]] auto& mv2_features_9_conv_2_weight = constants_->at(78); + [[maybe_unused]] auto& mv2_features_9_conv_3_weight = constants_->at(79); + [[maybe_unused]] auto& mv2_features_9_conv_3_bias = constants_->at(80); + [[maybe_unused]] auto& mv2_features_10_conv_0_0_weight = constants_->at(81); + [[maybe_unused]] auto& mv2_features_10_conv_0_1_weight = constants_->at(82); + [[maybe_unused]] auto& mv2_features_10_conv_0_1_bias = constants_->at(83); + [[maybe_unused]] auto& mv2_features_10_conv_1_0_weight = constants_->at(84); + [[maybe_unused]] auto& mv2_features_10_conv_1_1_weight = constants_->at(85); + [[maybe_unused]] auto& mv2_features_10_conv_1_1_bias = constants_->at(86); + [[maybe_unused]] auto& mv2_features_10_conv_2_weight = constants_->at(87); + [[maybe_unused]] auto& mv2_features_10_conv_3_weight = constants_->at(88); + [[maybe_unused]] auto& mv2_features_10_conv_3_bias = constants_->at(89); + [[maybe_unused]] auto& mv2_features_11_conv_0_0_weight = constants_->at(90); + [[maybe_unused]] auto& mv2_features_11_conv_0_1_weight = constants_->at(91); + [[maybe_unused]] auto& mv2_features_11_conv_0_1_bias = constants_->at(92); + [[maybe_unused]] auto& mv2_features_11_conv_1_0_weight = constants_->at(93); + [[maybe_unused]] auto& mv2_features_11_conv_1_1_weight = constants_->at(94); + [[maybe_unused]] auto& mv2_features_11_conv_1_1_bias = constants_->at(95); + [[maybe_unused]] auto& mv2_features_11_conv_2_weight = constants_->at(96); + [[maybe_unused]] auto& mv2_features_11_conv_3_weight = constants_->at(97); + [[maybe_unused]] auto& mv2_features_11_conv_3_bias = constants_->at(98); + [[maybe_unused]] auto& mv2_features_12_conv_0_0_weight = constants_->at(99); + [[maybe_unused]] auto& mv2_features_12_conv_0_1_weight = constants_->at(100); + [[maybe_unused]] auto& mv2_features_12_conv_0_1_bias = constants_->at(101); + [[maybe_unused]] auto& mv2_features_12_conv_1_0_weight = constants_->at(102); + [[maybe_unused]] auto& mv2_features_12_conv_1_1_weight = constants_->at(103); + [[maybe_unused]] auto& mv2_features_12_conv_1_1_bias = constants_->at(104); + [[maybe_unused]] auto& mv2_features_12_conv_2_weight = constants_->at(105); + [[maybe_unused]] auto& mv2_features_12_conv_3_weight = constants_->at(106); + [[maybe_unused]] auto& mv2_features_12_conv_3_bias = constants_->at(107); + [[maybe_unused]] auto& mv2_features_13_conv_0_0_weight = constants_->at(108); + [[maybe_unused]] auto& mv2_features_13_conv_0_1_weight = constants_->at(109); + [[maybe_unused]] auto& mv2_features_13_conv_0_1_bias = constants_->at(110); + [[maybe_unused]] auto& mv2_features_13_conv_1_0_weight = constants_->at(111); + [[maybe_unused]] auto& mv2_features_13_conv_1_1_weight = constants_->at(112); + [[maybe_unused]] auto& mv2_features_13_conv_1_1_bias = constants_->at(113); + [[maybe_unused]] auto& mv2_features_13_conv_2_weight = constants_->at(114); + [[maybe_unused]] auto& mv2_features_13_conv_3_weight = constants_->at(115); + [[maybe_unused]] auto& mv2_features_13_conv_3_bias = constants_->at(116); + [[maybe_unused]] auto& mv2_features_14_conv_0_0_weight = constants_->at(117); + [[maybe_unused]] auto& mv2_features_14_conv_0_1_weight = 
constants_->at(118); + [[maybe_unused]] auto& mv2_features_14_conv_0_1_bias = constants_->at(119); + [[maybe_unused]] auto& mv2_features_14_conv_1_0_weight = constants_->at(120); + [[maybe_unused]] auto& mv2_features_14_conv_1_1_weight = constants_->at(121); + [[maybe_unused]] auto& mv2_features_14_conv_1_1_bias = constants_->at(122); + [[maybe_unused]] auto& mv2_features_14_conv_2_weight = constants_->at(123); + [[maybe_unused]] auto& mv2_features_14_conv_3_weight = constants_->at(124); + [[maybe_unused]] auto& mv2_features_14_conv_3_bias = constants_->at(125); + [[maybe_unused]] auto& mv2_features_15_conv_0_0_weight = constants_->at(126); + [[maybe_unused]] auto& mv2_features_15_conv_0_1_weight = constants_->at(127); + [[maybe_unused]] auto& mv2_features_15_conv_0_1_bias = constants_->at(128); + [[maybe_unused]] auto& mv2_features_15_conv_1_0_weight = constants_->at(129); + [[maybe_unused]] auto& mv2_features_15_conv_1_1_weight = constants_->at(130); + [[maybe_unused]] auto& mv2_features_15_conv_1_1_bias = constants_->at(131); + [[maybe_unused]] auto& mv2_features_15_conv_2_weight = constants_->at(132); + [[maybe_unused]] auto& mv2_features_15_conv_3_weight = constants_->at(133); + [[maybe_unused]] auto& mv2_features_15_conv_3_bias = constants_->at(134); + [[maybe_unused]] auto& mv2_features_16_conv_0_0_weight = constants_->at(135); + [[maybe_unused]] auto& mv2_features_16_conv_0_1_weight = constants_->at(136); + [[maybe_unused]] auto& mv2_features_16_conv_0_1_bias = constants_->at(137); + [[maybe_unused]] auto& mv2_features_16_conv_1_0_weight = constants_->at(138); + [[maybe_unused]] auto& mv2_features_16_conv_1_1_weight = constants_->at(139); + [[maybe_unused]] auto& mv2_features_16_conv_1_1_bias = constants_->at(140); + [[maybe_unused]] auto& mv2_features_16_conv_2_weight = constants_->at(141); + [[maybe_unused]] auto& mv2_features_16_conv_3_weight = constants_->at(142); + [[maybe_unused]] auto& mv2_features_16_conv_3_bias = constants_->at(143); + [[maybe_unused]] auto& mv2_features_17_conv_0_0_weight = constants_->at(144); + [[maybe_unused]] auto& mv2_features_17_conv_0_1_weight = constants_->at(145); + [[maybe_unused]] auto& mv2_features_17_conv_0_1_bias = constants_->at(146); + [[maybe_unused]] auto& mv2_features_17_conv_1_0_weight = constants_->at(147); + [[maybe_unused]] auto& mv2_features_17_conv_1_1_weight = constants_->at(148); + [[maybe_unused]] auto& mv2_features_17_conv_1_1_bias = constants_->at(149); + [[maybe_unused]] auto& mv2_features_17_conv_2_weight = constants_->at(150); + [[maybe_unused]] auto& mv2_features_17_conv_3_weight = constants_->at(151); + [[maybe_unused]] auto& mv2_features_17_conv_3_bias = constants_->at(152); + [[maybe_unused]] auto& mv2_features_18_0_weight = constants_->at(153); + [[maybe_unused]] auto& mv2_features_18_1_weight = constants_->at(154); + [[maybe_unused]] auto& mv2_features_18_1_bias = constants_->at(155); + [[maybe_unused]] auto& mv2_classifier_1_weight = constants_->at(156); + [[maybe_unused]] auto& mv2_classifier_1_bias = constants_->at(157); + [[maybe_unused]] auto& mv2_features_0_1_running_mean = constants_->at(158); + [[maybe_unused]] auto& mv2_features_0_1_running_var = constants_->at(159); + [[maybe_unused]] auto& mv2_features_1_conv_0_1_running_mean = constants_->at(160); + [[maybe_unused]] auto& mv2_features_1_conv_0_1_running_var = constants_->at(161); + [[maybe_unused]] auto& mv2_features_1_conv_2_running_mean = constants_->at(162); + [[maybe_unused]] auto& mv2_features_1_conv_2_running_var = constants_->at(163); + 
[[maybe_unused]] auto& mv2_features_2_conv_0_1_running_mean = constants_->at(164); + [[maybe_unused]] auto& mv2_features_2_conv_0_1_running_var = constants_->at(165); + [[maybe_unused]] auto& mv2_features_2_conv_1_1_running_mean = constants_->at(166); + [[maybe_unused]] auto& mv2_features_2_conv_1_1_running_var = constants_->at(167); + [[maybe_unused]] auto& mv2_features_2_conv_3_running_mean = constants_->at(168); + [[maybe_unused]] auto& mv2_features_2_conv_3_running_var = constants_->at(169); + [[maybe_unused]] auto& mv2_features_3_conv_0_1_running_mean = constants_->at(170); + [[maybe_unused]] auto& mv2_features_3_conv_0_1_running_var = constants_->at(171); + [[maybe_unused]] auto& mv2_features_3_conv_1_1_running_mean = constants_->at(172); + [[maybe_unused]] auto& mv2_features_3_conv_1_1_running_var = constants_->at(173); + [[maybe_unused]] auto& mv2_features_3_conv_3_running_mean = constants_->at(174); + [[maybe_unused]] auto& mv2_features_3_conv_3_running_var = constants_->at(175); + [[maybe_unused]] auto& mv2_features_4_conv_0_1_running_mean = constants_->at(176); + [[maybe_unused]] auto& mv2_features_4_conv_0_1_running_var = constants_->at(177); + [[maybe_unused]] auto& mv2_features_4_conv_1_1_running_mean = constants_->at(178); + [[maybe_unused]] auto& mv2_features_4_conv_1_1_running_var = constants_->at(179); + [[maybe_unused]] auto& mv2_features_4_conv_3_running_mean = constants_->at(180); + [[maybe_unused]] auto& mv2_features_4_conv_3_running_var = constants_->at(181); + [[maybe_unused]] auto& mv2_features_5_conv_0_1_running_mean = constants_->at(182); + [[maybe_unused]] auto& mv2_features_5_conv_0_1_running_var = constants_->at(183); + [[maybe_unused]] auto& mv2_features_5_conv_1_1_running_mean = constants_->at(184); + [[maybe_unused]] auto& mv2_features_5_conv_1_1_running_var = constants_->at(185); + [[maybe_unused]] auto& mv2_features_5_conv_3_running_mean = constants_->at(186); + [[maybe_unused]] auto& mv2_features_5_conv_3_running_var = constants_->at(187); + [[maybe_unused]] auto& mv2_features_6_conv_0_1_running_mean = constants_->at(188); + [[maybe_unused]] auto& mv2_features_6_conv_0_1_running_var = constants_->at(189); + [[maybe_unused]] auto& mv2_features_6_conv_1_1_running_mean = constants_->at(190); + [[maybe_unused]] auto& mv2_features_6_conv_1_1_running_var = constants_->at(191); + [[maybe_unused]] auto& mv2_features_6_conv_3_running_mean = constants_->at(192); + [[maybe_unused]] auto& mv2_features_6_conv_3_running_var = constants_->at(193); + [[maybe_unused]] auto& mv2_features_7_conv_0_1_running_mean = constants_->at(194); + [[maybe_unused]] auto& mv2_features_7_conv_0_1_running_var = constants_->at(195); + [[maybe_unused]] auto& mv2_features_7_conv_1_1_running_mean = constants_->at(196); + [[maybe_unused]] auto& mv2_features_7_conv_1_1_running_var = constants_->at(197); + [[maybe_unused]] auto& mv2_features_7_conv_3_running_mean = constants_->at(198); + [[maybe_unused]] auto& mv2_features_7_conv_3_running_var = constants_->at(199); + [[maybe_unused]] auto& mv2_features_8_conv_0_1_running_mean = constants_->at(200); + [[maybe_unused]] auto& mv2_features_8_conv_0_1_running_var = constants_->at(201); + [[maybe_unused]] auto& mv2_features_8_conv_1_1_running_mean = constants_->at(202); + [[maybe_unused]] auto& mv2_features_8_conv_1_1_running_var = constants_->at(203); + [[maybe_unused]] auto& mv2_features_8_conv_3_running_mean = constants_->at(204); + [[maybe_unused]] auto& mv2_features_8_conv_3_running_var = constants_->at(205); + [[maybe_unused]] auto& 
mv2_features_9_conv_0_1_running_mean = constants_->at(206); + [[maybe_unused]] auto& mv2_features_9_conv_0_1_running_var = constants_->at(207); + [[maybe_unused]] auto& mv2_features_9_conv_1_1_running_mean = constants_->at(208); + [[maybe_unused]] auto& mv2_features_9_conv_1_1_running_var = constants_->at(209); + [[maybe_unused]] auto& mv2_features_9_conv_3_running_mean = constants_->at(210); + [[maybe_unused]] auto& mv2_features_9_conv_3_running_var = constants_->at(211); + [[maybe_unused]] auto& mv2_features_10_conv_0_1_running_mean = constants_->at(212); + [[maybe_unused]] auto& mv2_features_10_conv_0_1_running_var = constants_->at(213); + [[maybe_unused]] auto& mv2_features_10_conv_1_1_running_mean = constants_->at(214); + [[maybe_unused]] auto& mv2_features_10_conv_1_1_running_var = constants_->at(215); + [[maybe_unused]] auto& mv2_features_10_conv_3_running_mean = constants_->at(216); + [[maybe_unused]] auto& mv2_features_10_conv_3_running_var = constants_->at(217); + [[maybe_unused]] auto& mv2_features_11_conv_0_1_running_mean = constants_->at(218); + [[maybe_unused]] auto& mv2_features_11_conv_0_1_running_var = constants_->at(219); + [[maybe_unused]] auto& mv2_features_11_conv_1_1_running_mean = constants_->at(220); + [[maybe_unused]] auto& mv2_features_11_conv_1_1_running_var = constants_->at(221); + [[maybe_unused]] auto& mv2_features_11_conv_3_running_mean = constants_->at(222); + [[maybe_unused]] auto& mv2_features_11_conv_3_running_var = constants_->at(223); + [[maybe_unused]] auto& mv2_features_12_conv_0_1_running_mean = constants_->at(224); + [[maybe_unused]] auto& mv2_features_12_conv_0_1_running_var = constants_->at(225); + [[maybe_unused]] auto& mv2_features_12_conv_1_1_running_mean = constants_->at(226); + [[maybe_unused]] auto& mv2_features_12_conv_1_1_running_var = constants_->at(227); + [[maybe_unused]] auto& mv2_features_12_conv_3_running_mean = constants_->at(228); + [[maybe_unused]] auto& mv2_features_12_conv_3_running_var = constants_->at(229); + [[maybe_unused]] auto& mv2_features_13_conv_0_1_running_mean = constants_->at(230); + [[maybe_unused]] auto& mv2_features_13_conv_0_1_running_var = constants_->at(231); + [[maybe_unused]] auto& mv2_features_13_conv_1_1_running_mean = constants_->at(232); + [[maybe_unused]] auto& mv2_features_13_conv_1_1_running_var = constants_->at(233); + [[maybe_unused]] auto& mv2_features_13_conv_3_running_mean = constants_->at(234); + [[maybe_unused]] auto& mv2_features_13_conv_3_running_var = constants_->at(235); + [[maybe_unused]] auto& mv2_features_14_conv_0_1_running_mean = constants_->at(236); + [[maybe_unused]] auto& mv2_features_14_conv_0_1_running_var = constants_->at(237); + [[maybe_unused]] auto& mv2_features_14_conv_1_1_running_mean = constants_->at(238); + [[maybe_unused]] auto& mv2_features_14_conv_1_1_running_var = constants_->at(239); + [[maybe_unused]] auto& mv2_features_14_conv_3_running_mean = constants_->at(240); + [[maybe_unused]] auto& mv2_features_14_conv_3_running_var = constants_->at(241); + [[maybe_unused]] auto& mv2_features_15_conv_0_1_running_mean = constants_->at(242); + [[maybe_unused]] auto& mv2_features_15_conv_0_1_running_var = constants_->at(243); + [[maybe_unused]] auto& mv2_features_15_conv_1_1_running_mean = constants_->at(244); + [[maybe_unused]] auto& mv2_features_15_conv_1_1_running_var = constants_->at(245); + [[maybe_unused]] auto& mv2_features_15_conv_3_running_mean = constants_->at(246); + [[maybe_unused]] auto& mv2_features_15_conv_3_running_var = constants_->at(247); + [[maybe_unused]] 
auto& mv2_features_16_conv_0_1_running_mean = constants_->at(248);
+ [[maybe_unused]] auto& mv2_features_16_conv_0_1_running_var = constants_->at(249);
+ [[maybe_unused]] auto& mv2_features_16_conv_1_1_running_mean = constants_->at(250);
+ [[maybe_unused]] auto& mv2_features_16_conv_1_1_running_var = constants_->at(251);
+ [[maybe_unused]] auto& mv2_features_16_conv_3_running_mean = constants_->at(252);
+ [[maybe_unused]] auto& mv2_features_16_conv_3_running_var = constants_->at(253);
+ [[maybe_unused]] auto& mv2_features_17_conv_0_1_running_mean = constants_->at(254);
+ [[maybe_unused]] auto& mv2_features_17_conv_0_1_running_var = constants_->at(255);
+ [[maybe_unused]] auto& mv2_features_17_conv_1_1_running_mean = constants_->at(256);
+ [[maybe_unused]] auto& mv2_features_17_conv_1_1_running_var = constants_->at(257);
+ [[maybe_unused]] auto& mv2_features_17_conv_3_running_mean = constants_->at(258);
+ [[maybe_unused]] auto& mv2_features_17_conv_3_running_var = constants_->at(259);
+ [[maybe_unused]] auto& mv2_features_18_1_running_mean = constants_->at(260);
+ [[maybe_unused]] auto& mv2_features_18_1_running_var = constants_->at(261);
+
+ if ((long(arg262_1.data_ptr()) & (16 - 1)) != 0) {
+ AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit.");
+ AtenTensorHandle arg262_1_aligned;
+ aoti_torch_clone_preserve_strides(arg262_1, &arg262_1_aligned);
+ arg262_1 = std::move(RAIIAtenTensorHandle(arg262_1_aligned));
+ }
+ inputs.clear();
+ [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
+
+ AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
+ static constexpr int64_t int_array_0[] = {1L, 3L, 224L, 224L};
+ static constexpr int64_t int_array_1[] = {150528L, 1L, 672L, 3L};
+ AtenTensorHandle buf0_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
+ RAIIAtenTensorHandle buf0(buf0_handle);
+ // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+ call_triton_poi_fused_convolution_0(arg262_1, buf0, 3L, 50176L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ arg262_1.reset();
+ static constexpr int64_t int_array_2[] = {32L, 3L, 3L, 3L};
+ static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L};
+ AtenTensorHandle buf1_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle));
+ RAIIAtenTensorHandle buf1(buf1_handle);
+ // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+ call_triton_poi_fused_convolution_1(mv2_features_0_0_weight, buf1, 96L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+ AtenTensorHandle buf2_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array<int64_t, 2>{2L, 2L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf2_handle));
+ RAIIAtenTensorHandle buf2(buf2_handle);
+ buf0.reset();
+ buf1.reset();
+ auto buf3 = std::move(buf2); // reuse
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default, aten_hardtanh_default], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
+ call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(buf3, mv2_features_0_1_running_mean, mv2_features_0_1_running_var, mv2_features_0_1_weight, mv2_features_0_1_bias, 401408L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default, aten_hardtanh_default, aten_convolution_default_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
+ AtenTensorHandle buf4_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf3, mv2_features_1_conv_0_0_weight, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 32L, &buf4_handle));
+ RAIIAtenTensorHandle buf4(buf4_handle);
+ buf3.reset();
+ auto buf5 = std::move(buf4); // reuse
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_1, aten_hardtanh_default_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
+ call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(buf5, mv2_features_1_conv_0_1_running_mean, mv2_features_1_conv_0_1_running_var, mv2_features_1_conv_0_1_weight, mv2_features_1_conv_0_1_bias, 401408L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_1, aten_hardtanh_default_1, aten_convolution_default_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
+ AtenTensorHandle buf6_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf5, mv2_features_1_conv_1_weight, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf6_handle));
+ RAIIAtenTensorHandle buf6(buf6_handle);
+ buf5.reset();
+ auto buf7 = std::move(buf6); // reuse
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_2], Original ATen: [aten._native_batch_norm_legit_no_training]
+ call_triton_poi_fused__native_batch_norm_legit_no_training_3(buf7, mv2_features_1_conv_2_running_mean, mv2_features_1_conv_2_running_var, mv2_features_1_conv_2_weight, mv2_features_1_conv_2_bias, 200704L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_2, aten_convolution_default_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.convolution]
+ AtenTensorHandle buf8_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf7, mv2_features_2_conv_0_0_weight, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf8_handle));
+ RAIIAtenTensorHandle buf8(buf8_handle);
+ buf7.reset();
+ auto buf9 = std::move(buf8); // reuse
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
+ call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(buf9, mv2_features_2_conv_0_1_running_mean, mv2_features_2_conv_0_1_running_var, mv2_features_2_conv_0_1_weight, mv2_features_2_conv_0_1_bias, 1204224L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2, aten_convolution_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2, aten_convolution_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf10_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf9, mv2_features_2_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 96L, &buf10_handle)); + RAIIAtenTensorHandle buf10(buf10_handle); + buf9.reset(); + auto buf11 = std::move(buf10); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_4, aten_hardtanh_default_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(buf11, mv2_features_2_conv_1_1_running_mean, mv2_features_2_conv_1_1_running_var, mv2_features_2_conv_1_1_weight, mv2_features_2_conv_1_1_bias, 301056L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_4, aten_hardtanh_default_3, aten_convolution_default_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf12_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf11, mv2_features_2_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf12_handle)); + RAIIAtenTensorHandle buf12(buf12_handle); + buf11.reset(); + auto buf13 = std::move(buf12); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_5], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_6(buf13, mv2_features_2_conv_3_running_mean, mv2_features_2_conv_3_running_var, mv2_features_2_conv_3_weight, mv2_features_2_conv_3_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_6], Original ATen: [aten.convolution] + AtenTensorHandle buf14_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf13, mv2_features_3_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf14_handle)); + RAIIAtenTensorHandle buf14(buf14_handle); + auto buf15 = std::move(buf14); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_6, aten_hardtanh_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf15, mv2_features_3_conv_0_1_running_mean, mv2_features_3_conv_0_1_running_var, mv2_features_3_conv_0_1_weight, mv2_features_3_conv_0_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_6, aten_hardtanh_default_4, aten_convolution_default_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf16_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf15, mv2_features_3_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, 
std::array{0L, 0L}.cbegin(), 2, 144L, &buf16_handle)); + RAIIAtenTensorHandle buf16(buf16_handle); + buf15.reset(); + auto buf17 = std::move(buf16); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_7, aten_hardtanh_default_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf17, mv2_features_3_conv_1_1_running_mean, mv2_features_3_conv_1_1_running_var, mv2_features_3_conv_1_1_weight, mv2_features_3_conv_1_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_7, aten_hardtanh_default_5, aten_convolution_default_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf18_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf17, mv2_features_3_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf18_handle)); + RAIIAtenTensorHandle buf18(buf18_handle); + buf17.reset(); + auto buf19 = std::move(buf13); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_8, aten_add_tensor], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_8(buf19, buf18, mv2_features_3_conv_3_running_mean, mv2_features_3_conv_3_running_var, mv2_features_3_conv_3_weight, mv2_features_3_conv_3_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf18.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_8, aten_add_tensor, aten_convolution_default_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf20_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf19, mv2_features_4_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf20_handle)); + RAIIAtenTensorHandle buf20(buf20_handle); + buf19.reset(); + auto buf21 = std::move(buf20); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_9, aten_hardtanh_default_6], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf21, mv2_features_4_conv_0_1_running_mean, mv2_features_4_conv_0_1_running_var, mv2_features_4_conv_0_1_weight, mv2_features_4_conv_0_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_9, aten_hardtanh_default_6, aten_convolution_default_10], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf22_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf21, mv2_features_4_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 144L, &buf22_handle)); + RAIIAtenTensorHandle buf22(buf22_handle); + buf21.reset(); + auto buf23 = std::move(buf22); // reuse + // Topologically Sorted Source Nodes: 
[aten__native_batch_norm_legit_no_training_default_10, aten_hardtanh_default_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(buf23, mv2_features_4_conv_1_1_running_mean, mv2_features_4_conv_1_1_running_var, mv2_features_4_conv_1_1_weight, mv2_features_4_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_10, aten_hardtanh_default_7, aten_convolution_default_11], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf24_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf23, mv2_features_4_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf24_handle)); + RAIIAtenTensorHandle buf24(buf24_handle); + buf23.reset(); + auto buf25 = std::move(buf24); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_11], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_10(buf25, mv2_features_4_conv_3_running_mean, mv2_features_4_conv_3_running_var, mv2_features_4_conv_3_weight, mv2_features_4_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_12], Original ATen: [aten.convolution] + AtenTensorHandle buf26_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf25, mv2_features_5_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf26_handle)); + RAIIAtenTensorHandle buf26(buf26_handle); + auto buf27 = std::move(buf26); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_12, aten_hardtanh_default_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf27, mv2_features_5_conv_0_1_running_mean, mv2_features_5_conv_0_1_running_var, mv2_features_5_conv_0_1_weight, mv2_features_5_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_12, aten_hardtanh_default_8, aten_convolution_default_13], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf28_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf27, mv2_features_5_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf28_handle)); + RAIIAtenTensorHandle buf28(buf28_handle); + buf27.reset(); + auto buf29 = std::move(buf28); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_13, aten_hardtanh_default_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf29, mv2_features_5_conv_1_1_running_mean, mv2_features_5_conv_1_1_running_var, mv2_features_5_conv_1_1_weight, mv2_features_5_conv_1_1_bias, 150528L, this->device_idx_, stream, kernels, 
this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_13, aten_hardtanh_default_9, aten_convolution_default_14], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf30_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf29, mv2_features_5_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf30_handle)); + RAIIAtenTensorHandle buf30(buf30_handle); + buf29.reset(); + auto buf31 = std::move(buf25); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_14, aten_add_tensor_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(buf31, buf30, mv2_features_5_conv_3_running_mean, mv2_features_5_conv_3_running_var, mv2_features_5_conv_3_weight, mv2_features_5_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf30.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_15], Original ATen: [aten.convolution] + AtenTensorHandle buf32_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf31, mv2_features_6_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf32_handle)); + RAIIAtenTensorHandle buf32(buf32_handle); + auto buf33 = std::move(buf32); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_15, aten_hardtanh_default_10], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf33, mv2_features_6_conv_0_1_running_mean, mv2_features_6_conv_0_1_running_var, mv2_features_6_conv_0_1_weight, mv2_features_6_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_15, aten_hardtanh_default_10, aten_convolution_default_16], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf34_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf33, mv2_features_6_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf34_handle)); + RAIIAtenTensorHandle buf34(buf34_handle); + buf33.reset(); + auto buf35 = std::move(buf34); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_16, aten_hardtanh_default_11], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf35, mv2_features_6_conv_1_1_running_mean, mv2_features_6_conv_1_1_running_var, mv2_features_6_conv_1_1_weight, mv2_features_6_conv_1_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_16, aten_hardtanh_default_11, aten_convolution_default_17], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf36_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf35, 
mv2_features_6_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf36_handle)); + RAIIAtenTensorHandle buf36(buf36_handle); + buf35.reset(); + auto buf37 = std::move(buf31); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_17, aten_add_tensor_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(buf37, buf36, mv2_features_6_conv_3_running_mean, mv2_features_6_conv_3_running_var, mv2_features_6_conv_3_weight, mv2_features_6_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf36.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_17, aten_add_tensor_2, aten_convolution_default_18], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf38_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf37, mv2_features_7_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf38_handle)); + RAIIAtenTensorHandle buf38(buf38_handle); + buf37.reset(); + auto buf39 = std::move(buf38); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_18, aten_hardtanh_default_12], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf39, mv2_features_7_conv_0_1_running_mean, mv2_features_7_conv_0_1_running_var, mv2_features_7_conv_0_1_weight, mv2_features_7_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_18, aten_hardtanh_default_12, aten_convolution_default_19], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf40_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf39, mv2_features_7_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf40_handle)); + RAIIAtenTensorHandle buf40(buf40_handle); + buf39.reset(); + auto buf41 = std::move(buf40); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_19, aten_hardtanh_default_13], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(buf41, mv2_features_7_conv_1_1_running_mean, mv2_features_7_conv_1_1_running_var, mv2_features_7_conv_1_1_weight, mv2_features_7_conv_1_1_bias, 37632L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_19, aten_hardtanh_default_13, aten_convolution_default_20], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf42_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf41, mv2_features_7_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf42_handle)); + RAIIAtenTensorHandle 
buf42(buf42_handle); + buf41.reset(); + auto buf43 = std::move(buf42); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_20], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_14(buf43, mv2_features_7_conv_3_running_mean, mv2_features_7_conv_3_running_var, mv2_features_7_conv_3_weight, mv2_features_7_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_21], Original ATen: [aten.convolution] + AtenTensorHandle buf44_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf43, mv2_features_8_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf44_handle)); + RAIIAtenTensorHandle buf44(buf44_handle); + auto buf45 = std::move(buf44); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_21, aten_hardtanh_default_14], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf45, mv2_features_8_conv_0_1_running_mean, mv2_features_8_conv_0_1_running_var, mv2_features_8_conv_0_1_weight, mv2_features_8_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_21, aten_hardtanh_default_14, aten_convolution_default_22], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf46_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf45, mv2_features_8_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf46_handle)); + RAIIAtenTensorHandle buf46(buf46_handle); + buf45.reset(); + auto buf47 = std::move(buf46); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_22, aten_hardtanh_default_15], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf47, mv2_features_8_conv_1_1_running_mean, mv2_features_8_conv_1_1_running_var, mv2_features_8_conv_1_1_weight, mv2_features_8_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_22, aten_hardtanh_default_15, aten_convolution_default_23], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf48_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf47, mv2_features_8_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf48_handle)); + RAIIAtenTensorHandle buf48(buf48_handle); + buf47.reset(); + auto buf49 = std::move(buf43); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_23, aten_add_tensor_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf49, buf48, mv2_features_8_conv_3_running_mean, mv2_features_8_conv_3_running_var, 
mv2_features_8_conv_3_weight, mv2_features_8_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf48.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_24], Original ATen: [aten.convolution] + AtenTensorHandle buf50_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf49, mv2_features_9_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf50_handle)); + RAIIAtenTensorHandle buf50(buf50_handle); + auto buf51 = std::move(buf50); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_24, aten_hardtanh_default_16], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf51, mv2_features_9_conv_0_1_running_mean, mv2_features_9_conv_0_1_running_var, mv2_features_9_conv_0_1_weight, mv2_features_9_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_24, aten_hardtanh_default_16, aten_convolution_default_25], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf52_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf51, mv2_features_9_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf52_handle)); + RAIIAtenTensorHandle buf52(buf52_handle); + buf51.reset(); + auto buf53 = std::move(buf52); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_25, aten_hardtanh_default_17], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf53, mv2_features_9_conv_1_1_running_mean, mv2_features_9_conv_1_1_running_var, mv2_features_9_conv_1_1_weight, mv2_features_9_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_25, aten_hardtanh_default_17, aten_convolution_default_26], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf54_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf53, mv2_features_9_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf54_handle)); + RAIIAtenTensorHandle buf54(buf54_handle); + buf53.reset(); + auto buf55 = std::move(buf49); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_26, aten_add_tensor_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf55, buf54, mv2_features_9_conv_3_running_mean, mv2_features_9_conv_3_running_var, mv2_features_9_conv_3_weight, mv2_features_9_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf54.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_27], Original ATen: [aten.convolution] + AtenTensorHandle buf56_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf55, 
mv2_features_10_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf56_handle)); + RAIIAtenTensorHandle buf56(buf56_handle); + auto buf57 = std::move(buf56); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_27, aten_hardtanh_default_18], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf57, mv2_features_10_conv_0_1_running_mean, mv2_features_10_conv_0_1_running_var, mv2_features_10_conv_0_1_weight, mv2_features_10_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_27, aten_hardtanh_default_18, aten_convolution_default_28], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf58_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf57, mv2_features_10_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf58_handle)); + RAIIAtenTensorHandle buf58(buf58_handle); + buf57.reset(); + auto buf59 = std::move(buf58); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_28, aten_hardtanh_default_19], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf59, mv2_features_10_conv_1_1_running_mean, mv2_features_10_conv_1_1_running_var, mv2_features_10_conv_1_1_weight, mv2_features_10_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_28, aten_hardtanh_default_19, aten_convolution_default_29], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf60_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf59, mv2_features_10_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf60_handle)); + RAIIAtenTensorHandle buf60(buf60_handle); + buf59.reset(); + auto buf61 = std::move(buf55); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_29, aten_add_tensor_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf61, buf60, mv2_features_10_conv_3_running_mean, mv2_features_10_conv_3_running_var, mv2_features_10_conv_3_weight, mv2_features_10_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf60.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_29, aten_add_tensor_5, aten_convolution_default_30], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf62_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf61, mv2_features_11_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf62_handle)); + RAIIAtenTensorHandle 
buf62(buf62_handle); + buf61.reset(); + auto buf63 = std::move(buf62); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_30, aten_hardtanh_default_20], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf63, mv2_features_11_conv_0_1_running_mean, mv2_features_11_conv_0_1_running_var, mv2_features_11_conv_0_1_weight, mv2_features_11_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_30, aten_hardtanh_default_20, aten_convolution_default_31], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf64_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf63, mv2_features_11_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf64_handle)); + RAIIAtenTensorHandle buf64(buf64_handle); + buf63.reset(); + auto buf65 = std::move(buf64); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_31, aten_hardtanh_default_21], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf65, mv2_features_11_conv_1_1_running_mean, mv2_features_11_conv_1_1_running_var, mv2_features_11_conv_1_1_weight, mv2_features_11_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_31, aten_hardtanh_default_21, aten_convolution_default_32], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf66_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf65, mv2_features_11_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf66_handle)); + RAIIAtenTensorHandle buf66(buf66_handle); + buf65.reset(); + auto buf67 = std::move(buf66); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_32], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_17(buf67, mv2_features_11_conv_3_running_mean, mv2_features_11_conv_3_running_var, mv2_features_11_conv_3_weight, mv2_features_11_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_33], Original ATen: [aten.convolution] + AtenTensorHandle buf68_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf67, mv2_features_12_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf68_handle)); + RAIIAtenTensorHandle buf68(buf68_handle); + auto buf69 = std::move(buf68); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_33, aten_hardtanh_default_22], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf69, mv2_features_12_conv_0_1_running_mean, 
mv2_features_12_conv_0_1_running_var, mv2_features_12_conv_0_1_weight, mv2_features_12_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_33, aten_hardtanh_default_22, aten_convolution_default_34], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf70_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf69, mv2_features_12_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf70_handle)); + RAIIAtenTensorHandle buf70(buf70_handle); + buf69.reset(); + auto buf71 = std::move(buf70); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_34, aten_hardtanh_default_23], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf71, mv2_features_12_conv_1_1_running_mean, mv2_features_12_conv_1_1_running_var, mv2_features_12_conv_1_1_weight, mv2_features_12_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_34, aten_hardtanh_default_23, aten_convolution_default_35], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf72_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf71, mv2_features_12_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf72_handle)); + RAIIAtenTensorHandle buf72(buf72_handle); + buf71.reset(); + auto buf73 = std::move(buf67); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_35, aten_add_tensor_6], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_19(buf73, buf72, mv2_features_12_conv_3_running_mean, mv2_features_12_conv_3_running_var, mv2_features_12_conv_3_weight, mv2_features_12_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf72.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_36], Original ATen: [aten.convolution] + AtenTensorHandle buf74_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf73, mv2_features_13_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf74_handle)); + RAIIAtenTensorHandle buf74(buf74_handle); + auto buf75 = std::move(buf74); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_36, aten_hardtanh_default_24], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf75, mv2_features_13_conv_0_1_running_mean, mv2_features_13_conv_0_1_running_var, mv2_features_13_conv_0_1_weight, mv2_features_13_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_36, aten_hardtanh_default_24, aten_convolution_default_37], Original ATen: 
[aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf76_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf75, mv2_features_13_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf76_handle)); + RAIIAtenTensorHandle buf76(buf76_handle); + buf75.reset(); + auto buf77 = std::move(buf76); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_37, aten_hardtanh_default_25], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf77, mv2_features_13_conv_1_1_running_mean, mv2_features_13_conv_1_1_running_var, mv2_features_13_conv_1_1_weight, mv2_features_13_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_37, aten_hardtanh_default_25, aten_convolution_default_38], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf78_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf77, mv2_features_13_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf78_handle)); + RAIIAtenTensorHandle buf78(buf78_handle); + buf77.reset(); + auto buf79 = std::move(buf73); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_38, aten_add_tensor_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_19(buf79, buf78, mv2_features_13_conv_3_running_mean, mv2_features_13_conv_3_running_var, mv2_features_13_conv_3_weight, mv2_features_13_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf78.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_38, aten_add_tensor_7, aten_convolution_default_39], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf80_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf79, mv2_features_14_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf80_handle)); + RAIIAtenTensorHandle buf80(buf80_handle); + buf79.reset(); + auto buf81 = std::move(buf80); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_39, aten_hardtanh_default_26], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf81, mv2_features_14_conv_0_1_running_mean, mv2_features_14_conv_0_1_running_var, mv2_features_14_conv_0_1_weight, mv2_features_14_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_39, aten_hardtanh_default_26, aten_convolution_default_40], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf82_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf81, mv2_features_14_conv_1_0_weight, 
nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf82_handle)); + RAIIAtenTensorHandle buf82(buf82_handle); + buf81.reset(); + auto buf83 = std::move(buf82); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_40, aten_hardtanh_default_27], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(buf83, mv2_features_14_conv_1_1_running_mean, mv2_features_14_conv_1_1_running_var, mv2_features_14_conv_1_1_weight, mv2_features_14_conv_1_1_bias, 28224L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_40, aten_hardtanh_default_27, aten_convolution_default_41], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf84_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf83, mv2_features_14_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf84_handle)); + RAIIAtenTensorHandle buf84(buf84_handle); + buf83.reset(); + auto buf85 = std::move(buf84); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_41], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_21(buf85, mv2_features_14_conv_3_running_mean, mv2_features_14_conv_3_running_var, mv2_features_14_conv_3_weight, mv2_features_14_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_42], Original ATen: [aten.convolution] + AtenTensorHandle buf86_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf85, mv2_features_15_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf86_handle)); + RAIIAtenTensorHandle buf86(buf86_handle); + auto buf87 = std::move(buf86); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_42, aten_hardtanh_default_28], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf87, mv2_features_15_conv_0_1_running_mean, mv2_features_15_conv_0_1_running_var, mv2_features_15_conv_0_1_weight, mv2_features_15_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_42, aten_hardtanh_default_28, aten_convolution_default_43], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf88_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf87, mv2_features_15_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf88_handle)); + RAIIAtenTensorHandle buf88(buf88_handle); + buf87.reset(); + auto buf89 = std::move(buf88); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_43, aten_hardtanh_default_29], Original ATen: 
[aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf89, mv2_features_15_conv_1_1_running_mean, mv2_features_15_conv_1_1_running_var, mv2_features_15_conv_1_1_weight, mv2_features_15_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_43, aten_hardtanh_default_29, aten_convolution_default_44], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf90_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf89, mv2_features_15_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf90_handle)); + RAIIAtenTensorHandle buf90(buf90_handle); + buf89.reset(); + auto buf91 = std::move(buf85); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_44, aten_add_tensor_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(buf91, buf90, mv2_features_15_conv_3_running_mean, mv2_features_15_conv_3_running_var, mv2_features_15_conv_3_weight, mv2_features_15_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf90.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_45], Original ATen: [aten.convolution] + AtenTensorHandle buf92_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf91, mv2_features_16_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf92_handle)); + RAIIAtenTensorHandle buf92(buf92_handle); + auto buf93 = std::move(buf92); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_45, aten_hardtanh_default_30], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf93, mv2_features_16_conv_0_1_running_mean, mv2_features_16_conv_0_1_running_var, mv2_features_16_conv_0_1_weight, mv2_features_16_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_45, aten_hardtanh_default_30, aten_convolution_default_46], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf94_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf93, mv2_features_16_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf94_handle)); + RAIIAtenTensorHandle buf94(buf94_handle); + buf93.reset(); + auto buf95 = std::move(buf94); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_46, aten_hardtanh_default_31], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf95, mv2_features_16_conv_1_1_running_mean, mv2_features_16_conv_1_1_running_var, mv2_features_16_conv_1_1_weight, mv2_features_16_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // 
Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_46, aten_hardtanh_default_31, aten_convolution_default_47], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf96_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf95, mv2_features_16_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf96_handle)); + RAIIAtenTensorHandle buf96(buf96_handle); + buf95.reset(); + auto buf97 = std::move(buf91); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_47, aten_add_tensor_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(buf97, buf96, mv2_features_16_conv_3_running_mean, mv2_features_16_conv_3_running_var, mv2_features_16_conv_3_weight, mv2_features_16_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf96.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_47, aten_add_tensor_9, aten_convolution_default_48], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf98_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf97, mv2_features_17_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf98_handle)); + RAIIAtenTensorHandle buf98(buf98_handle); + buf97.reset(); + auto buf99 = std::move(buf98); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_48, aten_hardtanh_default_32], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf99, mv2_features_17_conv_0_1_running_mean, mv2_features_17_conv_0_1_running_var, mv2_features_17_conv_0_1_weight, mv2_features_17_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_48, aten_hardtanh_default_32, aten_convolution_default_49], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf100_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf99, mv2_features_17_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf100_handle)); + RAIIAtenTensorHandle buf100(buf100_handle); + buf99.reset(); + auto buf101 = std::move(buf100); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf101, mv2_features_17_conv_1_1_running_mean, mv2_features_17_conv_1_1_running_var, mv2_features_17_conv_1_1_weight, mv2_features_17_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33, aten_convolution_default_50], Original ATen: [aten._native_batch_norm_legit_no_training, 
aten.hardtanh, aten.convolution] + AtenTensorHandle buf102_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf101, mv2_features_17_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf102_handle)); + RAIIAtenTensorHandle buf102(buf102_handle); + buf101.reset(); + auto buf103 = std::move(buf102); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_50], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_24(buf103, mv2_features_17_conv_3_running_mean, mv2_features_17_conv_3_running_var, mv2_features_17_conv_3_weight, mv2_features_17_conv_3_bias, 15680L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_50, aten_convolution_default_51], Original ATen: [aten._native_batch_norm_legit_no_training, aten.convolution] + AtenTensorHandle buf104_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf103, mv2_features_18_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf104_handle)); + RAIIAtenTensorHandle buf104(buf104_handle); + buf103.reset(); + static constexpr int64_t int_array_4[] = {1L, 1280L, 1L, 1L}; + static constexpr int64_t int_array_5[] = {1280L, 1L, 1280L, 1280L}; + AtenTensorHandle buf105_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf105_handle)); + RAIIAtenTensorHandle buf105(buf105_handle); + static constexpr int64_t int_array_6[] = {1280L, 1L, 1L, 1L}; + auto buf106 = wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(buf105, 4, int_array_4, int_array_6, 0L)); buf105.reset(); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_51, aten_hardtanh_default_34, aten_mean_dim], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.mean] + call_triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(buf106, buf104, mv2_features_18_1_running_mean, mv2_features_18_1_running_var, mv2_features_18_1_weight, mv2_features_18_1_bias, 1280L, 49L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf104.reset(); + static constexpr int64_t int_array_7[] = {1280L, 1000L}; + static constexpr int64_t int_array_8[] = {1L, 1280L}; + AtenTensorHandle buf107_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_7, int_array_8, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf107_handle)); + RAIIAtenTensorHandle buf107(buf107_handle); + // Topologically Sorted Source Nodes: [aten_permute_copy_default], Original ATen: [aten.permute_copy] + call_triton_poi_fused_permute_copy_26(mv2_classifier_1_weight, buf107, 1280000L, this->device_idx_, stream, kernels, this->cubin_dir_); + static constexpr int64_t int_array_9[] = {1L, 1000L}; + static constexpr int64_t int_array_10[] = {1000L, 1L}; + AtenTensorHandle buf108_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_9, int_array_10, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf108_handle)); + RAIIAtenTensorHandle buf108(buf108_handle); + // Topologically Sorted Source 
Nodes: [aten__native_batch_norm_legit_no_training_default_51, aten_hardtanh_default_34, aten_mean_dim, aten_view_copy_default, aten_permute_copy_default, aten_addmm_default], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.mean, aten.view_copy, aten.permute_copy, aten.addmm] + static constexpr int64_t int_array_11[] = {0L, 1L}; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_addmm_out(buf108, mv2_classifier_1_bias, wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(buf106, 2, int_array_8, int_array_11, 0L)), buf107, 1L, 1L)); + buf106.reset(); + buf107.reset(); + output_handles[0] = buf108.release(); +} // AOTInductorModel::run_impl +} // namespace torch::aot_inductor + + + + +// Compile cmd +// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o +// Link cmd +// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7/clbguuj2vb7nlf7qm72hrkynyiorwc3udkaj656f3v5xcdaoib67.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json new file mode 100644 index 00000000000..bd5d2c60334 --- /dev/null +++ 
b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin b/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..5098c505ebb138fa361a0971a8c7d89086e12f3b
GIT binary patch
literal 11320
[11320-byte base85-encoded binary payload omitted -- compiled Triton kernel cubin]
diff --git a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
new file mode 100644
index 00000000000..90c865f5f5e
--- /dev/null
+++ b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
@@ -0,0 +1,965 @@
+
+#include <torch/csrc/inductor/aoti_runtime/interface.h>
+// Definition of AOTI runtime interface functions
+
+#include <torch/csrc/inductor/aoti_runtime/model_container.h>
+#include <torch/csrc/inductor/aoti_runtime/scalar_to_tensor.h>
+
+#include <iostream>
+#include <vector>
+
+#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)      \
+  try {                                           \
+    __VA_ARGS__                                   \
+  } catch (const std::exception& e) {             \
+    std::cerr << "Error: " << e.what() << '\n';   \
+    return AOTI_RUNTIME_FAILURE;                  \
+  } catch (...) {                                 \
+    std::cerr << "Unknown exception occurred.\n"; \
+    return AOTI_RUNTIME_FAILURE;                  \
+  }                                               \
+  return AOTI_RUNTIME_SUCCESS;
+
+#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name)  \
+  do {                                                            \
+    AOTI_RUNTIME_CHECK(                                           \
+        actual_size == expected_size,                             \
+        "expected " + std::string(name) + " vector size to be " + \
+            std::to_string(expected_size) + ", but got " +        \
+            std::to_string(actual_size));                         \
+  } while (0)
+
+// AOTInductor uses at::addmm_out, which doesn't support
+// arguments that require gradient. For this reason, we
+// enforce no_grad context for run APIs.
+//
+// A RAII, thread local (!) guard that enables or disables grad mode upon
+// construction, and sets it back to the original value upon destruction.
+struct AOTINoGradGuard {
+  AOTINoGradGuard() {
+    aoti_torch_grad_mode_set_enabled(false);
+  }
+  AOTINoGradGuard(const AOTINoGradGuard&) = delete;
+  AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete;
+  ~AOTINoGradGuard() {
+    aoti_torch_grad_mode_set_enabled(prev_mode);
+  }
+  AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete;
+  AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete;
+  bool prev_mode{aoti_torch_grad_mode_is_enabled()};
+};
+
+extern "C" {
+
+AOTIRuntimeError AOTInductorModelContainerCreate(
+    AOTInductorModelContainerHandle* container_handle,
+    size_t num_models,
+    bool is_cpu,
+    const char* cubin_dir) {
+  return AOTInductorModelContainerCreateWithDevice(
+      container_handle,
+      num_models,
+      is_cpu ? "cpu" : "cuda",
"cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0\n"; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_single_threaded( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** 
original_fqn) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *original_fqn = container->constant_original_fqn(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( + AOTInductorModelContainerHandle container_handle, + size_t idx, + bool* from_folded) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantType( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* type) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* dtype) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *dtype = container->constant_dtype(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( + AOTInductorModelContainerHandle container_handle, + size_t idx, + size_t* data_size) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *data_size = container->constant_data_size(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive) { + auto* container = + reinterpret_cast( + container_handle); + auto constants_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { const auto ret = container->extract_constants_map(use_inactive); + for (const auto& pair: ret) { + constants_map->emplace(pair.first, pair.second); + } + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update, /* user_managed = */ true); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle) { + return AOTInductorModelContainerUpdateConstantBuffer(container_handle, + constant_map_handle, + /*use_inactive*/ true, + /*validate_full_update*/ true); +} + +AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->free_inactive_constant_buffer(); + }) +} + +AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( + AOTInductorModelContainerHandle container_handle, + bool use_inactive, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_const_fold(use_inactive, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->swap_constant_buffer(); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumInputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_inputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_inputs = container->num_inputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetInputName( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** ret_input_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_input_names = container->input_name(input_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_outputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_outputs = container->num_outputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetOutputName( + AOTInductorModelContainerHandle container_handle, + size_t output_idx, + const char** ret_output_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_output_names = container->output_name(output_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetCallSpec( + AOTInductorModelContainerHandle container_handle, + const char** in_spec, + const char** out_spec) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + *in_spec = container->get_in_spec(); + *out_spec = container->get_out_spec(); + }) +} + +AOTIRuntimeError AOTInductorModelCreate( + AOTInductorModelHandle* model_handle, + AOTInductorConstantMapHandle constant_map_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto constant_array = std::make_shared>(); + auto input_map = reinterpret_cast*>(constant_map_handle); + + auto model = new torch::aot_inductor::AOTInductorModel( + constant_map, + constant_array, + "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models + "" + ); + + if (input_map) { + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + } else { + model->load_constants(); + } + + *model_handle = reinterpret_cast(model); + })} + +AOTIRuntimeError AOTInductorModelRun( + AOTInductorModelHandle model_handle, + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + model->run_impl( + input_handles, + output_handles, + (torch::aot_inductor::DeviceStreamType) nullptr, + nullptr); + }) +} + +AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = 
reinterpret_cast( + model_handle); + delete model; + })} + +AOTIRuntimeError AOTInductorModelGetNumOutputs( + AOTInductorModelHandle model_handle, + size_t* ret_num_outputs) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = reinterpret_cast(model_handle); + *ret_num_outputs = model->num_outputs(); + }) +} + +AOTIRuntimeError AOTInductorModelUpdateConstantsMap( + AOTInductorModelHandle model_handle, + AOTInductorConstantMapHandle constant_map_handle) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto input_map = + reinterpret_cast*>( + constant_map_handle); + + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + model->update_constants_map(std::move(constant_map)); + }) +} + +} // extern "C" + + +#define CUDA_DRIVER_CHECK(EXPR) \ +do { \ + CUresult code = EXPR; \ + const char *msg; \ + CUresult code_get_error = cuGetErrorString(code, &msg); \ + if (code_get_error != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string("invalid error code!")); \ + } \ + if (code != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string(msg)); \ + } \ +} while (0); + +static inline CUfunction loadKernel( + std::string filePath, + const std::string &funcName, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline void launchKernel( + CUfunction func, + uint32_t gridX, + uint32_t gridY, + uint32_t gridZ, + uint32_t numWarps, + uint32_t sharedMemBytes, + void* args[], + cudaStream_t stream) { + CUDA_DRIVER_CHECK(cuLaunchKernel( + func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr + )); +} +CACHE_TORCH_DTYPE(float32); +CACHE_TORCH_DEVICE(cuda); +CACHE_TORCH_LAYOUT(strided); +namespace torch::aot_inductor { +namespace { +class AOTInductorModelKernels : public AOTInductorModelKernelsBase { + public: + CUfunction triton_poi_fused_convolution_0{nullptr}; + CUfunction triton_poi_fused_convolution_1{nullptr}; + CUfunction triton_poi_fused_convolution_2{nullptr}; +}; +} // namespace + + + +AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) + : AOTInductorModelBase(1, + 1, + 1, + device_str, + std::move(cubin_dir), + true) { + inputs_info_[0].name = "arg2_1"; + constants_info_[0].name = "conv_weight"; + constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[0].offset = 0; + 
constants_info_[0].data_size = 540; + constants_info_[0].from_folded = false; + constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[0].shape = {5, 3, 3, 3}; + constants_info_[0].stride = {27, 9, 3, 1}; + constants_info_[0].layout = static_cast(cached_torch_layout_strided); + constants_info_[0].original_fqn = "conv.weight"; + update_constants_map(std::move(constants_map)); + update_constants_array(std::move(constants_array)); + in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])"; + out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])"; + outputs_info_[0].name = "output0"; + this->kernels_ = std::make_unique(); +} + +std::unordered_map AOTInductorModel::const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization +) { + + if (!initialization) { + std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " + << "aot_inductor.use_runtime_constant_folding=False\n"; + } + return {}; +} +} // namespace torch::aot_inductor +using namespace torch::aot_inductor; + +template +static inline void call_triton_poi_fused_convolution_0( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_0', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 12 + 
xnumel = 64 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = (yindex % 3) + y1 = yindex // 3 + tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last') + tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (64 - 1)) / (64)); + uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_0 == nullptr) { + kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_); + } + CUdeviceptr var_0 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_1 = reinterpret_cast(out_ptr0.data_ptr()); + int var_2 = ynumel; + int var_3 = xnumel; + CUdeviceptr global_scratch_4 = 0; + void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4}; + launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused_convolution_1( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_1', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 15 + xnumel = 9 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + 
ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = (yindex % 3) + y1 = yindex // 3 + tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last') + tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (16 - 1)) / (16)); + uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_1 == nullptr) { + kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_); + } + CUdeviceptr var_5 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_6 = reinterpret_cast(out_ptr0.data_ptr()); + int var_7 = ynumel; + int var_8 = xnumel; + CUdeviceptr global_scratch_9 = 0; + void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9}; + launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused_convolution_2( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_2', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 20 + xnumel = 64 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = 
xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y0 = (yindex % 5) + y1 = yindex // 5 + y3 = yindex + tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last') + tmp1 = y0 + tmp2 = tl.full([1, 1], 2, tl.int64) + tmp3 = tmp1 < tmp2 + tmp4 = tl.full([1, 1], 1, tl.int64) + tmp5 = tmp1 < tmp4 + tmp6 = -0.0312186349183321 + tmp7 = -0.18273277580738068 + tmp8 = tl.where(tmp5, tmp6, tmp7) + tmp9 = tl.full([1, 1], 3, tl.int64) + tmp10 = tmp1 < tmp9 + tmp11 = tl.full([1, 1], 4, tl.int64) + tmp12 = tmp1 < tmp11 + tmp13 = -0.12337345629930496 + tmp14 = 0.12138354778289795 + tmp15 = tl.where(tmp12, tmp13, tmp14) + tmp16 = 0.05455135554075241 + tmp17 = tl.where(tmp10, tmp16, tmp15) + tmp18 = tl.where(tmp3, tmp8, tmp17) + tmp19 = tmp0 + tmp18 + tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); + uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_2 == nullptr) { + kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_); + } + CUdeviceptr var_10 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_11 = reinterpret_cast(out_ptr0.data_ptr()); + int var_12 = ynumel; + int var_13 = xnumel; + CUdeviceptr global_scratch_14 = 0; + void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14}; + launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_); +} + +namespace torch::aot_inductor { + +void AOTInductorModel::_const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) {} + +AOTI_NOINLINE static void check_input_0( + AtenTensorHandle* input_handles +) { + ConstantHandle arg2_1 = ConstantHandle(input_handles[0]); + int32_t arg2_1_dtype; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype)); + + int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32(); + if (arg2_1_expected_dtype != arg2_1_dtype) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dtype, " + << "expected: " << arg2_1_expected_dtype << "(at::kFloat), " + << "but got: " << arg2_1_dtype << "\n"; + throw std::runtime_error(ss.str()); + } + auto arg2_1_size = arg2_1.sizes(); + + if (4 != arg2_1_size[0]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 0, " + << "expected: 4, " << "but got: " << arg2_1_size[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (3 != arg2_1_size[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 1, " + << "expected: 3, " << "but got: " << arg2_1_size[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_size[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 2, " + << "expected: 8, " << "but got: " << arg2_1_size[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_size[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 3, " + << "expected: 8, " << "but got: " << arg2_1_size[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + auto arg2_1_stride = arg2_1.strides(); + + if (192 != arg2_1_stride[0]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 
0, " + << "expected: 192, " << "but got: " << arg2_1_stride[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (64 != arg2_1_stride[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 1, " + << "expected: 64, " << "but got: " << arg2_1_stride[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_stride[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 2, " + << "expected: 8, " << "but got: " << arg2_1_stride[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (1 != arg2_1_stride[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 3, " + << "expected: 1, " << "but got: " << arg2_1_stride[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + int32_t arg2_1_device_type; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); + + int32_t arg2_1_expected_device_type = 1; + if (arg2_1_expected_device_type != arg2_1_device_type) { + std::stringstream ss; + ss << "input_handles[0]: unmatched device type, " + << "expected: " << arg2_1_expected_device_type << "1(cuda), " + << "but got: " << arg2_1_device_type << "\n"; + throw std::runtime_error(ss.str()); + } +} + +static bool _check_aoti_runtime_check_inputs_env() { + const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); + const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; + return result; +} + +AOTI_NOINLINE static void __check_inputs_outputs( + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + if (!_check_aoti_runtime_check_inputs_env()){ + return; + } + check_input_0(input_handles); +} + +void AOTInductorModel::run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) { + __check_inputs_outputs(input_handles, output_handles); + + auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); + auto arg2_1 = std::move(inputs[0]); + [[maybe_unused]] auto& conv_weight = constants_->at(0); + + if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { + AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit."); + AtenTensorHandle arg2_1_aligned; + aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned); + arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned)); + } + inputs.clear(); + [[maybe_unused]] auto& kernels = static_cast(*this->kernels_.get()); + + AOTICudaStreamGuard stream_guard(stream, this->device_idx_); + static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L}; + static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L}; + AtenTensorHandle buf0_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle)); + RAIIAtenTensorHandle buf0(buf0_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); + arg2_1.reset(); + static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L}; + static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L}; + AtenTensorHandle buf1_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle)); + RAIIAtenTensorHandle buf1(buf1_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + AtenTensorHandle buf2_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle)); + RAIIAtenTensorHandle buf2(buf2_handle); + buf0.reset(); + buf1.reset(); + static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L}; + static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L}; + AtenTensorHandle buf3_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle)); + RAIIAtenTensorHandle buf3(buf3_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf2.reset(); + output_handles[0] = buf3.release(); +} // AOTInductorModel::run_impl +} // namespace torch::aot_inductor + + + + +// Compile cmd +// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf/c2axxg3k6hizo5jukgeoinhgbqdavmur6jy4bqwkwu6iqb3x3hb2.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
new file mode 100644
index 00000000000..bd5d2c60334
--- /dev/null
+++ b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin b/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..000ca4c1209b77cdaec3c8757e532677b79ccc0f
GIT binary patch
(binary data omitted: 8968-byte compiled Triton kernel image, base85 literal elided)
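Note that the wrapper above never loads a cubin twice: each Triton kernel has a CUfunction slot in the AOTInductorModelKernels struct, initialized to nullptr and filled on first call (the "if (kernels_.triton_poi_fused_convolution_0 == nullptr)" test inside each call_triton_* helper). A hedged sketch of that memoization, reusing the hypothetical load_cubin_kernel() from the earlier sketch:

#include <cuda.h>

// One slot per generated kernel, mirroring AOTInductorModelKernels.
struct KernelCache {
  CUfunction fused_convolution_0{nullptr};
};

// Load on first use, then reuse the cached handle for every later run.
CUfunction get_fused_convolution_0(KernelCache& cache) {
  if (cache.fused_convolution_0 == nullptr) {
    cache.fused_convolution_0 = load_cubin_kernel(
        "triton_poi_fused_convolution_0.cubin",   // placeholder path
        "triton_poi_fused_convolution_0");
  }
  return cache.fused_convolution_0;
}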
diff --git a/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin b/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..88b88a29bf7f3c8af0026294261be1288801c901
GIT binary patch
(binary data omitted: 11320-byte compiled Triton kernel image, base85 literal elided)
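For reference, the launch arithmetic used by every call_triton_* helper in the wrapper: the 2D grid is the ceiling division of ynumel and xnumel by the autotuned block sizes, and the thread block is 32 * numWarps threads wide. A sketch using the constants of triton_poi_fused_convolution_0 above (XBLOCK=64, YBLOCK=16, 4 warps, 4352 bytes of dynamic shared memory); error handling is elided:

#include <cuda.h>
#include <cstdint>

// Ceil-division grid size, as computed inline in the generated wrapper.
inline uint32_t ceil_div(int64_t n, int64_t block) {
  return static_cast<uint32_t>((n + block - 1) / block);
}

void launch_pointwise_2d(CUfunction func, int64_t ynumel, int64_t xnumel,
                         void* args[], CUstream stream) {
  const uint32_t grid_x = ceil_div(xnumel, 64);  // XBLOCK
  const uint32_t grid_y = ceil_div(ynumel, 16);  // YBLOCK
  if (grid_x == 0 || grid_y == 0) return;        // empty launch, nothing to do
  const uint32_t num_warps = 4;
  cuLaunchKernel(func, grid_x, grid_y, 1,
                 32 * num_warps, 1, 1,            // blockDim.x = warps * 32
                 /*sharedMemBytes=*/4352, stream, args, /*extra=*/nullptr);
}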
znTh0-iYqINcCof3!LQk?jn&3R4Q3O!TCF#V)kflzqO6PLp6~ycipi~_XUXioS3BfY;=;Bdjg#wb|&YwAV`dmTN7ZQcy%5t&ZJ6@mFrd~oHQ=50uMLwm{O(92? z8tux|*8w@~gNDrbM|j4*Hs-_lhG?6eR}nBvx!f&mRp5OsfEj&N;Qd|jcLnZt^A>t< zkn#sx05eJgJ|1oPwP{G67W(ln`X;VP1Lc?(?7qzB=BjlmbYs;nEiG5eYbA^di^cltYW0QX z;`9ROOAEE?3-$SOsrZ@Z+ET>393Tw@;}7Sfj5{z#0qOv<8FGfOR352v`FgZ3KUy0yl2m_;Bl%brFCM zZ{GNs46OcOEPzogz`4GIzMq^VBs78hh2Yov0@gvwTIFpFxA$*+aO3uE$i|#nZdwZz zb|2^dQNn}0r8or5B54lbw)O8;fVy5LG=LlD7?y(Xg7%;uf>tz_$aN6p!5>(N?vrpj zs2KhBcmv>zhq87DtX<^%meTyuz%A>|mpT5iK>qe+M#r)V+09#>H%9QLHNoNU@ixNS ze%@Z-jmE=E+#aBji@e>hVHizv^c-*E&9_C*HPI=I%k=r51bSJNKL!3RovUfGJwtg@ z9PQ!lNg?{4FqCC{7T_CO)?>sz!P|EQd5%M^?wg|ShXVc^8Rvlfxu$+phBtqoB_q+hM&>U%pt0WqHJm<4{_>K)s)lQ^xj~zF#dH6Ii zWuKT$v_Mb~Q29i97BDq=wt$0wwN#j|EG}0QkFaD`$~sT+c|)7ZJsf-X<8$j~2366` zp1)i6B-m*YHW!+5esw**rd>$FgsD=axLl5J!0h_cO58Y?g!Cz!_!%JlEVDo$N^$47 zDpt!beJUaJK&F$&)%-%mRgp@1h=*et$Xc~Mo|K?OQz~)Xk4Y0_So3X|j9`FIW@&My zQH!rMjIvr%l%*h-@;Z^CE(O+JNUagliS#?1nt?_gBe?93Q7S980M8iJmu$~xhSg3Y z7(n=Ap@T{|^uWMYyi*LTumJ9hYg*z;YzlGkg($6E)xkq3T47C#pTr~FKCpO5a9cpr z5>SGZ(Cj0p+qS^lrHj*qx2G&EJ`22Et+r6Gsdh=?avFHL*3z62myP58B-f>AYe~=I zEHt@Ec9Gg_grFBv+m1l=LTU#fycbfuNJDrpq;>?tdm#0~V03yR5k>?V{JoHpM@EB4 zb$)FnzO?L?SVlNm>G_%b>})!dm9hoR%4%H4!p>*`phOwg*XBLAu62z?(nYk7P2x)h zX;}-t%t&xE8lNU(vFz99m4>E+R&0fyR*7#0G?eN`NNkD7bRZ)y<3KPT9<5C$Bi;@v^0`!@65LBdm80 zv=-#YhhhX>6sy^XqD&VBiGCFQcTu@^6y0-CXnKAwTJNGzM1B+lW>B(rd}go=Toj7H z&&AbKV}}5N*;G82xqw_87qZ^)P?acQc$RQbizT~KY*cEiE6b&gIR3U);$JU|EAg|b zM;@uHFP9pHm14ai#~IC)jDyRFLaWAhW+ctQf*1}k`dAu2n`{$6d8J}$sZzjOFXVys z5}9LHaKNEKso1etBK90ihH;(-Z7ia855}?$bQ+65i6j`C+Oe@I&)6BCv8*(IWRa13 zFcw^0#=1FzvFRNfoA!*&`;0}+wHcdq1G2L^AX}HYSP^b)Gdngn5JU2cZHE}VSg%$bv?d7{MlboylS zbmo-wdTMgd%KGiECT}*Ph+VZ4$^A%>n$xFnj^I^Ez84UNj z7A7laMa=_oehfE$!^3^w$HPt2V7Tr3Fp?WBjk~@NqwN;kxas>abk|7Zit@enWhKZ= zIE4A(B8ItUG%3p*>9Z06&DL>R&PoWf!bwz*1Crnl@WlY=G9b{=|{@YJqFFRSS8k1!7YjC7qXA(Amq=HJ>nD3?OJv zMP5>^8T0U__41n683o4-j}E@V%1b2>-(i>}l-SIyS?s1p@=^%KFtgfSl+=PIpaD4c znop+d8C|iLaJCie^&c{L{s1I<1*hF9J3o6Yl|VMPVHDv>9Y_gVGG+KO{L3-+rr8)0 zXb##&qdr7?2U8xdd&r4hfdEQKCbkoib+;}#pZD-V%djgA?+=d2M_PXjtyIhW`{Nb< ztfC@s69eW40>$}xyRwcyx^`Yyx=7_$7TtuMjGxxL+ko@M5^Wf`G;l?QrX%vz@j<8y zjp;EphsJ&5Tu6R%652m3MupS?rEVWq?%Yy)@I#a_Z!X`deDi>cw9$(Y@LNygo9F#a z6?!0|VgpLm@Gle{7G3f&5ro!6Ooav`Y6#J^InWea_I3_q`SN{HhaZ73Df?d&_4x#MuzC0;I%!2bC}k~k!a{2 zM|`jvJpQW_yCU+{eSbVn^%C*wWniGaULtMvGHDOD)hG0i%z?i)-Z1)ihmVSK z+};}vWu3j@6Ro*IyvADbnvV84BO2R8QgytHe>IUe<&5uc(T}RD%fqdB{ST~%DQC}K zzrHuH9yo8UH~dl#rphJjt*?*q3wiPD>uZl+s4ae1e`ge(5*PD*#KL;Jf5=~NQD-QO zHm92JHkp6KZ-2N3Do(N+J>I?Ckw8vWc!F7a~-F`)S%C9~D z9~R>^+1S^dz1BS8_S0coMY##oe&eRa^+hbI31tNZIb`a9xr*xnV~1$w!@|CIVdT}5TiA{P8`P@fVwe?4^A9u0cG zUb;Toci0{YnhqiR8~KOXfXV-pfXEw-nYqXt4|>0HLjDiiV?pm%TZj+&M-qPeJwfT$ z0KuOA!}jjrZjO032WhJ|C}{IqW;=9psP% z%qN^-{NBplUk@SweZ$PJT)@A&5us%Ko(lgyH=@SvKB+f%zngGET**qYzH~nNqL>;y z_gm>lwJ7!r_=7o5U#b?>{dLmm>zl(o?Cz(|aA?a{A5)I5Z|M*3Oru_Qx%SEV;qS36 zs#hR@@jyLII=iHPb{ToEO{fqZ2pz;h?fRHH_lKCswP~sM>EAHQ0TlFaegynD&tNF_ zpx)3=TKg;N|FsPz^`hlXt8?!8MQM4zzHvwKJlAagfu-IWRR=sgg#JI6!^_XE&m!=@ z76JAi_OH%s&HzTadH62rRlYe7@4fi{$nahH1b+7F4ib?bl~Xj{O^ia)utn)+s`y4K z*EsNu|1<|4ZBO|0bQqrC&Dm znDiRNM9Z5j~wWP&Y^3-elmvrJuDqGj|`a8vZ{s0P@Q@*(ET~x3&FU`Vahy zCWsljPW^5ke*!#hnlp6nBX9WKxvSshbMt>3`ah1}r6WH}s*~T@hW}}!-_OsI=;Z&q zZTOdZ@YBvt{uE#kfkPN~{noW<_%}@*(0&d#TO=}i8F$Kjq64nt20*3eV*c8-DD`!~ z$WQrgm&@M%fgK hW&1BT-tN<{Pg=q@aq(oBc>DPTv<;sdANT3<{TGw(1gaA!++RKMY`+)c~O_QccXo6|l`T|MzhlNT+b?ErpCEx0rSypR_f&zGwQQOwLaNkhG(}t!*i=%qi#BlsyR8+w9BSh zx4i1CZ5A!BR59xfciODkQ&kUB({rtAy;`3#E0$aKta`=F6nupnq^b$sc6no!xpvcb zXQ4ICy`8ExidGF-sCpbOHR?@7`d%(AMaOEuYM(U~ 
z{NR0e-)9PbGi_S6s)hbVO9YpTxy!`v($Oir^8_CEtXebwYYWIuI1tE>RQY?(e{Bzc z8`mz0r8!+1i#$eSa?8p2`C%4opPV0#@hM}3-`%4{BBZUzUIZg#BPs)r0~Ka}0%Iuj zx5Q@LL%^_5%A0mlUc)U_s`c_r2{UubYBn16Z&t08cZj7bjcGfzf}(??Y^Ch_Y%^sw zylSdeEvB3~uhOXJj7&P!ta|q69IJH5nzEZhamrMxQ|?UNtKxiQCGXJTd~@14T=vSv zBS*4cZOU!l$~JR(!77Ved!0cM zO#I>mjbUvch9xbIyEa49MbP#`Ec{#;`PO&^s53e%07SrHrfmyp*OH1)(~Vr!zCpKL zbbE~4(ypiJ7P{@H+W_5M+?JM>7MHY}2{h@(`M5iW0G`*rM#xvlX(4UXtWLKfx=nW} zzu%{DTh|aci`(MT;_~wOcSTxGAbpsnLl+m3!*po*olEb$f1W2{nP>go<)tOc8qzRN zX`sRwvnX1%QP&H`7@j9Bcgptk=CnDUwwPg+%Q#t@X0v+K)<@}-kSB?5;B1+y>lxiB z7IblPJd(*gs()TTHa@ajr~lEh-Fkt>0{$g#EDc;9^z`xD{+TX1g&xU_{Y!lX(wBw) zabb__7IOH{2z((AJBYE`tQKpwZrF9tozpLSj~`}+W8vviH)cljy5V@P-jkTiHezxU zW_tHkG?uxV#`4ch4bibt5_J_KnodIptx{uQ$W*=Sz|^dB7;3>sN`AIve82D%}ULXDav5? z8*+(0zM6a3GdC;&*;NyO-FU++uGl-ehV#eORVIJk-kyS7I z*+xEL8TLmqY53Z3eG2jbilFDIMz-e|HP2}B3OYDIKn7DLlP!#oQk*k2qf(W0Hv_K) z!}v%(pBu}JjZw_5>VPaxCeXo^L(oJJiruWx3<3mj!1{O`V?&afg|B3;*6^v+P;M4H zBV%x;IaBm0qM>*=*_`zANVRM$hO%@89Eg)!RLhNYmZ>fn$mU8h;rebp-6B6^SHrr* zObj_09cx#9sGQC6oF0P`9$HpEj)HQ{`e*gDPCde->2yl)!uc+zc%Wq}899aANvGg5 z5a8!tm0a8MY^PDHmgWq6PAz$OhQSM=1KB-$?AdC`Giz4U;~R^R${1bo!A@6vN_SU$ z2QnQh$wJ90RcsTF>7EgYo_n76qtA)xc^uSW9jO=LiT3M_5?#5DC?VAoC2HFhB`kBy zdZXmsGU`V;8i*1N-5DicBV}4etJN}tC9;k%NvtPMm9dp>h0)83lkW@#KhE($oO~-?DI7O_f?BPQnBo}cgho|Ok}yI!WJ(&Jz;nl?WmB7=eX zScJ_J?h{*xgzxs!j9uF4!%;TO*y2_njxfE0uCePo;4r=44akRBtOMpoK-Ui9-RSl< zi`*Dv=jcTvUaLk1-~(Rr(0d;G*KR1V$mSSJU_KDWd^AGZ_(ktBY4VXA|1QT5F?zEN=Xt=b2wRjaY#|AL`Q~~E$b|&^VJpVp+3wp0ji8@u#n}WUlCD>i?7OXa zoZf=RK6p0C&L+Vhk6#CPU!UZEEXMW`FL^}r#}aM!Bp|Y%CU~IDo{_La@L-#LTf*dj zOVq7R?~;F;0==)zz9s3!1RDhXBly!k^Xw+Fzt!#M0krQW_%z$nV*Tu;5Oa!)8tg}I z0X+Wfp9}3U=&vE(me$?@b!;1yaCv&a4t=kk-o)NavS>@bylsPycz3m;>2{Zq1FKB z9}xP(L#;s$4|4e(@Q1A7lKu>}`uYje$LT5LKhjmNXji?WQL0zOuU9nMS+7V(y-q$I zN%-;H6oP$n6c^d&`QcOnFHrnj`uJ=A?VsAhhLeCd4{(nuA3sh2jt%-S`ftQbV$IFp zPU_gg4z|+^hkyFYEo0oq!fC*o=EKN;tSQzG)4`EW08a3Fad;ujcDj)W&oI}Q4E3Ww zp#O1p=Q8TQBeK(tMrr@zn*s7K>9GC2#$Ja0mWK4qqh;q2Un&ARwhEHJq_H4R}=Xl`)79$9HD;Z_DS9iC#e1$el^8zb~TME%BzU_fgTkd zKf%{S+>PM-AG%Ia|AIdp?gtbn6yimGP|h>TpAp^g$l+ zfgigv@$>uw12nTPiX8+l3nl9A@qL;o96zn+;xX}&qEL6OL0Cu_`u!!%?H2y zz(bU@kFSq2@L!Pu+MdO}(&^5G1ivpf4WCH-!Hf2w)#OJc@}@3gw>*9G-DQHS|2jXs z7^_GD;-T|Go^teL%C9ei-mQ}eXXL|PMX>qzZr-o1AwMh)o$3Ad zzpf!4lOHJ3(kHS{=XJOIHpIW0{{<y|ITKK^>vz(FZbIS*b+Ug79>HzY5Y zi$b$`^?^td$Ud!~d7nc}hNXR?2e0{?1tinxlgL)^qTlzBPu=oAlJY@bns>|JCHjZm z*~O^j?*(05tMwm+M|oAhXH~pvJSYvSQRR=&mm{a@qwJ_r)k}>*d(^*vU6gljEAk~R|(hfvb-UhMTI+zcU)AZh90larW%#ZJQoK3Mj>&JLK zi!f$*v!3tzRskzDFU<*<&!#u)l?0*@cA}bK14$Yz0)x4%V(+k;DMDIuOpHDTIk`dY0!2p$+gB zlHDs1;A(!|^y~1D>|$G#mqx8zgs9dfKynQN{9jC^~8zr*qL%@FNeLe83d^X3DfG zWee4a4GtH-dB{XJ=-78&M*@#AqfWqo+i?gEN#}9?myW|$nx8tX*XS5!aJ)1&+;hmT z03&Y!oEDQD2XPC~Umoe;|Pnl&Nq(4`I0fhAMMd15Xx3;ADkAlF_nQogC)vp7$cFtB?d7@z{DbB{#)X&>@$Ywkx7tN45{sz`gh;tq#%PkyUMgmb+W%MpN2g&EMjP` zHa`&YK11Uo;; z_>;KF6ma99r_SB>&va3HKb@ZZM16VEmx2Cc!XDYpXYrR7`20BRAjYy+E>vvYupQrB z(l^;Wk1(TV;n7Ps7INddQS)8BCmowqZF0qC>gY8$k-qjOGS}Wj_S%~m51GhjHd`}t zV<$q&u2Jpe*e&$rJ@5GClAf8!v1I~hP^WlN(FaCrw@==QGsjT{1T-+}-k#01QE&^W z((RcHaO}`26K_>c(JbH%%1IrkR#7R5`Zo%aH;vPFdW#kR#BJ;7&23q48hR-P+`Gv6 zx$2^^Ac9V#J&YE8t6Vbb2-KUc7&0AcG^SCP=*QM`3hiu*!DiMC7VUA1db7~m5hC_w z6lFjGhT}t85g&-@_2NSd-J&1k@WXOSK|YZ&*n`!HbP66;-GG9aK@s$E+6CLM85Q5~ z_#V~Jy+H=kKmcxx##0S^jQ%-kAupmBlRLY zQGC5oq9!*HC8Txolk ztX-`zx}`XIU&se>9t*|E8%f7@ruo?|iJs8ApVZDb6Dh=cB1KNQs&=G79yhzCe4Em# zAl7sy94z$|9_+NZcV&15P+?7HCo_{LChk5V+KQs{+NLrK(U~{z`i51*x3s!3?VmLi zGf#770xfL=2GQ7ZaDtki65}rOq|0t730cu~so>x@ zR4PhkmTMz6bUIXW51k)xNzX$8^=D<3_h4Z=i>@9f@oZ(dzgbG<^qgh5j*?7sds!~e 
diff --git a/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin b/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..e8cdf9d03a89109e7d9e93424697209d3422b085
GIT binary patch
(binary data omitted: 13240-byte compiled Triton kernel image, base85 literal elided)
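The dtype/shape/stride/device assertions generated in check_input_0() earlier are opt-in: __check_inputs_outputs() returns immediately unless the AOTI_RUNTIME_CHECK_INPUTS environment variable is set to something other than "0". A sketch of that gate plus one shape check in the same style (check_shape is a hypothetical helper, not part of the generated code):

#include <cstdlib>
#include <sstream>
#include <stdexcept>
#include <vector>

// Mirrors _check_aoti_runtime_check_inputs_env() in the wrapper: validation
// runs only when AOTI_RUNTIME_CHECK_INPUTS is set to a non-"0" value.
static bool runtime_checks_enabled() {
  static const char* v = std::getenv("AOTI_RUNTIME_CHECK_INPUTS");
  static const bool on = v != nullptr && v[0] != '0';
  return on;
}

// Hypothetical dimension check in the style of the generated check_input_0().
void check_shape(const std::vector<int64_t>& got,
                 const std::vector<int64_t>& expected) {
  if (!runtime_checks_enabled()) return;
  for (size_t i = 0; i < expected.size(); ++i) {
    if (got[i] != expected[i]) {
      std::ostringstream ss;
      ss << "unmatched dim value at " << i << ", expected: " << expected[i]
         << ", but got: " << got[i];
      throw std::runtime_error(ss.str());
    }
  }
}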
diff --git a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
new file mode 100644
index 00000000000..f283030cd98
--- /dev/null
+++ b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
@@ -0,0 +1,6 @@
+// Triton kernels are embedded as comments in /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf/c2axxg3k6hizo5jukgeoinhgbqdavmur6jy4bqwkwu6iqb3x3hb2.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17
-Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json new file mode 100644 index 00000000000..bd5d2c60334 --- /dev/null +++ b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json @@ -0,0 +1 @@ +{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file diff --git a/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin b/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d2228db77f98247a3ac53ee64ff6864a708d7d4f GIT binary patch literal 9528 zcmeGiTZ|i5^?1g+u{XQx-L#}#=ws5xkcu$t8Sk#0q_o?nO^Zm>&=e4j$}qOaYlC;} z%*;C8Y|}ac6@*YL0UsdoQT2;Rzz6CN5<>ZCtEl3mLh!){QWZt0DrvOYCIr-iIp;oR z$M!CT-KL_VSJ}DuoO|B)oHGyS=?4!UP-3y{7T`mH-#}ZeX?o*M4!m(2pC2E29A+U8 zgLu6WVgR~d@dGcg3%HW=;yj1hZ21+p)B@dY>Q%Q~2R(3FfgX6(pzd0Ydetf~_)f{P zTsx>X9jjmm#fs(Dy?LwV%vA$iT7hR*-Kslh8B?L0%}MYpw&#@I7a5-8J6;ooWteSq zwR*v>p{vz^!Nt1k2eunbQT+6hQ&^a@%AP&%NT61AaqT%Zu^Ga47s72?oSF75-}ZgZ zv<}|Jc=t1exJJ;jf)-jxdhukWm3pIEWJux`DMu^7+G$}|=sJ9sOdO9_Q8|PjswOto1ZBxypJFnx#8epT zE3|8f*%86U#rl(YE6|?b7TMv;hY+vAcpv`Dh>u0^TLr~e8GWjczAoq`os@Wj(Mu~< z*}l#21FHzJdXeD=`|y8fczG9G-TsO~_7ANh#A*TY+s0Po(-tQDg3;g5N8iD*Q($RL zLxuR0G^d77wcEQO(K*%Lm0(K%!td^r!Vywe`~t$9H!dUa2Sh2*z-uh_&gc+cW9S*g zCWHCLWKj2tm8x4>DB{YRv;BJA{d(1&%pC_P=8oswdJFGN^L8`mAImm_R>{san~P0* z&d)WzQ7Ip9RJ;ZA#9YZPPiM1r*VG$}5c~C?fAI3)FFyv5MA!L<$JFuI%GFED>Y_3h zyQ(V6>kwO3TL`RNTzPBdlBz11nEI5WA@*rpuHgDLh+V$;_9Zoj*sB+pFJ3{Sb}V*z zS!pZi#!Dz#VF}so7@{MJLERoxZ$Lfj9dse}YKkt0>9T_^Pm(5eJ1w`;<#TxX>tEh} zOHJb9&E;jbbxp0(-Y{LrZ1qcYVKUjpWkN;?S*A-GFBegs7E^}^CdHR8ub}7T!V0>g z#gx}$=m$keZ#te|t-D%Y&*Afs?aeuX=FeMuGB)UTse~QYxBTh}N1LH;DLSgAV~?G4 zHAB-2d5w3_qlWQ>_E~Ld&-8AM{$_H!wLHyv{1dob260okpEy<$>a2Gu_A9I6hNi)nmED<<| z>56X|S~A_ZJ9iJ(9zzw-rDWQ>+sv+_&@M!k?lw)t!Onvgz7)AdtAH;+Ze}mFj9Q78 zfnE@_23FnC%_ja6`_>RVvw6V=3MMb`rbFh>*PHqRcN_atx)|8ilHNj3{Yp(2@xX?a z_HZ}1mPv@;tgq&pzM|I63eKY7sE61U)0hV2pY4{Udj>Ymw1<0!xu%D>vspu?$dI}h zYQ%P&_E=br48__B8hS0zeRdEHwrk+kFsG-caKBJ?#GYqhcV~xT;%&p&hsWf}^E0r~ zrx}4cgASHTArF$`1{a&{_8n6dw~*j7GpNa5D1=OyYa!D##c`pgAFq}|OG+k-8;M0u zrcUNRah9kGa5Blk@DNbqZY|S7+-$mGXjnSpm4s4li4~#YG+8Zi)fk@$t*RL38h3r&%oqR?YSU zR?nQv(0jbVQ&`K3gGQGCl2^2g701FyJCrAUG(#V=+(#BDI_R!2awiNbOAKSLFNofN zFUZyB3&w2YzGOpRW+J{|9d-Lcm4_+8Qoi38s=y66Lu`HC$TRNAum9|1rb2J_M7&`& z+I2L%3QGoetM6>65;x)xvi13cRkZQbnGXFi%&0@y1o-)sdF$s%Mpl=zIWxC^`o8^0 z;UDt?-EQEAqLy9`PUw=DH!eD3>K78rV!`Ut+{fx&+&!pGpx6WX3=L!|RhxQwp;nWe zCTHT^B##8tX@pdhwL2bX7UuH2t$VKAW+lRDSdXJV#XAd@Z=|zC8_I6hXJ)jMMC&<0 zcsn%quHN1qw?ucxllZXgJFGn<#m5A@Q1F~4K1KA7lYOM@n`%E{FXQ_K_Zu*87s-a? 
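Note: the generated *.kernel.cpp stubs above carry their own build recipe as "// Compile cmd" / "// Link cmd" comments; the Triton kernels themselves live in the wrapper .cpp they reference. Below is a minimal sketch, not part of the patch, of replaying such an embedded command; it assumes the same toolchain, conda env, and CUDA paths that the comment records, and the file name is taken from the diff above.

import shlex
import subprocess

def replay_compile_cmd(kernel_cpp: str) -> None:
    # Find the command line that follows the "// Compile cmd" marker comment.
    with open(kernel_cpp) as f:
        lines = f.read().splitlines()
    for i, line in enumerate(lines):
        if line.strip() == "// Compile cmd" and i + 1 < len(lines):
            cmd = lines[i + 1].lstrip("/ ")  # drop the leading "// "
            subprocess.run(shlex.split(cmd), check=True)
            return
    raise RuntimeError(f"no '// Compile cmd' comment in {kernel_cpp}")

# replay_compile_cmd("cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp")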
diff --git a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
new file mode 100644
index 00000000000..bbe94294805
--- /dev/null
+++ b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
@@ -0,0 +1,6 @@
+// Triton kernels are embedded as comments in /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms/clxvzwn2a5v7ypw7eq6fysn2555bpqqp3ckvq4a6v5o6aba2rxov.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
new file mode 100644
index 00000000000..bd5d2c60334
--- /dev/null
+++ b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin b/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..9b7c06c6f791df59d0650fd339ed10f850f64651
GIT binary patch
literal 9528
[... base85 GIT binary patch data omitted ...]
literal 0
HcmV?d00001
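Note: each generated kernel ships with a one-line *_metadata.json; AOTI_DEVICE_KEY is the only field appearing in this patch and records which device the artifact was built for. A minimal consumer sketch (the file name is taken from the diff above; this is an illustration, not part of the patch):

import json

with open("cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json") as f:
    metadata = json.load(f)

# The patch only ever emits {"AOTI_DEVICE_KEY": "cuda"}; fail loudly otherwise.
if metadata.get("AOTI_DEVICE_KEY") != "cuda":
    raise RuntimeError(f"expected a CUDA artifact, got {metadata!r}")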
diff --git a/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin b/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..6e21efafc59f39c347cd5fc3fb16d696fc03742f
GIT binary patch
literal 21056
[... base85 GIT binary patch data (and any interleaved generated .cubin diffs) omitted ...]
literal 0
HcmV?d00001

diff --git a/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin b/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..bbc7d301593f72433c5b7626638d0e6f0f4a0813
GIT binary patch
literal 11656
[... base85 GIT binary patch data omitted ...]
literal 0
HcmV?d00001

diff --git a/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin b/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..c5783aaf76e89eda9066225eae34d4470b74a950
GIT binary patch
literal 10296
[... base85 GIT binary patch data omitted ...]
literal 0
HcmV?d00001
zJ+hlg;gb;f%na-x#zM7_E#`G2UviyQeH+K_0cKPzJk06F%G``@R9r{zO360Vn_RgW zpSmGt5;w+7^2V4+-54`7H^$7Y&rB-0osyL`HRn@!9ZNUO?xL>~r%GmK0 zY!!S1(+4Iihcid8^*Jhwzy>Cr!!z+V3a$ZVA5P8y$MzpL@wKs(Gqd=LSc)H_6RT*8 zEofvVZ(?pre|!fv-4l2B(2YZE2XrT(n_|J`OF=A`YsQKQK7kW!GUr+a+o&Vp>Qd2= zX`!$t9ikuXk%d?4iFRQpecEIW^WuWXRs}u{Y3fptE+IhEP6kVgrYoP+8lMXSZ@z6Cx z$y+HJO9e@Gl5KioHj_+eGAT-=qe>}>ixBeQN+FYe0&YY>NFoci?J2}f47{L%MCOWR zk4ly4B*Bw}FV&T-N8p7if=SLwH%}MrykfAWCx`@++@RX3q|z3Z4KB!f1%kv(>G3*N zL+uV2mzep&tkc_X8jv_T&JvenB|GFsl;GCXcplI%pzot z+1e~m*^Fm4*)~f#?97EoFLD;h1y8Vq|l#MrKz1(_|hxBAW!Jyy#~{^gIqKhmO>XoQs<8juOHD2T?+*D@xemn%Yq+-GNp7%u#aBGhUQwUzC|( zlnK7GZLuf1!%V2s{(P%RVqJma^aun>#<#Ve;y{Xfp(f(KSQ9u)I+rxTE7}&vQIBBN z_W7K_LaZxTFLCWA)}eCBl}oj5p>?Qm26aOvFT6}%uW=p?TLDzAl9I+;E?d+$+k zrYSn#z*KBLI;);?T*Ip1ZD`$CaL*WunWs9L#(CPCA;k+RNd-nHt`tjgm-&H#Yr{|A zCmUZ6>^w@6e0WF!2ZxSj;?$tcF4T;Lm15CT$+Zz1j#)flbkX@H7xWYk->3LH7jRBGSC5zZ6#maX&=_ zC6Wvw#`ivs$Igqxe^pHB{2JkTNZ>>#DJSoGk86oT?T65O$)=~bv-n0?&GUnJK)(F5 zvRNlz!?SL%o$X@fzm0S=1_gHt?sj0=%8?HPYXb;`MW&&my~`EQa}T81uOZY2%IFrP1V5IsQYApJ1e1 zP@{KOln>RuEGI#{(n!F$36uYI zvED6uh5PO_=wmH*LDI7^HVXQu@TYb0!cAm<#KCJ_wAQx)UwRSz(H8rG?}Fn5|z95Aqt?YYeh05%3cgjriXudHHUdVCb8y#~5+Mg#OTP z&VYU%2+9}NKg{(H3wrpadL*zueS!7q>!bCFcsw1%e%%r{#RpwV^PmPtmmvdg1z22Lh9L-eQd%Fhky0be@dH5!^|4TUG1NI%c7fSK-SA|KrUeFR6u zi7+qpJ8W3|76bU5Y0#NG{sjFFN7JZc_^MxenBC!sb9OgyRNoU$Op_~)dVUe`5c7Wh z0sW!x5F@pQ#;y&q=T}*j`1|+b0wB6N&xn3?h&{Oa38Qnp)p|~27l+umM~BdV0XEH} z-@pG}^Vp-0;YUWlV82L7iqK-*sI?@qVoCHRcI-V~Mi{f+i@ zFZo*~QXU?jNKl@BISrWLyt=#=m!tsk(DNEUUNMeyF2>TJ2XzwZ1^KYHB>B`3e!YeKn7lT_{IFNvcgK2{AXVBNVmAIIG47WHwMP7` z_c6O-UYYmdXnI)6za{Y`Pp$;z_h4ZVLYIBER|x82@M=WKt95)5!i1Zbd}`38yjo|n zPZy1nQ}|zOA)o0XPbC$U|Kk?&C!~B_Iz;yQbFPGy{Rrak<$t4xec}t+$D=5(U#f3^ z`fPu@8AUxluwHKVfO_XmHY``VtUmd#Z7rt|7#4WVf23#YHC#ZBlLFU)cSDczD})| r8vXY8gHkY|QdX^#8vSwZQ?4q=ltK#bI$d(PuzXro?sx5*}%jVWWQ$fa!$kBw+Z0wVKF{@&BJVh6hHE!n!*$ATt!mb5Wpi?-VHZua zYPscE+ss>Tp=4HT&a_#vr^+s-rt4VcYPmXPmMo{}TGf)7%6JObNmT>7?c(|>bL@ug z%tC9DdplLB<*f>`PQ|3D}oC5B|IN@dEi z>Lnh1-Kn`Xcdm{IefpB?Pyx}YSMvTz)32}?RkuGlZBHAu$;pQ8ioTaiOVP0!u-a!$ z1wVM--S?S--%OfTrEH;p(GvdUVs1CFyL5C4?>vFWJ+0Qv|JnlbmR08l@E<=8@*Mq+>0%qov)u`2~hssu>e267VwP`!iM$y4hv=Vl8wvn)E zZaGmY=M(igw^XZUj8rnwD7*IUb*pgLnz9>0al%xp6V6Q4E#rJ-h4P_8Gl%W0Ggq6d z9hrNq<~%l&ZDwZ=S10Qkw>&m?sBok>b#$(1OB{SXU2A4I-2UIFTi&I$nGbC_wH1KPEu;&JGE?a=9eJi9~B(U`-(#ATDfje?#$zWbl)qEqORRQ6x$%aOh` z^zRh*$Zjr!|CGSzMqvjrmK)`K#nuhG>N<1!W$*DL%&1#DsI@BNCCq9yDBVt#Z+5BKStBV#pMw zu=@==M9;0{9`?+2OF(+X1YkE_Hwzh9=m;D28%DoS(EgZ_bi@o|YNeR5cdwgAqwvV8 z7QJjEpRf%3>9JAxT5~)K@&JmU6A2%&>PE#i8oYva93UWrDU%(~q$$jqicu;{vXg?- zf}Tx{<z~t;IEuxibnZgoTLzpDi6Q{~pyIW!Ovf|_$L(YqHr!USFZ>5fv z#20T-Yzn>gq}_fkkwUB|Qsk7YYDenjajhlgWlANzSW{_#u(YS}-klUjTbg$OHP&P% zo6hbZ+qYkAD~is0o7&7rXN@C{Ygl!>sB9XO?omTA^E9Wi6|Tl0Hnwp%L7Scu<1X{0 zV`=#a8L{nB!@+OLkvw)Yw4Y68?#U(Tg0^CL)|i~BRFujL*G6pE>CnkNbiTYL9Y;6c z(xf{Xz6T52bXL$&p-4PiY3^@^QaL$o8BWzB^JEEmo{F8jyE!;I!`1LyQ1{7)juaj` z#Ia>dg>}Krv(F>t!8Y*G+_OjvV^Yz0qR59jLhe$pmD*^v&W zJqKOopvVMwVl=s*`=Km*Jk-%BAXndJ68=>!mArb27b_$Dgi-FlpDs!&EF!VOdD#%7 z!Ulv>$sA7j(QJn2Aw|bE1*x9bd`7rT9h&?Mqk;iex)jkl?N{XUhRf7_mCVrdFkino zwK@-0|53U_=f^C!h*f$VF)_z>y?l@EYCmV&)gnEV9tVrju=xQQmG4EZeBQBV@j$<( z>v)eo{$11>wXMm8DpS2ez2Ei82Uw&7=0-r*4&vSDmKF=$7-8q=MI&CT zh6V??FZ7;={hFK=6*Bi`-JFq=R>^piaJhntZwwOWMhyPutW zIvC-#-(n1~g%Id5PLC0NAs!Al#hPHG5Bd1TQ3Q4h3Me12H{2ZL{DVS&aJV_d;UO-+ z75g1%Es06%JFqLSDVX;m&%6I_h=u=}^pz=cWMclcTuE zKF<%Q3V4CyAL!$+{kME(fQ`ffZye+vQ$Bta0~{IhVD#UZ8^@ZPzk}4Vg{^Fx8w~#B z)myUM#=>d9n&!dCf21MS4%5Mri~)}EdU1Fm$hJA55YI5z7Z3EKKcN2!cIPtczcsYY z35RL_;+p~TFX^!TfyQ2e{-%cX%%f%J5nmz%I<^XuzofCZPAeGv9|9)+xa5aM_@`s* 
z0{E0{FmA=r?=se80i=M=qHMgU)b_!{px4Ae!&la^|PE`z^@a3jPGUq zD=@4F-O)ro$o|=_1c#`fxqXs%f-$N;hhIyun;lJ~it;L=exQd%$4~I}5OqTM{)etp z)W6^l2Kxa;359r(AC&XV1UrZPy%i7jJB;@qn@7HppQsaV34M@9eBj5fO#D25#2*F% zDL{UqZ$pm6L8tt`0r*}b6O8=A{(amqAU3bD3vu?;9P5Js>)V2jp6F`7Blkrr{BZ-+$2_w37UoMBdaz?3Sl*zPn6t^x?`}w5E*FJn^XdbUB#?bt zKl2`kn2bpKME76wHw#Fn(Ib&9|3$y=A)mVCe<o8x z3Xk%te$T3S)p$@ERHMotp)W^H)koP;qpFu0{r0GTy}Bp?HL3#2(Jx0RRS(6kMpYj* T`sFA-l}E*|MwM4JD!G3EVVCTx literal 0 HcmV?d00001 diff --git a/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin b/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..4f37daa42cedb56958717d999109c3a635cd248f GIT binary patch literal 9528 zcmeGiTWlOx^~}uL*>!BMotCtQJ|=AzToJ}Q>(?eJt=sg4NYv045Ut8EyF2S0*qxos zj-7R!w3~nmLa3F150Ln%`b8w*1N8?9p?y_(ZgibRw!oUsdp5@nFtGv)~N|xo> zp}Xi<1v@NOEUzBatg18bhPbrCz;->?o43qtBxh?98WlTmN*{=fz-c(aA`0WS%~$IM zyNa&5A+xnu_Zp$?g;NwiqvRA8=B;vI*Bl8{T@TlRQx%&LY>Hq3XY^e##BYSlYQx1=q1|vNO z?FwRc#jpvn{v_TC^yhcPcKGr^#A`6xga0z(l^A}zp!h1IPxsK*1-+z`5|1%@X~i1b zw;8@~4Ix%9GJJmz{_hMg?}BUFUs1{afi;9!Eg*i!$Xa|_!h~Ni`s;e=+cr$-7$NL`8Z2y@)zL0d%~>%6y3^9?dX*6Rm){})-L8M*U{3TF+{pTON(*Mhk0i@7%J`##Hs;pjqdqrDPN0iH& zs-6X9MQb9kdSUgg)weZG%_`bcs*c#Fak+}?*Fm{>;hncN1+kYetX#N+L@h54W$Fe$!xaTPr$7go^~T~W^} z=m$l}SagDhTle(5F^kVbb};XRdZT9T&Dvntr4n}7hShM7IrYvq@_fGH8>2GFskDjMFkADI;n?>9d?q^Tz|97&e8$V&rzOTGI$;+Yq zy<8vZ%}?T=$?^Fq)Pp{{4YyErbi?t&U`gL>uNw!$xAA#NHx_25bi)q=y(b=)O^GX-I8wR@~}+cJeDiI zW$3Alf9LF7c=i~ofG(vn!JWCuH5A%~sM4Le9O7X2K?`4syrNaW7a%XYk6K2x#E*ed z5VR(qx+k_;_)qR%NAS$n1)C_Cyuh0dnP00f8VlTQ>`$3uXuBn&iJmqpRYSxB8&)R3 z-P!d_Li|>J&8_PzYTc^fNfaFS5W8Xq%Ygi|y^?g##HN`EaL>%G>mlxJ)sQJNq#i^X zu^ndu7FIJ$v35e=sD?&^CBkRBCSG&X*?b=N3uQ;_1txZPmW&+UHq8B)ra!di=%rYVjKRpY2ziYzJRSlmeLlu#$@ zpEyf&3UJDigHaMt;vPNQMBH4)H+4KZ;+2F-ZJ8A6CotTMU;nh8)hTf6sOG+)iCwB$ z2udciy_CcF2-RpA+JbH}-NnGML&vXI-QtpgZ#Tsd@A&v-;9zdwK4;M_hE~;XgzP+X zF4O4p0<*B57YEG_0VJTK%pnVvnImGoiOQGVi>!9LG&hk zL9QNOFlL+gWis+*Cguy)U#Bm0@~|XW$@ls~CvX$a5L=HoGRIx@^`B*CD)MG;%o}z_ zJ5s|>Vaeof_2h<5;%59owjO`*6m32`(~&=BF76OE0iK^SZ~eT;$m;UsY;N|z^gRcV zqH!b$4co^LMNOj|9y25{Z(NjP>K78rV!_TOHWIq6h`R^12^9N4K1&0cO75ajUZ_?j zXO1)RZbI#jI*pK;W9^Q|nT5GrvyH%$+w6#N8rI{ePw~!z$2Znl;tgdF>oYU@38D?0 zFuEO@`_^vnj#r|);|Y9NZaAzxq{PPryHE(6MSP0r9w&N8**Dc*!fwV73hp(aW*5nZ z)N%?Pfy6`-hH$=~)}wq)x)VEM>)8af2zSFEZQ-}%EWo8*5!?sk04t*roB+LJUxOPv zVD?%V<5wWrvCov?zIGVj8CtIW7Jv=6{I!J)FsR8uSiK{>yQ*Sajp1Ov3kmb>MR1UoY<4K#tV2+445ya++wdJ+(x~ zl6{-Fdnbkw_fIubFh{gBUQNUIo2e9i?MeRa*)*IVcu_(TMZU!DukV z3~uLf|9l!g-UOAIM~Ojru!-p`0QiMV8rQG3;afs}VHo*&zT4x^Z6o_h-&w2&rVrtf zAWiaH?X&G|@LC!&&Hi)(5&bIad+lL3d3vy))xwxD2rmp^JfS0=YB0(X6eNsarkP;; z3F1GOR(^F1zh#{g_Fz0SO?8Omt89M)U#kVvnnBS(+|ABt87s&r{g109l z__j2tSZ^4(MbFiKA2d$^z z-W3hb-H+n^9LvA{BewtfHJI&l80{zhtbj&P-|k`1 zgFY62Mt|m3gaP9X&bn7>1JARimpkuId+nI%wZuYU|16XyGnJdf0| z-b}-TOYZ^on|Aw~8oZQ-8>4+>`VhjFb>NHl-D5rQ*yH%)0$Uz_tW zt{+9~e{82KJ(9JK}tRE4G#o?WNv0K10N9-OHBR;~{{#*=4%TVyYDDcEjw}Ni|AxsR~ z)T4j=2F|GSDXJ04FY`#hml1AG(8=i`!7uYn`sscn`6T{_8}NT^9eygQZvNkIz~2=7 zSz!?AXU}s2miph_fd8lK=qEqB_1}#A_^;;-yd?N3=J6rHM_TDA;O9=%@YoXx+piF-2RXD?Y9uKsI&r|D>ByFm?0a!RrU^jSI!K2)z|PnIEY~&N5$ej_YI3OR{hzgP3KW`J^4Ft?Z9FUEINKf{9Y$s7;wM|i_P-uCm0HI|sC-%a6cR9N@ z$8n;VfTC1Yp|pLV>I-cj3hGOx@=!!oMdBh<^?}Fs&2kY$A+1aT1gUA=@Be4!?Afzx zNK@3R5+ixe{I~fp^UpuC=lJ-+LthVvLiwwOY+>&%xOyY_EzI(El)mLLK>j@`ICgTPq6JkFY3S)3~!cFGI3 zFeT8aH!&t%0l3_UXfBm{^irqX=}4h1@MXkwP{6~ryKbrLcKv#lQS3{fnOK}^xzU(& z%I$d$e%onvTixY0%qCi^+3A*>-OOi2StiL9_y14Hq_gP$1$)749X{N#yE1Q8K59C$ ztW?6HX!!nn4&GCe{H08(+^Cm3!{hZuZptn6DYP*@sMgWX5znMyply1Gk@Y(BZVYlMeF5PtW-EE1u0MPG!`QZ_oK 
zc>jWDShUfHLVr(gLmvW@2<_=EwD)vdPIbQCtSweCD$SKUtyc5VdU=oYm=IOxv2LMT zd35eEyV|KN7VGtVcm8laZ+CJFPOCcCXieqpDXZ|9b>v8K&T;IDJ7>0+Md;OcfAq#x zzrl*d;ks1(T323avEZe6Z; z7UE@+A2(Q_d9MP@)iluUP@C^u`G zKbK*C5Us*KQ8{dD6}hZO*llF{WpbL7hmMxx&6Ov-5}e-WrxaL%ZOf5ixM#J^Mpr3o zXIWVe@|cln_Ty%UeQjr18__a~=97~eF*{k}^EQVyQUQX${s83=ei#WlEbJptOr0zu z0R#OJl2%hhi=cGo9WO)_QZ3;NUEYf+OfN!9FGP9isGQRBhbYf(O3_?Dg`FgN9{PQA zaZm|QOXtIr?+)Q-`sZQK36G7^C%~RF!9t9EGqIganCVx|o)aOLgKlp*^^!@;;A=6H zoq`d1xi#Iha0|M$6n<|(mj*-27IbMhv}{3_=0nSHy4)0bXxW0!tC)wDE$9dgRsdQQ zT`ZJqW}~H2+9_~?P)akMnX$zL4j_+(RS;t~lgU%!iG{_6`9!^DYnB>U0iFzNz_O}B zvHBX-DU`PC)V?929H9b~1VojLde9c3t)QzZ8P(3x>SD{vU7|^(Q?+O!Si|Gc;+YpF}PzQVwbEm!f>oq<~*pS}RSXWsqeC zkw$D`c*ezCRmU!O?RKkCuP&STeO{$EAmxU6D7Sa7y;QGuOO0};%O{AG%9@bsRV-8y zHIuGkC{Y6#t*ys{@Ui|`SZLs+k@pAlP_|DkSy3%l=j{^i+g)?1zy*5+4sepmWQCVb z!8nGqi5$xrcNrPuVM5lZwECS!^sm5aNDVlRCFYrW4(Yjj7%XUj2kWSv&RvPqoZFN_ z4wsQ>&*`GiX(S_@?%%aCQ0OUN9V&eO4a>*^s8n_R1j=!T2w5tkehOPcl-x9>v2XXY zc|^%i2h_05L@7*D68mK2mIkFrCAW{n^^SzBso|BxW6;`EC1Gu~ESpoyVnYoSAz3R6 z@ID~Q=4escaD$?peg(uQAj;NwqS%mwqVNSlMF0b$yj@ZBq{VdC7?hPO1&*LB+vCY1 z?+0b&Vj?U-SvJU%1$9tXE;r&5lx2fFS(L~@S(y+BOi-2$@??48d$+ryJZNBUF@p^A z-4l!KE8TWf0af?MBKtwqDJwu>D|3*@LJMtBEdmpKKZW{|vdBIc*~lW@?Rq!WBD+Ku z_V0=!=bRFRCe%&!GENn)_(PLUaSF9*Ww3x9^IuQVE4QX+>q#txg^6fi|Bc>R(ve(MpmCF>SrO_M{_MTmxhS8O>V|JFCRU?kceTG^b zbXpX>?(zqKB_qnc)gM<&h8!aOakW^Z6tvVatX9xsJ17}~zHtPVBpt%Na$|7_EoPwI zjnhXhlbRHCER;1Pe*^W?XqPhaE?h;2G9ccPAy>49GQdB{fC@R30g;moSp1<32%coH z3Ynn{2&81dRv5~Fh)Ra(88TAf7|t8WS9J_iAow($(gzEjxiiKwI$OY77{)v8=gnq3 z=Hf8snLg&M$7~H_#$@VH=c7O8aZe59&h~NVJ?`nD+}?$O=f*`s`8YF_JKr~V(c^}R zL&BFW^s!r#-Lbpg&*SXg-cNVCS)*Ue$LOn;%@?t_`aV;xR2+K=uLy$e*Z?bk!U zLTqgJV18lggF|AZkDn)jeB?g7bG~U)L~kAuKftT%HT*@#60D0~=7KRYZiwi%5pfO1 zV_}S~D9y%=-W}86pX&J!^}K{9T;oRXYW;7uZ`lI>wJq=`RzjOR0V%ybDaM!8+uz#& zHxObHHv?Y+%T3JtbnUvUU?50UlqDNE&@FT~CUixY1}7!vj0v7U(Bj zd^zR!i9o)Y5GM(Ix7@2!;*_!g{4vCvoAP(kO=3dI70mb6y<*ZGyV`>xA9KlDG~)k5 zN<29}GR8C9VkX3kN%3Snn)KlHuy}1*eOoyNncHE%J34v^q8tylAMfW0=6KTjL7FE3 zKl$sXN5LWE0e{l&4*JmI#gpPs8O8Aj|G1mpB=Iz@2g}13z&iezFP+ga^griBW5j>p zH1gw*snGVL$sGi5u05Ng^|g+s)5@F?${)m==}C$Y=C_U}wYeh%PrzQ*FX_<@|t!?uB<8I3D-x%T-q4f=aBK4nG32%y%Kv3FN zlE|-AN=2Og$NG?tFRO2Iv~ABM0VDdW7!$mU==1>$yadk^3~N^vyo>mWfE8ta zf{|~lewd;QkBE+Xl6K<>14G^~uip!JTVhtj@9=(zOaHLZIp5+5ipQwhFKPFh#3Mfb zw3|$<_F#(7HYtz5V0{5!BVi;e=9_f4#mS;D&$ox@Nl9Pg{fG5RywIBu>zz!T?t>GF zH4S4v%iR%w1TZ(im~T|h*JXVt$Hg{^aC97ne*Gz*K3+e_kH$|+X{wi057hdi9`O9B zxPfqUd!6)M7pL%ZgMX85n)93eL4H{7_3^bHJ!-k*@mbn%)R;fP`7QlVjh(FHb07Ml zuB8ga@@HZK@!CmrhSy`_p6>$Swi*#C1$6wI?!T(7&+*@DNyOYZz?Af<| ztGH*mf4;kyF9|w7H#eV(h|?)?gGUG7{{Yt3eewJv0{@K&P^qB$8f>!)kHk(hnrGAIk@?G2hD-tRHVm`v>$tujCD%JPoyey^d!TZkC9?_FE;d zQDi^q*K&IO_d!3+Ig3Ak)AE{+&`tet)Bh7h_EY|^YPxzJ$CKv#Y5>1Fp%A20HHsgd_5St}0FtNq{mqm2BQYjh z{cRlx!hOY$#8}H`e4Be%{_xoke*J&7foa4IN}+!0_z{|1yAnArE&7LXK%tJee){#1 Yt=g}ESPxY3_RA4!D>)q>{nT>*0?5{&i~s-t literal 0 HcmV?d00001 diff --git a/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin b/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin new file mode 100644 index 0000000000000000000000000000000000000000..cf975523f82abbc0139fa3d9c7423217617869cb GIT binary patch literal 13832 zcmeHOYiwKBeLv)-MCxIQlAlWAn)amDfdh&!kyPx)s_l3PU?ahTVOz6|p+xH8Q6zIo zIajeg%UKX~K-;X_fHv5dBA?m;!yX+F^uw}rZL^@juy)&rZl3}pABw~s6x&;ZHLdph z{m;3Vmk%YDmkk(}3;5o1{_ofMpL2ETs}G+$8w>=}VWqF2dR1+hF&~>5JxL_=^Ygnt zujW)<_2Ir-1(b?6mK#mGS)9kneXg(SxjYMvN}xpEEnx^X$O>cxly(+f+eBicDZtA27$16XOyM# z{Mu4s(Jro*Ra{RhD^-lCt7Bel15_^-TKGbv*l1{`ZQw=3w41?$)tgSC=`{U%jZxxD zo0&+?La|a=vWxX)34YzKHEYfFI?N`nR&6wk)n?+OqO6PLuJ8Ysipi~_$5+d%@!H~I 
zqufM2Ay}p!T|BC+P(V`L`QzswK3~xEg+!rPSt&Mp$Lo{Y)Jy0?YV!`d$j4N=Ddfmf zqg|Q$CLo7>(2yDb7|+<($9x#y5N)&bDgtIHm%D|n3cRlcFrz;fcz+lCO@X`JyoKHy zr2N4az>JcBk4Ia6Z5Wcrgnqn>ehb&6fpW|Xc3 z2DrLH=b%jUhN6bH0zZPPj#(h^vynmE1AwT&baS;n-K^OQ%PZB=+5$#}rDCI2t6o?s zPS=Y{E!2w_nw4gKsb)LQZ&%ivYt^+Sr?xb+R9n4Jzp%XMRF~@M#`64;xy8!U zsd#-|1^)3r|M2_$Z|qZQ0M_U$6tKnuH?6^7G+;dwOa!a}j@E-eRDm0}ZoIpB%X$`o zcW>VKcNtjy!B_yJSb%eV2Yo*|Nl0h{_h*7%?+aK5DXYfY7;bOhc<09L+mMYpwcNB8 zDeNB3{iB2jdrff&nkCX4z-{y2tN?XgB{YB==NK*o-vsSJJp`?2ULe;&kOzNgA-a#i z>7Zit+v5#@FCNO;9k6zh^E*oOM+3L4m#=dCuLAkYtBj6i6SAAP25*euO>2U~-{NhA zxBa|b;El$^OWf|GkxRVYt6>;Ta`Yr`;>~wO&#R(S7+2`?-wX7zD1Q|En>tt1WP6zIh^hiCk$l&B zRlvgpydt=N62LVXe~Mrxb#qgcY)W0+(8(?lE?~=(w+X^;iGY8Rxc-TZe4tTPx33E7ZIa!(DP?W_CdRiVz&C@xA4CCh^~5ih?FJ5Wv3xv-GhfkODmP<| z)xvC|sN%&^3EN?#&{%o49Lw^E8ONcpY*%Au_l(7{F)meO$!sh>pN}n7YV*ZPp@FUT zi6@U!_)24CzEY0G%hjg69{ccp;HZk%i`YwI@wIFw7Oyw$SX@i=)O>`5g76m-4}4sb zl0Hf4)SYQh-33qD$MfIwJ)`dDN%HAxbEOnZrt<0%r%MPY4&SkXsrrdj?xEubHV>Z$ zrtA~5i53Xz0V*S%Iidmx)fM@A+<(GC(`e5Y6cp0jNr0AMyagW0z6~XShhW%8CE-q zU;yEdg$^p=&;tWo@lG+U!UDK2u4#!Yu_?sC7oxOwRR<5DXoWQ`eiDyx`@rHM!EFIe zOF#)uLbH#YZrcKHmo82d-k!3w_$=^twc0|#rrITm%W2^0T1#_ATsDsTliZM^ttUN; zv(V%w*+puz5rSSwZ94+d3#lD|@LovqA`RiaklGOl?}5|{gVE`QL>Lid@b^MW9vKZH z)#_R$zP#d=SVlNm>G_%b>})!dm9hm*r54w*urpc!C{c!uwRsP&Yh7cJbP?@illW3W zTGoOuGZNg4#;3_xEcx|$rJ?Dd6SSF(Va1M7{$d zQ6VCk=)h=iiJ|&-!)Nr;fI81`iODl7rLt>&Nv}{b(-`CSapthZZqCOngd+=Wth$yn zpgSEO%Aw(Ul5;Tt$J-3@xy9}>7Sap~5cu@h}yb=+{xsz9t@A0At zx_O0U28MU?nh@zn8rW`LiO38r@8H$T$kU6`>gGjqr)=i3lUJPac-d0eVco5r5!O2g zS_|^yLotFbiq-5xQKpN6L_doDyQo|{itf26G(A5Tt#?r9dzL-EJ@z z*Ik4+3AQeST~Ubd%#ID7X&IdH8;lOpZZH<_-53nEE`!~50E4qTHaOcdIOjK*Cr*F; z@<4tU29vGJVDvP%Byu}8IOiFhO85*;p#!!X>=xe6YB&Y9E`!}QSOHq@1R@QHSb$PQ9MeM4bNbX01)SNy=TpWw(hf}lZ+$pxS#q{Y@xwE;mG_B4v ze!E>P&Baq^K9|p*NuQvj zUf058<*cZAAkL3rUJq}2MqX^?}!#%-@g{71TFQs9A_u)z_ zhK^2Nalzy5a4;v)H*dl`oR~i^%iHnig^RPXWfH2=%bxvYWPM&u0M+}z( zVeBED<_`iBsV#v-xuSY7*27fU|bfRLe1EomYf8EVd`L!(C0KGjc&T1$;8Ab zrtI9T9A9R(K^{wHH6ELAFXm(El;HUu)Dxe{Ydj*;6Q9j!JnZg?&nGmVA?jiLu~}`O zyruTSCzHlNESLGB9dE2x7h(gnw)H2DyjKfcyRBM~24xeQ>L}^F)Pl}lp04?X>0$su zdn)phYR#C3H?5b~yv`^%W_Wb)6;@sx$#2;NduPa@o@+*sO!cNAIYu;_ZYH@)!3|tzxqC(RV`Re#7 zs0)qhF*S$AedAn6esdDqKP*Ot)B&Y#A6D+%QhV@2lre8E->H1_fQq!yixBWzkKvo= zy;~}DUqr8lVM+5gbLqp#8R@n0yCmhYB(`n;RXxLFv+EagO(^7AZLSFxnMCcc; zylD7m$VW7&=i^RfNb=|52g3As%f;Jardn-%0D8!S{Qf!0AHN(D`F8&Rzk&+>dx8&Z zysh!h;E+_QCqEpfzx_r&B$+8-49N8!)N5d9Opwr6k-)A~3P z4gKSY4_1T6e{Et{M85h@#D~?(K{aL%QoY*x@r#oGtTQJ0BaHcmJwtOC*2lH|m=Eju z#SrmXXK3&~;N`;nB`4BRF9RL*GBCh;iFoxgFwkBvk+yo7w1?a36Z%Kyz+WG48vVP& zM@2bqZ;ghs&ff5e)?6W8W36~iNBf)+jom_0b-av!b&)sajPGvIkE&}c!>xGz53GkN zXU|^0zL&5bIB%^t{8A34$|dWquaEHydGYJ(YmZ;3Eq>R2V-%ed7xR6@!g{-R$X{b8ze1x%(e-`xWUa zzy8#JSd7_(q@>yU~# z!F|`hc;d4vup#R|5(!!-zx0j221|hJ?JIVukNQr&hJH(XoW1*Fn0x)(2}}RnFa4A2 zfA8Kmp>XRp^ee~g=<+t&Lpj5TT#&efbydj^+|sMe$v73 zE?j+BZ<9{HdHulQ?{Q~CPzZXXYzlRG}?9ueQpK70v*UcRE z9rO-zNCM^)&MGbu@VIFq(Q)f7|>8p<^N7uLXhc~8CFS}g( zVnS2w*%=Pm|6rsh?d&-U|~dLTUWrjB)@4{hJ>F zKh854ian?|^pn>9iu!+HT}i!YdDH5=dwx+`-Y>6zpm?6!+WK8fy*8>2cz6i?e=vua zpIo0r;D0Rw>^1L|O+wlJ? 
zoyDbJHuRYHPX7BfiaS)>_0$nPoitE4N8a9K;J>Dy^*J+lAI}^9KQaLF%Q@L4FwfVu z{ayMG{E{Y!8M;pWZXSOEJZ+jYbnYW>_}#gy-{o`je;oQhjNhdrKTE2U-`R%$F{9tl z&ywim|Eq2ImwWKj&QAUmU=V>r7da9{iizUT>Xh)3(>277*m?){=cRcwH_Uya9r-wjUT~x n%f~u!<4f88%Z<1D^y`zBuuWV%87AI-J^^jR=f=l| z&Q{_(^=>$D?A5XVqj|0{TMcJ3ay!0K+N}U`#0`@2!;6xx?8mtN_uT_(@n7lq9jm*!8T$!t1C&kPNXm|z!*Q@YN`K|bi&vb~ z-zht8yX9iN|Htm0M|w0L*X?EFwlipU;^zAqkTV&yAmu-zH-A*hSW&LHKrZ=S&x zWnj0;w+jM)A`jCy1g>_&ee=%+etaL0((izunA^{PsHcWq;XgOwKSDbzQI@$O?>D(G z&0HT2&rxx7eR!@YL?Xr)C!(Z)8msUs^psmr75E7tokCnS?QgTkan+C!X-i2bTuQo8 zz1a%9tvV#9;l|x=@U@n^)VoSl?_I6gcDULP8f$&ecemH-Z}hiUlBjEYp_{a-og|3b zD}Hl(d%NFk58}<0uUp}Ov|s-D`|mvQ7gSU# zc0VvQ==XN@+d6V{MYde6gV_SeL?c8q=Jh$vn8xKJqlE}Y%=&p;_6!f#-4E{HeSc3l z+~0fe{vO6NHT`=$2!fH-^P{-c4a{|G4JVNsHT=YkJI;l&OP1?-c+O)dZf*PKYBlw5 z#jRS~H!VL%qJjCaXX6}Mp^Ib0w6<2OrWGcUX*Js2n%g!l>=uo{tUNS^BO*BE#m5(` zl8Lo!KWYYchV^l0vrNzPsrE*Yv^=w7uhR~z4U`DywmG#BURwF=MG4iA;?zQP$*%50 z7$1;#X&n%qeaXRFLr`~WexntXU*MH$duipZnsi-RG{5q&cvU^>Z9{3;B!uR zwm6wiw`XmobEvE>nhW*BZFyE7iO0>hC1+Pzhb3Zrr3_1!a{(5Vi5VG!T^0lw?Th3F zo(g1F7-#EBc^#3u(b!@Oi?CN<%2<|#1@@H+q7jb2&}t`EEV2&Mh_GUz%4o>rphFrM zLB|vPvf)`px}Imna$Q7l537UaTsvqIVLdq5fVxBk4CYCpw zO9d0p4^y3lFvYwV?kSaweRwh)8+jlcuA)el)p0iVem2>P3W|r=lNA+lJUFO|iZaAW zt_)(X%W3pQ^T?ub84$oW4-Zhy1~_~o93k?P@#9i?VgF(F1CO7cYy3^&*Awj0Q}Wfy zt<@sGhb<%Du1&a#PmD>&Jn?bqlL>U(I_EI8GlhxZ2wW=C6#5e^&!@7$Ph{k^0u9;z z%nY0GzH*Z2!!u)DC!CDC^J873;sLuL!QXfp+)s_D@aZDGjN>lEuP`?wV)N<8KjV~1 zh2uryH<>vd@@j#jiN^05Ec`?Ef0z9O^kOow@!8JvQSG!8^7!Q=aK%&ojU8N&3waoq+xTuA|$mvweD6IOx!S{asq< z>(lJ7vwr7Gg_%B0)3=|du(oT^yCqs>UA+*24?CsyJ%i(2f2*L28J+&jOgcaLE-B}; z&@YPkMUMa5l6L)vGw?^T*uQDet+%Mi{K9E0^qtSsl~Yhe8yfW2XX(~}iZk#}?Irz+ zsxPsuSMz81 l-{tW-s`_NdVp-Ko_1W=6jgP2HK2DY==6nQC)kpP;_fJyYA>IH0 literal 0 HcmV?d00001 diff --git a/exir/program/_program.py b/exir/program/_program.py index a33d715ca3b..af94399a3ed 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1825,6 +1825,7 @@ def __init__( backend_config = backend_config or ExecutorchBackendConfig() + print("start emitting..") # Emit methods self._emitter_output: EmitterOutput = emit_program( self._execution_programs, @@ -1832,6 +1833,7 @@ def __init__( self._config_methods, backend_config.emit_mutable_buffer_names, ) + print("done. start serializing..") # Serialize emitter output, ready to be written to a file. self._data_serializer = FlatTensorSerializer() diff --git a/export_add.py b/export_add.py deleted file mode 100644 index d0d2489b885..00000000000 --- a/export_add.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch -from executorch.backends.aoti.aoti_partitioner import AotiPartitioner -from executorch.exir import to_edge -from torch.export import export - - -# Start with a PyTorch model that adds two input tensors (matrices) -class Add(torch.nn.Module): - def __init__(self): - super(Add, self).__init__() - - def forward(self, x: torch.Tensor, y: torch.Tensor): - # return triton_transpose_acc(x, y) - return (x.cuda() + y.cuda()).cpu() - - -# 1. torch.export: Defines the program with the ATen operator set. -aten_dialect = export( - Add(), (torch.ones(10, device="cpu"), torch.ones(10, device="cpu")) -) -# 2. to_edge: Make optimizations for Edge devices -edge_program = to_edge(aten_dialect) - -edge_program = edge_program.to_backend(AotiPartitioner([])) - -# 3. to_executorch: Convert the graph to an ExecuTorch program -executorch_program = edge_program.to_executorch() - -# 4. 
diff --git a/export_add.py b/export_add.py
deleted file mode 100644
index d0d2489b885..00000000000
--- a/export_add.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import torch
-from executorch.backends.aoti.aoti_partitioner import AotiPartitioner
-from executorch.exir import to_edge
-from torch.export import export
-
-
-# Start with a PyTorch model that adds two input tensors (matrices)
-class Add(torch.nn.Module):
-    def __init__(self):
-        super(Add, self).__init__()
-
-    def forward(self, x: torch.Tensor, y: torch.Tensor):
-        # return triton_transpose_acc(x, y)
-        return (x.cuda() + y.cuda()).cpu()
-
-
-# 1. torch.export: Defines the program with the ATen operator set.
-aten_dialect = export(
-    Add(), (torch.ones(10, device="cpu"), torch.ones(10, device="cpu"))
-)
-# 2. to_edge: Make optimizations for Edge devices
-edge_program = to_edge(aten_dialect)
-
-edge_program = edge_program.to_backend(AotiPartitioner([]))
-
-# 3. to_executorch: Convert the graph to an ExecuTorch program
-executorch_program = edge_program.to_executorch()
-
-# 4. Save the compiled .pte program
-with open("aoti_model.pte", "wb") as file:
-    file.write(executorch_program.buffer)
diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh
index 01c023f0e8f..90571b0751e 100644
--- a/export_and_run_aoti.sh
+++ b/export_and_run_aoti.sh
@@ -1,11 +1,130 @@
-./install_executorch.sh
-python $1
-./install_executorch.sh --clean
-mkdir -p cmake-out
-cd cmake-out
-cmake -DEXECUTORCH_BUILD_AOTI=ON \
-      -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-      ..
-cd ..
-cmake --build cmake-out -j9
-./cmake-out/executor_runner --model_path aoti_model.pte
+#!/bin/bash
+
+# Script to export and run AOTI with different modes
+# Usage:
+#   ./export_and_run_aoti.sh <model_arg> [mode]
+#   ./export_and_run_aoti.sh <model_arg> --mode=<mode>
+#
+# Examples:
+#   ./export_and_run_aoti.sh conv2d                    # Uses default mode (reinstall_all)
+#   ./export_and_run_aoti.sh conv2d inference          # Uses inference mode
+#   ./export_and_run_aoti.sh conv2d --mode=inference   # Alternative syntax
+#
+# Available modes: reinstall_all (default), reinstall_aot, reinstall_runtime, inference
+# model_arg: argument to pass to export_aoti.py
+
+set -e  # Exit on any error
+
+# Parse command line arguments
+MODE="reinstall_all"
+MODEL_ARG="$1"
+
+# Parse arguments for mode
+for arg in "$@"; do
+  case $arg in
+    --mode=*)
+      MODE="${arg#*=}"
+      shift
+      ;;
+    reinstall_all|reinstall_aot|reinstall_runtime|inference)
+      # If it's the second argument and a valid mode, use it as mode
+      if [[ "$arg" == "$2" ]]; then
+        MODE="$arg"
+      fi
+      ;;
+  esac
+done
+
+# Validate mode
+case "$MODE" in
+  reinstall_all|reinstall_aot|reinstall_runtime|inference)
+    # Valid mode, continue
+    ;;
+  *)
+    echo "Error: Unknown mode '$MODE'"
+    echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference"
+    echo ""
+    echo "Usage examples:"
+    echo "  ./export_and_run_aoti.sh conv2d                    # Uses default mode"
+    echo "  ./export_and_run_aoti.sh conv2d inference          # Positional mode"
+    echo "  ./export_and_run_aoti.sh conv2d --mode=inference   # GNU-style mode"
+    exit 1
+    ;;
+esac
+
+echo "Running in mode: $MODE"
+if [[ -n "$MODEL_ARG" ]]; then
+  echo "Model argument: $MODEL_ARG"
+fi
+
+# Function definitions for each step
+install_executorch() {
+  echo "Installing executorch..."
+  ./install_executorch.sh
+}
+
+export_aoti_model() {
+  echo "Exporting AOTI model..."
+  python export_aoti.py $MODEL_ARG
+}
+
+clean_install_executorch() {
+  echo "Clean installing executorch..."
+  ./install_executorch.sh --clean
+}
+
+build_runtime() {
+  echo "Building runtime..."
+  # Clean the build directory to ensure debug flags take effect
+  rm -rf cmake-out
+  mkdir -p cmake-out
+  cd cmake-out
+  cmake -DEXECUTORCH_BUILD_AOTI=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+        -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
+        -DEXECUTORCH_LOG_LEVEL=Debug \
+        -DCMAKE_BUILD_TYPE=Debug \
+        ..
+  cd ..
+  cmake --build cmake-out -j9
+}
+
+run_inference() {
+  echo "Running executor_runner with debug logging enabled..."
+  ./cmake-out/executor_runner --model_path aoti_model.pte
+}
+
+# Execute based on mode
+case "$MODE" in
+  "reinstall_all")
+    echo "Mode: reinstall_all - Full reinstall and run"
+    install_executorch
+    export_aoti_model
+    clean_install_executorch
+    build_runtime
+    run_inference
+    ;;
+  "reinstall_aot")
+    echo "Mode: reinstall_aot - Reinstall AOT components only"
+    install_executorch
+    export_aoti_model
+    run_inference
+    ;;
+  "reinstall_runtime")
+    echo "Mode: reinstall_runtime - Rebuild runtime and run"
+    build_runtime
+    run_inference
+    ;;
+  "inference")
+    echo "Mode: inference - Export model and run inference only"
+    export_aoti_model
+    run_inference
+    ;;
+  *)
+    echo "Error: Unknown mode '$MODE'"
+    echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference"
+    exit 1
+    ;;
+esac
+
+echo "Script completed successfully!"
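Note: the four modes above select which of the install/export/build/run steps are repeated. Driving the script from Python is a one-liner; a sketch assuming it is run from the repo root with the script marked executable:

import subprocess

# "conv2d" and "inference" are a model and a mode documented by the script above.
subprocess.run(
    ["./export_and_run_aoti.sh", "conv2d", "--mode=inference"],
    check=True,  # surface failures from any step (the script itself uses set -e)
)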
diff --git a/export_aoti.py b/export_aoti.py
new file mode 100644
index 00000000000..d798654ffe0
--- /dev/null
+++ b/export_aoti.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+Unified export script for AOTI backend.
+Usage: python export_aoti.py <model_name>
+
+Supported models:
+- mv2: MobileNetV2 model
+- linear: Simple linear layer model
+- conv2d: Single Conv2d layer model
+- add: Simple tensor addition model
+"""
+
+import copy
+import os
+
+import shutil
+
+import sys
+from subprocess import check_call
+from typing import Any, Dict, Tuple
+
+import torch
+from executorch.backends.aoti.aoti_partitioner import AotiPartitioner
+from executorch.exir import to_edge
+from torch import nn
+from torch.export import export
+from torchvision import models
+from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+
+
+# Model classes
+class MV2(torch.nn.Module):
+    def __init__(self):
+        super(MV2, self).__init__()
+        self.mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights)
+
+    def forward(self, x: torch.Tensor):
+        return self.mv2(x)
+
+
+class Linear(torch.nn.Module):
+    def __init__(self):
+        super(Linear, self).__init__()
+        self.linear = nn.Linear(3, 5)
+
+    def forward(self, x: torch.Tensor):
+        return self.linear(x).cpu()
+
+
+class SingleConv2d(nn.Module):
+    def __init__(self):
+        super(SingleConv2d, self).__init__()
+        self.conv = nn.Conv2d(
+            in_channels=3, out_channels=5, kernel_size=3, stride=1, padding=1
+        )
+
+    def forward(self, x: torch.Tensor):
+        return self.conv(x)
+
+
+class Add(torch.nn.Module):
+    def __init__(self):
+        super(Add, self).__init__()
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        return (x + y).cpu()
+
+
+# Model registry mapping model names to their configurations
+MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {
+    "mv2": {
+        "model_class": MV2,
+        "input_shapes": [(1, 3, 224, 224)],
+        "device": "cuda",
+        "description": "MobileNetV2 model",
+    },
+    "linear": {
+        "model_class": Linear,
+        "input_shapes": [(4, 3)],
+        "device": "cuda",
+        "description": "Simple linear layer model",
+    },
+    "conv2d": {
+        "model_class": SingleConv2d,
+        "input_shapes": [(4, 3, 8, 8)],
+        "device": "cuda",
+        "description": "Single Conv2d layer model",
+    },
+    "add": {
+        "model_class": Add,
+        "input_shapes": [(10,), (10,)],
+        "device": "cuda",
+        "description": "Simple tensor addition model",
+    },
+}
+
+
+def get_model_and_inputs(
+    model_name: str,
+) -> Tuple[torch.nn.Module, Tuple[torch.Tensor, ...]]:
+    """Get model and example inputs based on model name."""
+
+    if model_name not in MODEL_REGISTRY:
+        available_models = ", ".join(MODEL_REGISTRY.keys())
+        raise ValueError(
+            f"Unsupported model: {model_name}. Available models: {available_models}"
+        )
+
+    model_config = MODEL_REGISTRY[model_name]
+    model_class = model_config["model_class"]
+    input_shapes = model_config["input_shapes"]
+    device = model_config["device"]
+
+    # Create model instance
+    model = model_class().to(device).eval()
+
+    # Create example inputs (support multiple inputs)
+    example_inputs = tuple(torch.randn(*shape, device=device) for shape in input_shapes)
+
+    return model, example_inputs
+
+
+def export_model(model, example_inputs, output_filename="aoti_model.pte"):
+    """Export model through the AOTI pipeline."""
+
+    print(f"Starting export process...")
+
+    # 1. torch.export: Defines the program with the ATen operator set.
+    print("Step 1: Converting to ATen dialect...")
+    aten_dialect = export(model, example_inputs)
+
+    # 2. to_edge: Make optimizations for Edge devices
+    print("Step 2: Converting to Edge program...")
+    edge_program = to_edge(aten_dialect)
+    print(edge_program.exported_program().graph)
+
+    print("Step 3: Converting to backend...")
+    edge_program = edge_program.to_backend(AotiPartitioner([]))
+    print("To backend done.")
+
+    # 3. to_executorch: Convert the graph to an ExecuTorch program
+    print("Step 4: Converting to ExecuTorch program...")
+    executorch_program = edge_program.to_executorch()
+    print("To executorch done.")
+
+    # 4. Save the compiled .pte program
+    print(f"Step 5: Saving to {output_filename}...")
+    with open(output_filename, "wb") as file:
+        file.write(executorch_program.buffer)
+
+    print(f"Export completed successfully! Output saved to {output_filename}")
+
+
+def main():
+    if len(sys.argv) != 2:
+        available_models = ", ".join(MODEL_REGISTRY.keys())
+        print("Usage: python export_aoti.py <model_name>")
+        print(f"Available models: {available_models}")
+        print("\nModel descriptions:")
+        for name, config in MODEL_REGISTRY.items():
+            print(f"  {name}: {config['description']}")
+        sys.exit(1)
+
+    model_name = sys.argv[1]
+
+    try:
+        model, example_inputs = get_model_and_inputs(model_name)
+        export_model(model, example_inputs)
+    except ValueError as e:
+        print(f"Error: {e}")
+        sys.exit(1)
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
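Note: MODEL_REGISTRY is the only thing that has to grow to support a new model: a module class, one input-shape tuple per forward() argument, and a device. A hypothetical sketch in the same schema ("mlp" and the MLP class are illustrative, not part of the patch):

from typing import Any, Dict

import torch
from torch import nn

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))

    def forward(self, x: torch.Tensor):
        return self.net(x)

EXTRA_MODELS: Dict[str, Dict[str, Any]] = {
    "mlp": {
        "model_class": MLP,
        "input_shapes": [(2, 8)],  # one tuple per forward() argument
        "device": "cuda",
        "description": "Two-layer MLP (hypothetical example)",
    },
}
# MODEL_REGISTRY.update(EXTRA_MODELS) would make "mlp" exportable via
# `python export_aoti.py mlp`.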
diff --git a/export_mv2.py b/export_mv2.py
deleted file mode 100644
index fa84084088f..00000000000
--- a/export_mv2.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-from executorch.backends.aoti.aoti_partitioner import AotiPartitioner
-from executorch.examples.models.mobilenet_v2 import MV2Model
-from executorch.exir import to_edge
-from torch.export import export
-from torchvision import models
-from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
-
-mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights)
-mv2 = mv2.eval()
-
-model_inputs = (torch.randn(1, 3, 224, 224),)
-
-
-# 1. torch.export: Defines the program with the ATen operator set.
-aten_dialect = export(mv2, model_inputs)
-
-# 2. to_edge: Make optimizations for Edge devices
-edge_program = to_edge(aten_dialect)
-
-edge_program = edge_program.to_backend(AotiPartitioner([]))
-
-# 3. to_executorch: Convert the graph to an ExecuTorch program
-executorch_program = edge_program.to_executorch()
-
-# 4. Save the compiled .pte program
-with open("aoti_model.pte", "wb") as file:
-    file.write(executorch_program.buffer)
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index 65a47594c8d..1c90f88df7c 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -1580,6 +1580,8 @@ Error Method::execute() {
       "chain %" ET_PRIsize_t " has no instructions field",
       step_state_.chain_idx);
 
+  ET_LOG(Debug, "Executing chain idx: %" ET_PRIsize_t, step_state_.chain_idx);
+
   // Loop over instructions
   step_state_.instr_idx = 0;
   while (step_state_.instr_idx < chain.s_chain_->instructions()->size()) {

From 17f1a5f28fb1710571732d134cb57f898b36e7f2 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 20 Aug 2025 00:37:05 -0700
Subject: [PATCH 14/50] move input to GPU + using torch aoti kernel

---
 backends/aoti/CMakeLists.txt                  |  13 +-
 backends/aoti/aoti_backend.py                 |   1 -
 backends/aoti/runtime/aoti_backend.cpp        | 180 +-
 .../aoti/runtime/aoti_model_container.cpp     |   2 +
 backends/aoti/runtime/aoti_model_container.h  |   7 +
 backends/aoti/runtime/shims/memory.cpp        |  12 +-
 backends/aoti/runtime/shims/memory.h          |   1 +
 backends/aoti/runtime/targets.bzl             |   2 +
 ...ky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin | Bin 11320 -> 0 bytes
 ...rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin | Bin 10048 -> 0 bytes
 ...x5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin | Bin 10816 -> 0 bytes
 ...ci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin | Bin 10176 -> 0 bytes
 ...c3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp |   6 -
 ...3am2yslkkhyp4e7oaf7ej.kernel_metadata.json |   1 -
 ...vspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin | Bin 11320 -> 0 bytes
 ...6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin | Bin 10936 -> 0 bytes
 ...xauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin | Bin 11320 -> 0 bytes
 ...47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin | Bin 10944 -> 0 bytes
 ...ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin | Bin 11320 -> 0 bytes
 ...2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp |  965 ---
 ...kmcvkgx3hnjvysymcgms.wrapper_metadata.json |    1 -
 ...nhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin | Bin 10936 -> 0 bytes
 ...jd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp | 6144 -----------------
 ...6ndtpaca5r3ct3piucq7.wrapper_metadata.json |    1 -
 ...x6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin | Bin 11320 -> 0 bytes
 ...f6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp |  965 ---
 ...x5fugystcn2wozmmxwaf.wrapper_metadata.json |    1 -
 ...b7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin | Bin 8968 -> 0 bytes
 ...retoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin | Bin 9784 -> 0 bytes
 ...ugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin | Bin 11320 -> 0 bytes
 ...2j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin | Bin 13832 -> 0 bytes
 ...hvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin | Bin 11320 -> 0 bytes
 ...6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin | Bin 10296 -> 0 bytes
 ...kxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin | Bin 13240 -> 0 bytes
 ...lgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp |    6 -
 ...uw6cqsbpvx756nf43k7mq.kernel_metadata.json |    1 -
 ...ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin | Bin 9528 -> 0 bytes
 ...ksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp |    6 -
 ...4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json |    1 -
 ...rsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin | Bin 9528 -> 0 bytes
 ...pudstbhsobm3wlczsly46p5oeax43spr3eab.cubin | Bin 21056 -> 0 bytes
 ...reaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin | Bin 10296 -> 0 bytes
 ...rqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin | Bin 11656 -> 0 bytes
 ...oylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin | Bin 10296 -> 0 bytes
 ...wncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin | Bin 15624 -> 0 bytes
 ...47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin | Bin 10816 -> 0 bytes
 ...zkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin | Bin 11320 -> 0 bytes
 ...jkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin | Bin 9528 -> 0 bytes
 ...zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin | Bin 11400 -> 0 bytes
 ...zc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin | Bin 13832 -> 0 bytes
 ...vkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin | Bin 6280 -> 0 bytes
 export_and_run_aoti.sh                        |   19 +
 export_aoti.py                                |    6 +-
 53 files changed, 193 insertions(+), 8148 deletions(-)
 delete mode 100644 c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin
 delete mode 100644 c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin
 delete mode 100644 c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin
 delete mode 100644 c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin
 delete mode 100644 c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
 delete mode 100644 c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
 delete mode 100644 c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin
 delete mode 100644 c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin
 delete mode 100644 c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin
 delete mode 100644 cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin
 delete mode 100644 caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin
 delete mode 100644 ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
 delete mode 100644 ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json
 delete mode 100644 cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin
 delete mode 100644 ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp
 delete mode 100644 ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json
 delete mode 100644 cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin
 delete mode 100644 cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
 delete mode 100644 cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
 delete mode 100644 cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
 delete mode 100644 cgpouheql4rpwtcaretoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin
 delete mode 100644 cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin
 delete mode 100644 ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin
 delete mode 100644 cklg2ezqvtkbhlekhvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin
 delete mode 100644 ckneyyhrfy6dkwkb6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin
 delete mode 100644 cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin
 delete mode 100644 cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
 delete mode 100644 cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json
 delete mode 100644 cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin
 delete mode 100644 cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
 delete mode 100644 cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
 delete mode 100644 crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin
 delete mode 100644 csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin
 delete mode 100644 ctc4njxfwewhkkjkreaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin
 delete mode 100644 cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin
 delete mode 100644 cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin
 delete mode 100644 cwvumepeeo7fjwjgwncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin
 delete mode 100644 cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin
 delete mode 100644 cxn357cdpjzfyhgfzkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin
 delete mode 100644 cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin
 delete mode 100644 cxzopurug2u2kff3zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin
 delete mode 100644 cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin
 delete mode 100644 czj7vvfy745m4rwqvkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin

diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
index 1c596fef6e6..6922d5e9356 100644
--- a/backends/aoti/CMakeLists.txt
+++ b/backends/aoti/CMakeLists.txt
@@ -21,10 +21,12 @@ if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 endif()
 
-# include(${EXECUTORCH_ROOT}/build/Utils.cmake)
-
 find_package(CUDAToolkit REQUIRED)
 
+# Use ExecuTorch's standard way to find PyTorch libraries for AOTI
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+find_package_torch()
+
 set(_aoti_sources runtime/aoti_backend.cpp runtime/aoti_model_container.cpp
@@ -37,17 +39,22 @@ target_include_directories(
     ${CUDAToolkit_INCLUDE_DIRS}
     $
     $
+    # PyTorch AOTI headers from ExecuTorch's torch detection
+    ${TORCH_INCLUDE_DIRS}
 )
 target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC)
 # Ensure symbols are exported properly
 target_link_options(aoti_backend PUBLIC -Wl,--export-dynamic)
-# Link against CUDA::cudart (the CUDA runtime library)
+
+# Link against CUDA::cudart, PyTorch libraries and standard libraries
 target_link_libraries(
   aoti_backend PUBLIC extension_tensor CUDA::cudart ${CMAKE_DL_LIBS}
+  # Link PyTorch libraries for AOTI CUDA functions
+  ${TORCH_LIBRARIES}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_backend PUBLIC CUDA::cublas CUDA::cufft ...)
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index a0c4a2aa005..5aa547d789c 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -46,7 +46,6 @@ def preprocess(
         "aot_inductor.package_constants_in_so": True,
         "aot_inductor.output_path": output_path,
     }
-
     so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options)  # type: ignore[arg-type]
 
     assert so_path == output_path, f"Expected {output_path} but got {so_path}"
diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp
index 65d28a7a1ff..4c065fbeeb6 100644
--- a/backends/aoti/runtime/aoti_backend.cpp
+++ b/backends/aoti/runtime/aoti_backend.cpp
@@ -30,6 +30,9 @@
 #include "shims/memory.h"
 #include "shims/tensor_attribute.h"
 
+// Include CUDA AOTI shims
+#include
+
 namespace executorch {
 namespace backends {
 namespace aoti {
@@ -111,6 +114,14 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
       return Error::AccessFailed;
     }
 
+    AOTInductorModelContainerGetInputName =
+        reinterpret_cast<AOTInductorModelContainerGetInputNameFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetInputName"));
+    if (AOTInductorModelContainerGetInputName == nullptr) {
+      perror("dlsym AOTInductorModelContainerGetInputName");
+      return Error::AccessFailed;
+    }
+
     AOTInductorModelContainerGetNumOutputs =
         reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
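The hunk above extends the backend's load-time symbol table with one more entry point from the AOTI-compiled shared object. The mechanism is plain POSIX dynamic loading; a self-contained sketch of the same pattern (hypothetical library path ./aoti_model.so; int stands in for AOTIRuntimeError, which is not defined here):

#include <dlfcn.h>

#include <cstddef>
#include <cstdio>

// Function-pointer type mirroring AOTInductorModelContainerGetInputNameFunc.
using GetInputNameFn = int (*)(void* container, size_t idx, const char** name);

int main() {
  // Load the AOTI-compiled shared object.
  void* so_handle = dlopen("./aoti_model.so", RTLD_NOW | RTLD_LOCAL);
  if (so_handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // Resolve the entry point by its exported C name, as the backend does.
  auto get_input_name = reinterpret_cast<GetInputNameFn>(
      dlsym(so_handle, "AOTInductorModelContainerGetInputName"));
  if (get_input_name == nullptr) {
    std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
    dlclose(so_handle);
    return 1;
  }
  // A real loader would store the resolved pointer for later calls.
  dlclose(so_handle);
  return 0;
}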
@@ -152,29 +163,35 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
 
     ET_LOG(Debug, "AOTIBackend Handle generated");
 
-    size_t n_inputs, n_constants;
+    size_t n_inputs;
     AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
 
-    AOTInductorModelContainerGetNumConstants(
-        handle->container_handle, &n_constants);
-    size_t n_user_inputs = n_inputs - n_constants;
-
-    if (n_user_inputs != n_inputs) {
-      ET_LOG(
-          Error,
-          "number of user input does not match number of inputs. n_user_inputs %zd, n_constant %zd, n_inputs %zd. Exit.",
-          n_user_inputs,
-          n_constants,
-          n_inputs);
-      return Error::InvalidArgument;
-    }
-
-    ET_LOG(
-        Debug,
-        "AOTIBackend n_inputs %zd generated, where %zd is constant input, %zd is user input",
-        n_inputs,
-        n_constants,
-        n_user_inputs);
+    // for (int i = 0; i < n_inputs; i++) {
+    //   const char* input_name;
+    //   AOTInductorModelContainerGetInputName(
+    //       handle->container_handle, i, &input_name);
+    //   ET_LOG(Debug, "AOTIBackend %d-th input name %s", i, input_name);
+    // }
+
+    // AOTInductorModelContainerGetNumConstants(
+    //     handle->container_handle, &n_constants);
+    // size_t n_user_inputs = n_inputs - n_constants;
+
+    // if (n_user_inputs != n_inputs) {
+    //   ET_LOG(
+    //       Error,
+    //       "number of user input does not match number of inputs.
+    //       n_user_inputs %zd, n_constant %zd, n_inputs %zd. Exit.",
+    //       n_user_inputs,
+    //       n_constants,
+    //       n_inputs);
+    //   return Error::InvalidArgument;
+    // }
+
+    // ET_LOG(
+    //     Debug,
+    //     "AOTIBackend n_inputs %zd generated, where %zd is constant input,
+    //     %zd is user input", n_inputs, n_constants, n_user_inputs);
 
     size_t n_outputs;
     AOTInductorModelContainerGetNumOutputs(
@@ -199,22 +216,87 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
         n_outputs,
         args.size());
 
-    std::vector<AOTITensorHandle> inputs(n_inputs);
-    std::vector<AOTITensorHandle> outputs(n_outputs);
+    // NOTE: ExecuTorch tensors are always on CPU/host memory
+    // We need to create GPU copies for CUDA kernel execution
+    std::vector<AOTITensorHandle> gpu_inputs(
+        n_inputs); // GPU copies for kernel execution
+    std::vector<AOTITensorHandle> gpu_outputs(
+        n_outputs); // GPU tensors for kernel output
 
     ET_LOG(Debug, "AOTIBackend input/output vectors generated");
 
+    // Process input tensors: ExecuTorch provides CPU tensors, create GPU
+    // copies
     for (int i = 0; i < n_inputs; i++) {
-      ET_LOG(Debug, "Copying input %d from args to inputs vector", i);
+      ET_LOG(Debug, "Processing input %d from args to inputs vector", i);
       ET_LOG(
           Debug, "is %d input a tensor input? %d", i, int(args[i]->isTensor()));
-      inputs[i] = &(args[i]->toTensor());
+
+      // Get tensor dimensions and properties from ExecuTorch CPU tensor
+      auto cpu_tensor = &(args[i]->toTensor());
+      auto sizes = cpu_tensor->sizes();
+      auto scalar_type = cpu_tensor->scalar_type();
+
+      // Create GPU tensor with same shape
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_input_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_input_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for input %d", i);
+        return Error::Internal;
+      }
+
+      gpu_inputs[i] = gpu_input_handle;
+
+      // Copy data from CPU to GPU
+      Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i);
+        return Error::Internal;
+      }
+
+      ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i);
     }
 
-    ET_LOG(Debug, "AOTIBackend input generated");
+    ET_LOG(Debug, "AOTIBackend GPU inputs generated");
 
+    // Process output tensors: create GPU counterparts for ExecuTorch CPU
+    // tensors
     for (int i = 0; i < n_outputs; i++) {
-      outputs[i] = &(args[i + n_inputs]->toTensor());
+      // Get output tensor dimensions from ExecuTorch CPU tensor
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      auto sizes = cpu_output_tensor->sizes();
+      auto scalar_type = cpu_output_tensor->scalar_type();
+
+      // Create GPU tensor with same shape for kernel output
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_output_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_output_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for output %d", i);
+        return Error::Internal;
+      }
+
+      gpu_outputs[i] = gpu_output_handle;
+      ET_LOG(Debug, "Created GPU output tensor %d", i);
     }
 
     ET_LOG(Debug, "AOTIBackend output generated");
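The shim calls in the hunk above (aoti_torch_empty_strided followed by aoti_torch_copy_) amount to a device allocation plus a host-to-device copy. A standalone CUDA-runtime sketch of that staging step, outside the shim layer (the 1x3x224x224 float shape is illustrative only):

#include <cuda_runtime.h>

#include <cstdio>
#include <vector>

int main() {
  // Host-side input, analogous to the ExecuTorch CPU tensor.
  std::vector<float> host_input(1 * 3 * 224 * 224, 0.5f);
  const size_t nbytes = host_input.size() * sizeof(float);

  // Allocate a device buffer of the same size.
  float* gpu_input = nullptr;
  cudaError_t err = cudaMalloc(&gpu_input, nbytes);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Copy the host data to the device before launching any kernels.
  err = cudaMemcpy(gpu_input, host_input.data(), nbytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    cudaFree(gpu_input);
    return 1;
  }

  // ... run the compiled model against gpu_input, then copy outputs back ...
  cudaFree(gpu_input);
  return 0;
}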
@@ -232,13 +314,12 @@
 
     ET_LOG(Debug, "Created CUDA stream: %p", cuda_stream);
 
-    // Run AOTI container with the stream (AOTI will create its own stream guard
-    // internally)
+    // Run AOTI container with GPU tensors
     AOTIRuntimeError error = AOTInductorModelContainerRun(
         handle->container_handle,
-        inputs.data(),
+        gpu_inputs.data(), // Use GPU input tensors
         n_inputs,
-        outputs.data(),
+        gpu_outputs.data(), // Use GPU output tensors
         n_outputs,
         cuda_stream, // Pass the actual CUDA stream!
         nullptr); // proxy_executor_handle can remain nullptr
@@ -253,27 +334,46 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
 
     ET_LOG(Debug, "AOTIBackend running done");
 
-    // Synchronize and destroy the CUDA stream
+    // Synchronize the CUDA stream to ensure kernels complete
     cudaError_t sync_err = cudaStreamSynchronize(cuda_stream);
     if (sync_err != cudaSuccess) {
       ET_LOG(
           Error,
           "Failed to synchronize CUDA stream: %s",
           cudaGetErrorString(sync_err));
-      // Continue anyway to avoid fatal errors
+      return Error::Internal;
     }
 
-    cudaStreamDestroy(cuda_stream);
-    ET_LOG(Debug, "CUDA stream synchronized and destroyed");
+    ET_LOG(Debug, "CUDA stream synchronized");
+
+    // Copy GPU output results back to CPU output tensors
+    for (int i = 0; i < n_outputs; i++) {
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i);
+        return Error::Internal;
+      }
+      ET_LOG(Debug, "Copied GPU output %d back to CPU", i);
+    }
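Create, run, synchronize, destroy is the entire stream contract the code above relies on: the host must not read GPU outputs until cudaStreamSynchronize returns. A minimal standalone illustration of that lifecycle (the enqueued work is elided):

#include <cuda_runtime.h>

#include <cstdio>

int main() {
  cudaStream_t stream = nullptr;
  if (cudaStreamCreate(&stream) != cudaSuccess) {
    std::fprintf(stderr, "cudaStreamCreate failed\n");
    return 1;
  }

  // ... enqueue work on the stream, e.g. AOTInductorModelContainerRun ...

  // Block until all work on the stream has finished; only then is it safe
  // to copy results back to host memory.
  cudaError_t sync_err = cudaStreamSynchronize(stream);
  if (sync_err != cudaSuccess) {
    std::fprintf(stderr, "sync failed: %s\n", cudaGetErrorString(sync_err));
    cudaStreamDestroy(stream);
    return 1;
  }

  cudaStreamDestroy(stream);
  return 0;
}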
 
+    // Clean up GPU tensors that we created (ExecuTorch tensors are always
+    // CPU, so all GPU tensors are our copies)
+    for (int i = 0; i < n_inputs; i++) {
+      // All GPU input tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_inputs[i]);
+    }
 
-    // Still need to copy the output to args, because they are malloc'ed but
-    // not using the data_ptr from outputs.
     for (int i = 0; i < n_outputs; i++) {
-      auto args_out = args[i + n_inputs]->toTensor();
-      aoti_torch_copy_(&args_out, outputs[i], 0);
+      // All GPU output tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_outputs[i]);
     }
 
-    ET_LOG(Debug, "AOTIBackend output copied");
+    // Destroy the CUDA stream
+    cudaStreamDestroy(cuda_stream);
+    ET_LOG(Debug, "CUDA stream destroyed and GPU tensors cleaned up");
+
+    ET_LOG(Debug, "AOTIBackend execution completed successfully");
 
     return Error::Ok;
   }
diff --git a/backends/aoti/runtime/aoti_model_container.cpp b/backends/aoti/runtime/aoti_model_container.cpp
index 0809a677a81..f9d66ed82e4 100644
--- a/backends/aoti/runtime/aoti_model_container.cpp
+++ b/backends/aoti/runtime/aoti_model_container.cpp
@@ -21,6 +21,8 @@ AOTInductorModelContainerCreateWithDeviceFunc
 AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr;
 AOTInductorModelContainerGetNumInputsFunc
     AOTInductorModelContainerGetNumInputs = nullptr;
+AOTInductorModelContainerGetInputNameFunc
+    AOTInductorModelContainerGetInputName = nullptr;
 AOTInductorModelContainerGetNumConstantsFunc
     AOTInductorModelContainerGetNumConstants = nullptr;
 AOTInductorModelContainerGetNumOutputsFunc
diff --git a/backends/aoti/runtime/aoti_model_container.h b/backends/aoti/runtime/aoti_model_container.h
index 2078490022d..39a8a35c14f 100644
--- a/backends/aoti/runtime/aoti_model_container.h
+++ b/backends/aoti/runtime/aoti_model_container.h
@@ -45,6 +45,11 @@ using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     size_t* num_constants);
 
+using AOTInductorModelContainerGetInputNameFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t input_idx,
+    const char** input_name);
+
 using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     size_t* num_constants);
@@ -72,6 +77,8 @@ extern AOTInductorModelContainerCreateWithDeviceFunc
 extern AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete;
 extern AOTInductorModelContainerGetNumInputsFunc
     AOTInductorModelContainerGetNumInputs;
+extern AOTInductorModelContainerGetInputNameFunc
+    AOTInductorModelContainerGetInputName;
 extern AOTInductorModelContainerGetNumConstantsFunc
     AOTInductorModelContainerGetNumConstants;
 extern AOTInductorModelContainerGetNumOutputsFunc
diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp
index cadd021f51f..ab5d35efd9f 100644
--- a/backends/aoti/runtime/shims/memory.cpp
+++ b/backends/aoti/runtime/shims/memory.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include // For posix_memalign
 #include
 #include
 #include
@@ -147,14 +148,19 @@ AOTITorchError aoti_torch_empty_strided(
     std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl;
     cudaError_t err = cudaMalloc(&ptr, nbytes);
     if (err != cudaSuccess) {
-      std::cout << "failed to allocate " << nbytes << std::endl;
+      std::cout << "failed to allocate " << nbytes
+                << " error: " << cudaGetErrorString(err) << std::endl;
       throw std::runtime_error("Failed to call cudaMalloc");
     }
   } else if (device_type == 0) { // cpu
     std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl;
-    ptr = malloc(nbytes);
+    // Ensure 16-byte alignment for CPU memory to match CUDA requirements
+    int result = posix_memalign(&ptr, 16, nbytes);
+    if (result != 0) {
+      throw std::runtime_error("Failed to allocate aligned CPU memory");
+    }
     if (ptr == nullptr) {
-      throw std::runtime_error("Failed to call malloc");
+      throw std::runtime_error("Failed to call posix_memalign");
     }
   } else {
     throw std::runtime_error(
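The malloc-to-posix_memalign change above matters for two reasons: posix_memalign guarantees the requested alignment, and it reports failure through its return code rather than a null pointer plus errno. A standalone sketch of the same allocation (POSIX only; 1024 bytes is an arbitrary size for illustration):

#include <cstdio>
#include <cstdlib> // posix_memalign is POSIX; declared via <stdlib.h>/<cstdlib>

int main() {
  void* ptr = nullptr;
  const size_t nbytes = 1024;

  // Alignment must be a power of two and a multiple of sizeof(void*);
  // 16 bytes matches the alignment the CUDA-side buffers use.
  int rc = posix_memalign(&ptr, 16, nbytes);
  if (rc != 0) {
    std::fprintf(stderr, "posix_memalign failed with code %d\n", rc);
    return 1;
  }

  std::printf("allocated %zu aligned bytes at %p\n", nbytes, ptr);
  free(ptr);
  return 0;
}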
diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h
index bcbb33d0e99..996c729b4be 100644
--- a/backends/aoti/runtime/shims/memory.h
+++ b/backends/aoti/runtime/shims/memory.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl
index 7b02c1075a2..28c9e893721 100644
--- a/backends/aoti/runtime/targets.bzl
+++ b/backends/aoti/runtime/targets.bzl
@@ -23,5 +23,7 @@ def define_common_targets():
         deps = [
             "//executorch/runtime/backend:interface",
             "//executorch/runtime/core:core",
+            "//caffe2/torch/csrc/inductor:aoti_torch",
+            "//caffe2/torch/csrc/inductor:aoti_torch_cuda",
         ],
     )
diff --git a/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin b/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin
deleted file mode 100644
index d34f0ffd0262af56bf3c172beda228da57f93b3c..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded binary payload omitted: raw .cubin GPU binary, not human-readable.]
[base85-encoded binary payloads omitted for the deleted files
 c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin,
 c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin, and
 c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin
 (raw GPU binaries; sizes are listed in the diffstat above).]
diff --git a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
deleted file mode 100644
index 7d7e30069f9..00000000000
--- a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// Triton kernels are embedded as comments in /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7/clbguuj2vb7nlf7qm72hrkynyiorwc3udkaj656f3v5xcdaoib67.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin b/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin
deleted file mode 100644
index b2ba290a27dadc32cf1efdced36c497c5f85ee33..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded binary payload omitted, along with the payloads of the deleted
 c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin,
 c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin,
 cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin, and
 caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin files.]
diff --git a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp b/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
deleted file mode 100644
--- a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
+++ /dev/null
@@ -1,965 +0,0 @@
-// Definition of AOTI runtime interface functions
-
-#include
-#include
-
-#include
-#include
-
-#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)      \
-  try {                                           \
-    __VA_ARGS__                                   \
-  } catch (const std::exception& e) {             \
-    std::cerr << "Error: " << e.what() << '\n';   \
-    return AOTI_RUNTIME_FAILURE;                  \
-  } catch (...) {                                 \
-    std::cerr << "Unknown exception occurred.\n"; \
-    return AOTI_RUNTIME_FAILURE;                  \
-  }                                               \
-  return AOTI_RUNTIME_SUCCESS;
-
-#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name)  \
-  do {                                                            \
-    AOTI_RUNTIME_CHECK(                                           \
-        actual_size == expected_size,                             \
-        "expected " + std::string(name) + " vector size to be " + \
-            std::to_string(expected_size) + ", but got " +        \
-            std::to_string(actual_size));                         \
-  } while (0)
-
-// AOTInductor uses at::addmm_out, which doesn't supports
-// arguments that requires gradient. For this reason, we
-// enforce no_grad context for run APIs.
-//
-// A RAII, thread local (!) guard that enables or disables grad mode upon
-// construction, and sets it back to the original value upon destruction.
-struct AOTINoGradGuard {
-  AOTINoGradGuard() {
-    aoti_torch_grad_mode_set_enabled(false);
-  }
-  AOTINoGradGuard(const AOTINoGradGuard&) = delete;
-  AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete;
-  ~AOTINoGradGuard() {
-    aoti_torch_grad_mode_set_enabled(prev_mode);
-  }
-  AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete;
-  AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete;
-  bool prev_mode{aoti_torch_grad_mode_is_enabled()};
-};
-
-extern "C" {
-
-AOTIRuntimeError AOTInductorModelContainerCreate(
-    AOTInductorModelContainerHandle* container_handle,
-    size_t num_models,
-    bool is_cpu,
-    const char* cubin_dir) {
-  return AOTInductorModelContainerCreateWithDevice(
-      container_handle,
-      num_models,
-      is_cpu ?
"cpu" : "cuda", - cubin_dir); -} - -AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir) { - if (num_models == 0) { - std::cerr << "Error: num_models must be positive, but got 0\n"; - return AOTI_RUNTIME_FAILURE; - } - CONVERT_EXCEPTION_TO_ERROR_CODE({ - std::optional cubin_dir_opt; - if (cubin_dir != nullptr) { - cubin_dir_opt.emplace(cubin_dir); - } - auto* container = new torch::aot_inductor::AOTInductorModelContainer( - num_models, std::string(device_str), cubin_dir_opt); - *container_handle = - reinterpret_cast(container); - }) -} - -AOTIRuntimeError AOTInductorModelContainerDelete( - AOTInductorModelContainerHandle container_handle) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto* container = - reinterpret_cast( - container_handle); - delete container; - }); -} - -AOTIRuntimeError AOTInductorModelContainerRun( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_single_threaded( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumConstants( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *num_constants = container->num_constants(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantName( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** name) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *name = container->constant_name(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** 
original_fqn) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *original_fqn = container->constant_original_fqn(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( - AOTInductorModelContainerHandle container_handle, - size_t idx, - bool* from_folded) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantType( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* type) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* dtype) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *dtype = container->constant_dtype(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( - AOTInductorModelContainerHandle container_handle, - size_t idx, - size_t* data_size) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *data_size = container->constant_data_size(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive) { - auto* container = - reinterpret_cast( - container_handle); - auto constants_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { const auto ret = container->extract_constants_map(use_inactive); - for (const auto& pair: ret) { - constants_map->emplace(pair.first, pair.second); - } - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update, /* user_managed = */ true); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle) { - return AOTInductorModelContainerUpdateConstantBuffer(container_handle, - constant_map_handle, - /*use_inactive*/ true, - /*validate_full_update*/ true); -} - -AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( - AOTInductorModelContainerHandle container_handle) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->free_inactive_constant_buffer(); - }) -} - -AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( - AOTInductorModelContainerHandle container_handle, - bool use_inactive, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_const_fold(use_inactive, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( - AOTInductorModelContainerHandle container_handle) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->swap_constant_buffer(); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumInputs( - AOTInductorModelContainerHandle container_handle, - size_t* ret_num_inputs) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_num_inputs = container->num_inputs(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetInputName( - AOTInductorModelContainerHandle container_handle, - size_t input_idx, - const char** ret_input_names) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_input_names = container->input_name(input_idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( - AOTInductorModelContainerHandle container_handle, - size_t* ret_num_outputs) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_num_outputs = container->num_outputs(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetOutputName( - AOTInductorModelContainerHandle container_handle, - size_t output_idx, - const char** ret_output_names) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_output_names = container->output_name(output_idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetCallSpec( - AOTInductorModelContainerHandle container_handle, - const char** in_spec, - const char** out_spec) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - *in_spec = container->get_in_spec(); - *out_spec = container->get_out_spec(); - }) -} - -AOTIRuntimeError AOTInductorModelCreate( - AOTInductorModelHandle* model_handle, - AOTInductorConstantMapHandle constant_map_handle){ - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto constant_map = std::make_shared(); - auto constant_array = std::make_shared>(); - auto input_map = reinterpret_cast*>(constant_map_handle); - - auto model = new torch::aot_inductor::AOTInductorModel( - constant_map, - constant_array, - "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models - "" - ); - - if (input_map) { - for (auto const& kv : *input_map) { - constant_map->emplace(kv.first, kv.second); - } - } else { - model->load_constants(); - } - - *model_handle = reinterpret_cast(model); - })} - -AOTIRuntimeError AOTInductorModelRun( - AOTInductorModelHandle model_handle, - AtenTensorHandle* input_handles, - AtenTensorHandle* output_handles) { - auto model = - reinterpret_cast(model_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - model->run_impl( - input_handles, - output_handles, - (torch::aot_inductor::DeviceStreamType) nullptr, - nullptr); - }) -} - -AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto model = 
reinterpret_cast( - model_handle); - delete model; - })} - -AOTIRuntimeError AOTInductorModelGetNumOutputs( - AOTInductorModelHandle model_handle, - size_t* ret_num_outputs) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto model = reinterpret_cast(model_handle); - *ret_num_outputs = model->num_outputs(); - }) -} - -AOTIRuntimeError AOTInductorModelUpdateConstantsMap( - AOTInductorModelHandle model_handle, - AOTInductorConstantMapHandle constant_map_handle) { - auto model = - reinterpret_cast(model_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto constant_map = std::make_shared(); - auto input_map = - reinterpret_cast*>( - constant_map_handle); - - for (auto const& kv : *input_map) { - constant_map->emplace(kv.first, kv.second); - } - model->update_constants_map(std::move(constant_map)); - }) -} - -} // extern "C" - - -#define CUDA_DRIVER_CHECK(EXPR) \ -do { \ - CUresult code = EXPR; \ - const char *msg; \ - CUresult code_get_error = cuGetErrorString(code, &msg); \ - if (code_get_error != CUDA_SUCCESS) { \ - throw std::runtime_error( \ - std::string("CUDA driver error: ") + \ - std::string("invalid error code!")); \ - } \ - if (code != CUDA_SUCCESS) { \ - throw std::runtime_error( \ - std::string("CUDA driver error: ") + \ - std::string(msg)); \ - } \ -} while (0); - -static inline CUfunction loadKernel( - std::string filePath, - const std::string &funcName, - uint32_t sharedMemBytes, - const std::optional &cubinDir = std::nullopt) { - if (cubinDir) { - std::filesystem::path p1{*cubinDir}; - std::filesystem::path p2{filePath}; - filePath = (p1 / p2.filename()).string(); - } - - CUmodule mod; - CUfunction func; - CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); - CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); - if (sharedMemBytes > 0) { - CUDA_DRIVER_CHECK(cuFuncSetAttribute( - func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - sharedMemBytes - )) - } - return func; -} - -static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { - CUmodule mod; - CUfunction func; - CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); - CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); - if (sharedMemBytes > 0) { - CUDA_DRIVER_CHECK(cuFuncSetAttribute( - func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - sharedMemBytes - )) - } - return func; -} - -static inline void launchKernel( - CUfunction func, - uint32_t gridX, - uint32_t gridY, - uint32_t gridZ, - uint32_t numWarps, - uint32_t sharedMemBytes, - void* args[], - cudaStream_t stream) { - CUDA_DRIVER_CHECK(cuLaunchKernel( - func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr - )); -} -CACHE_TORCH_DTYPE(float32); -CACHE_TORCH_DEVICE(cuda); -CACHE_TORCH_LAYOUT(strided); -namespace torch::aot_inductor { -namespace { -class AOTInductorModelKernels : public AOTInductorModelKernelsBase { - public: - CUfunction triton_poi_fused_convolution_0{nullptr}; - CUfunction triton_poi_fused_convolution_1{nullptr}; - CUfunction triton_poi_fused_convolution_2{nullptr}; -}; -} // namespace - - - -AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) - : AOTInductorModelBase(1, - 1, - 1, - device_str, - std::move(cubin_dir), - true) { - inputs_info_[0].name = "arg2_1"; - constants_info_[0].name = "conv_weight"; - constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[0].offset = 0; - 
constants_info_[0].data_size = 540; - constants_info_[0].from_folded = false; - constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[0].shape = {5, 3, 3, 3}; - constants_info_[0].stride = {27, 9, 3, 1}; - constants_info_[0].layout = static_cast(cached_torch_layout_strided); - constants_info_[0].original_fqn = "conv.weight"; - update_constants_map(std::move(constants_map)); - update_constants_array(std::move(constants_array)); - in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])"; - out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])"; - outputs_info_[0].name = "output0"; - this->kernels_ = std::make_unique(); -} - -std::unordered_map AOTInductorModel::const_run_impl( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization -) { - - if (!initialization) { - std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " - << "aot_inductor.use_runtime_constant_folding=False\n"; - } - return {}; -} -} // namespace torch::aot_inductor -using namespace torch::aot_inductor; - -template -static inline void call_triton_poi_fused_convolution_0( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_0', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 12 - 
xnumel = 64 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = (yindex % 3) - y1 = yindex // 3 - tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last') - tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (64 - 1)) / (64)); - uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_0 == nullptr) { - kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_); - } - CUdeviceptr var_0 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_1 = reinterpret_cast(out_ptr0.data_ptr()); - int var_2 = ynumel; - int var_3 = xnumel; - CUdeviceptr global_scratch_4 = 0; - void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4}; - launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused_convolution_1( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_1', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 15 - xnumel = 9 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - 
ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = (yindex % 3) - y1 = yindex // 3 - tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last') - tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (16 - 1)) / (16)); - uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_1 == nullptr) { - kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_); - } - CUdeviceptr var_5 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_6 = reinterpret_cast(out_ptr0.data_ptr()); - int var_7 = ynumel; - int var_8 = xnumel; - CUdeviceptr global_scratch_9 = 0; - void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9}; - launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused_convolution_2( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_2', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 20 - xnumel = 64 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = 
xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y0 = (yindex % 5) - y1 = yindex // 5 - y3 = yindex - tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last') - tmp1 = y0 - tmp2 = tl.full([1, 1], 2, tl.int64) - tmp3 = tmp1 < tmp2 - tmp4 = tl.full([1, 1], 1, tl.int64) - tmp5 = tmp1 < tmp4 - tmp6 = -0.16373057663440704 - tmp7 = 0.04603243246674538 - tmp8 = tl.where(tmp5, tmp6, tmp7) - tmp9 = tl.full([1, 1], 3, tl.int64) - tmp10 = tmp1 < tmp9 - tmp11 = tl.full([1, 1], 4, tl.int64) - tmp12 = tmp1 < tmp11 - tmp13 = 0.16525162756443024 - tmp14 = 0.022457100450992584 - tmp15 = tl.where(tmp12, tmp13, tmp14) - tmp16 = -0.08230065554380417 - tmp17 = tl.where(tmp10, tmp16, tmp15) - tmp18 = tl.where(tmp3, tmp8, tmp17) - tmp19 = tmp0 + tmp18 - tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); - uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_2 == nullptr) { - kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_); - } - CUdeviceptr var_10 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_11 = reinterpret_cast(out_ptr0.data_ptr()); - int var_12 = ynumel; - int var_13 = xnumel; - CUdeviceptr global_scratch_14 = 0; - void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14}; - launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_); -} - -namespace torch::aot_inductor { - -void AOTInductorModel::_const_run_impl( - std::vector& output_handles, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor -) {} - -AOTI_NOINLINE static void check_input_0( - AtenTensorHandle* input_handles -) { - ConstantHandle arg2_1 = ConstantHandle(input_handles[0]); - int32_t arg2_1_dtype; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype)); - - int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32(); - if (arg2_1_expected_dtype != arg2_1_dtype) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dtype, " - << "expected: " << arg2_1_expected_dtype << "(at::kFloat), " - << "but got: " << arg2_1_dtype << "\n"; - throw std::runtime_error(ss.str()); - } - auto arg2_1_size = arg2_1.sizes(); - - if (4 != arg2_1_size[0]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 0, " - << "expected: 4, " << "but got: " << arg2_1_size[0] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (3 != arg2_1_size[1]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 1, " - << "expected: 3, " << "but got: " << arg2_1_size[1] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_size[2]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 2, " - << "expected: 8, " << "but got: " << arg2_1_size[2] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_size[3]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 3, " - << "expected: 8, " << "but got: " << arg2_1_size[3] - << "\n"; - throw std::runtime_error(ss.str()); - } - auto arg2_1_stride = arg2_1.strides(); - - if (192 != arg2_1_stride[0]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 
0, " - << "expected: 192, " << "but got: " << arg2_1_stride[0] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (64 != arg2_1_stride[1]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 1, " - << "expected: 64, " << "but got: " << arg2_1_stride[1] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_stride[2]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 2, " - << "expected: 8, " << "but got: " << arg2_1_stride[2] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (1 != arg2_1_stride[3]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 3, " - << "expected: 1, " << "but got: " << arg2_1_stride[3] - << "\n"; - throw std::runtime_error(ss.str()); - } - int32_t arg2_1_device_type; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); - - int32_t arg2_1_expected_device_type = 1; - if (arg2_1_expected_device_type != arg2_1_device_type) { - std::stringstream ss; - ss << "input_handles[0]: unmatched device type, " - << "expected: " << arg2_1_expected_device_type << "1(cuda), " - << "but got: " << arg2_1_device_type << "\n"; - throw std::runtime_error(ss.str()); - } -} - -static bool _check_aoti_runtime_check_inputs_env() { - const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); - const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; - return result; -} - -AOTI_NOINLINE static void __check_inputs_outputs( - AtenTensorHandle* input_handles, - AtenTensorHandle* output_handles) { - if (!_check_aoti_runtime_check_inputs_env()){ - return; - } - check_input_0(input_handles); -} - -void AOTInductorModel::run_impl( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor -) { - __check_inputs_outputs(input_handles, output_handles); - - auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); - auto arg2_1 = std::move(inputs[0]); - [[maybe_unused]] auto& conv_weight = constants_->at(0); - - if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { - AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit."); - AtenTensorHandle arg2_1_aligned; - aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned); - arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned)); - } - inputs.clear(); - [[maybe_unused]] auto& kernels = static_cast(*this->kernels_.get()); - - AOTICudaStreamGuard stream_guard(stream, this->device_idx_); - static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L}; - static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L}; - AtenTensorHandle buf0_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle)); - RAIIAtenTensorHandle buf0(buf0_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); - arg2_1.reset(); - static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L}; - static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L}; - AtenTensorHandle buf1_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle)); - RAIIAtenTensorHandle buf1(buf1_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - AtenTensorHandle buf2_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle)); - RAIIAtenTensorHandle buf2(buf2_handle); - buf0.reset(); - buf1.reset(); - static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L}; - static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L}; - AtenTensorHandle buf3_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle)); - RAIIAtenTensorHandle buf3(buf3_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf2.reset(); - output_handles[0] = buf3.release(); -} // AOTInductorModel::run_impl -} // namespace torch::aot_inductor - - - - -// Compile cmd -// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o -// Link cmd -// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms/clxvzwn2a5v7ypw7eq6fysn2555bpqqp3ckvq4a6v5o6aba2rxov.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json b/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json deleted file mode 100644 index bd5d2c60334..00000000000 --- a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json +++ /dev/null @@ -1 +0,0 @@ -{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file diff --git a/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin b/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin deleted file mode 100644 index 1be0cd3083897a28e082defde99cc1da6a9ef442..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10936 zcmeHNTWlQF89qC+zQ%ZyfYP9ZW)k8O+I046dlS-N2uYC|L`vWVM2Fekv3JROcQP|x z)-frYv}&tf0MVyD74->G9{SJ+T2*Zx03n1BQq@*{OA)H7HjQkPRFz8Ee*d|QXLs#Z zI8H<*j^a7zzhBOO&Y3xzXO12}rp02}0VbNpe$85POkMd-0hITO{?VHrWm7EA`tiJl z#TYaFvLASXUBF27^Roixz2%o{r6x0L4YN|4t}`=mnt>U3m7rd;+^DZKc=u2Ju1c)lVj?Yi)y?Yg^LO(g{%$oJ-Y2OJ%$;(Bh=vW9?0Cm%Ta zfF<~?lx0^dHcA)O5M3$OUJJWQN0IQN6L>r`YPtNst(|a1b*}x7wRSdS6IDz6`Yyf< zmZZ)$_bki=j*#6EY+Q~H18=;zH9`?X3wWJv>A}~5k45mCBu9huCwln5E%_Cn()a@B zS6-}${g~tXRuFRZHpln(;J0Yxr#e+v$fr4ecm*LxKLCE~=16{|sdqU4%{}~WG1QJL|dG(%gJ?@~KK;E+~5L^rY#|v)Hfx{KMD#f45Dpn6guK9nU9< z_R(*C|62bqe~zezkZaN6F?~yH`RZl;jJ7#;RoAqOEVigO0qDAxis>;tE-&lPXhw`b z7B4R^US7O<2?Ss?dMs!cV==uC!{tlMOL0wqixGM*WFS`9If71mH-^Oi7Pd9ML^@Lq zbi^~JZ;R3Ntk$ZJ33;A-4R- z(&@*IFBuC5#`hcaH!->2$Wx!kpTte3fXjiNI(yfD(?!R}W9iBNsV`6ZGSGiO*dx37 zEdJ60pU=S#VyyU;Le()$rxtkg#s@gZ>`wZeIE`(+WMsxC*aCqwD06&<8AD_4 zp~=HoyBt+Oh(lxEpqbTO8X1QYea`xa8$Y7dGkLNR!`Fxfl^3?2RaMXYft}H@|5^&ZC z0+iy!R0=aVm&Uvdr)J;^nXA@ADiw=IJO?JAZmtj#M6GaVGjM^+Jyj_=ilHPu0f#cE z))A>JQPR0Sx#c)CLyDKFZ^vSZ;bIWQxEbEk7*;8_tk4}pqC%&<(jh)k4J 
zZ-A(@bp!~ho&YhUT>-)}$E+hjULF%+fD@4bC%XgWYobJpVzn7eV@a$dLK5om>$F5Gd(|LSb$z@#rOm!4NSNrF$1sKKm`Lt$Iu(YQ z&P3y+t%JuoE%vtzpLeRH>Fi`?^2qo-N5poe=)9Pzt|D~ipY{UNc5x}#G^c|zrefw9 z&WvN{T7yCCU)b9C-loL3%RJv$OSI`pLRM_9Gz0LPYCxWxS=zRyv&Zr&YS31yG|cI_ zYBiMP+K3Gs8tS))&eyY~=b?bMVP%zXwZe8bBX(#-;@Qe@f3uXz=~>(KYM~@gR+h{2 zRN!3i;^62kSHp8bg(u$|(s*ycRh(;99jVae{%3jid89m8AK${EMOqHiipCQ~KDcxi zb*d5xf%z47;)$rDGVD}E&br8ebTAV-7$^rt8o3j>)DiB7vK;b=baL%0Ji@)2t#aq` zlw^5MvX~I=eS|DBD-w^a3eROVW~6duUjIU+yobKF@sTt zoM3>FPCaxINA-wQa$mTJi>j3=%F?SWj-}3%GkR9;H~2ZqO<`%CMJ&v7f>1;5AlJp; zj-6VGUMA0i#q=G10uIUBN4rq)oCe;)*VWGU=;OcitW(>%T(~Ymy>ZRjMG9?bVF+1_ zCkNS0Fif|U@hxIZe#Ey%#*=Z@BHU+2h=kwGQjA^M9m0KVA7hJKLO9Nh4!X{^cfkCD zC?X$YgB^5k1a$ob?jLuwS$t=lP2ox}z8Us$7em)E^rzpNXYt`dHj43P4daP8Y2uCU z5NYs{?EfwM57A#1EV!s*Ja!4P+t$D-Z#^&vlF&G-QE$BF*_F31lKhVp+I1>7I! zx9vMVJHp;0eZySeu&^H+YmRVugv%$ibMwQ)q5QrCu{Uq2>IR} zXxb~U-!TaY@PmMLJ%r)kpf75c8K9>|0gvLIAB|wt$0p<(`o7NYUDV-!T*Aj&k^kvYKL?;;Zf9UL;1T*-8X&q*4is#>WiR=3chvA>GeU$iD=&y1(q?3QV(N76` z;9pZmxwS+|K|lEm1z7a>gTI5#grv84{KiW=)PZ#;%{_i5VpXY~Q^54@&#ji1d-%Jubpph;~zunV^iJ$M5ZzN*3d)g+V(>_b} zH{|u>cMfXj2WUMFc>`@B4;Xah6XkJXid{s$-%n6}QSSsJzXW@%pVv=|<_G*96APk+ z`9OXK_7NHU)IcA|6Pe%^qxJEvdB*Fne+M>H#PfjKFIY=we@(D2&wt2hpKQ0kp|cAK zwmqan-zN~ZtRs)ycaL@K;Yab~EFYeD=;#sah}P zb0aK#cxZ>f?3DyH4e>5UxsAG0UHB)Y{LdotGE9DFn;?W_6-$$P{| zx)GwuReYOyK5CCoh5DdV0;-quq(0iBa(r&r$fueg)kozhK9xshM)fMMs#kLV E0n);sHUIzs diff --git a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp deleted file mode 100644 index cc963cd88f0..00000000000 --- a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp +++ /dev/null @@ -1,6144 +0,0 @@ - -#include -// Definition of AOTI runtime interface functions - -#include -#include - -#include -#include - -#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ - try { \ - __VA_ARGS__ \ - } catch (const std::exception& e) { \ - std::cerr << "Error: " << e.what() << '\n'; \ - return AOTI_RUNTIME_FAILURE; \ - } catch (...) { \ - std::cerr << "Unknown exception occurred.\n"; \ - return AOTI_RUNTIME_FAILURE; \ - } \ - return AOTI_RUNTIME_SUCCESS; - -#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ - do { \ - AOTI_RUNTIME_CHECK( \ - actual_size == expected_size, \ - "expected " + std::string(name) + " vector size to be " + \ - std::to_string(expected_size) + ", but got " + \ - std::to_string(actual_size)); \ - } while (0) - -// AOTInductor uses at::addmm_out, which doesn't supports -// arguments that requires gradient. For this reason, we -// enforce no_grad context for run APIs. -// -// A RAII, thread local (!) guard that enables or disables grad mode upon -// construction, and sets it back to the original value upon destruction. -struct AOTINoGradGuard { - AOTINoGradGuard() { - aoti_torch_grad_mode_set_enabled(false); - } - AOTINoGradGuard(const AOTINoGradGuard&) = delete; - AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; - ~AOTINoGradGuard() { - aoti_torch_grad_mode_set_enabled(prev_mode); - } - AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; - AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; - bool prev_mode{aoti_torch_grad_mode_is_enabled()}; -}; - -extern "C" { - -AOTIRuntimeError AOTInductorModelContainerCreate( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - bool is_cpu, - const char* cubin_dir) { - return AOTInductorModelContainerCreateWithDevice( - container_handle, - num_models, - is_cpu ? 
"cpu" : "cuda", - cubin_dir); -} - -AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir) { - if (num_models == 0) { - std::cerr << "Error: num_models must be positive, but got 0\n"; - return AOTI_RUNTIME_FAILURE; - } - CONVERT_EXCEPTION_TO_ERROR_CODE({ - std::optional cubin_dir_opt; - if (cubin_dir != nullptr) { - cubin_dir_opt.emplace(cubin_dir); - } - auto* container = new torch::aot_inductor::AOTInductorModelContainer( - num_models, std::string(device_str), cubin_dir_opt); - *container_handle = - reinterpret_cast(container); - }) -} - -AOTIRuntimeError AOTInductorModelContainerDelete( - AOTInductorModelContainerHandle container_handle) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto* container = - reinterpret_cast( - container_handle); - delete container; - }); -} - -AOTIRuntimeError AOTInductorModelContainerRun( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_single_threaded( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumConstants( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *num_constants = container->num_constants(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantName( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** name) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *name = container->constant_name(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** 
original_fqn) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *original_fqn = container->constant_original_fqn(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( - AOTInductorModelContainerHandle container_handle, - size_t idx, - bool* from_folded) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantType( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* type) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* dtype) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *dtype = container->constant_dtype(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( - AOTInductorModelContainerHandle container_handle, - size_t idx, - size_t* data_size) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *data_size = container->constant_data_size(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive) { - auto* container = - reinterpret_cast( - container_handle); - auto constants_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { const auto ret = container->extract_constants_map(use_inactive); - for (const auto& pair: ret) { - constants_map->emplace(pair.first, pair.second); - } - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update, /* user_managed = */ true); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle) { - return AOTInductorModelContainerUpdateConstantBuffer(container_handle, - constant_map_handle, - /*use_inactive*/ true, - /*validate_full_update*/ true); -} - -AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( - AOTInductorModelContainerHandle container_handle) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->free_inactive_constant_buffer(); - }) -} - -AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( - AOTInductorModelContainerHandle container_handle, - bool use_inactive, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_const_fold(use_inactive, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( - AOTInductorModelContainerHandle container_handle) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->swap_constant_buffer(); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumInputs( - AOTInductorModelContainerHandle container_handle, - size_t* ret_num_inputs) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_num_inputs = container->num_inputs(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetInputName( - AOTInductorModelContainerHandle container_handle, - size_t input_idx, - const char** ret_input_names) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_input_names = container->input_name(input_idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( - AOTInductorModelContainerHandle container_handle, - size_t* ret_num_outputs) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_num_outputs = container->num_outputs(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetOutputName( - AOTInductorModelContainerHandle container_handle, - size_t output_idx, - const char** ret_output_names) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_output_names = container->output_name(output_idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetCallSpec( - AOTInductorModelContainerHandle container_handle, - const char** in_spec, - const char** out_spec) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - *in_spec = container->get_in_spec(); - *out_spec = container->get_out_spec(); - }) -} - -AOTIRuntimeError AOTInductorModelCreate( - AOTInductorModelHandle* model_handle, - AOTInductorConstantMapHandle constant_map_handle){ - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto constant_map = std::make_shared(); - auto constant_array = std::make_shared>(); - auto input_map = reinterpret_cast*>(constant_map_handle); - - auto model = new torch::aot_inductor::AOTInductorModel( - constant_map, - constant_array, - "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models - "" - ); - - if (input_map) { - for (auto const& kv : *input_map) { - constant_map->emplace(kv.first, kv.second); - } - } else { - model->load_constants(); - } - - *model_handle = reinterpret_cast(model); - })} - -AOTIRuntimeError AOTInductorModelRun( - AOTInductorModelHandle model_handle, - AtenTensorHandle* input_handles, - AtenTensorHandle* output_handles) { - auto model = - reinterpret_cast(model_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - model->run_impl( - input_handles, - output_handles, - (torch::aot_inductor::DeviceStreamType) nullptr, - nullptr); - }) -} - -AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto model = 
reinterpret_cast( - model_handle); - delete model; - })} - -AOTIRuntimeError AOTInductorModelGetNumOutputs( - AOTInductorModelHandle model_handle, - size_t* ret_num_outputs) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto model = reinterpret_cast(model_handle); - *ret_num_outputs = model->num_outputs(); - }) -} - -AOTIRuntimeError AOTInductorModelUpdateConstantsMap( - AOTInductorModelHandle model_handle, - AOTInductorConstantMapHandle constant_map_handle) { - auto model = - reinterpret_cast(model_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto constant_map = std::make_shared(); - auto input_map = - reinterpret_cast*>( - constant_map_handle); - - for (auto const& kv : *input_map) { - constant_map->emplace(kv.first, kv.second); - } - model->update_constants_map(std::move(constant_map)); - }) -} - -} // extern "C" - - -#define CUDA_DRIVER_CHECK(EXPR) \ -do { \ - CUresult code = EXPR; \ - const char *msg; \ - CUresult code_get_error = cuGetErrorString(code, &msg); \ - if (code_get_error != CUDA_SUCCESS) { \ - throw std::runtime_error( \ - std::string("CUDA driver error: ") + \ - std::string("invalid error code!")); \ - } \ - if (code != CUDA_SUCCESS) { \ - throw std::runtime_error( \ - std::string("CUDA driver error: ") + \ - std::string(msg)); \ - } \ -} while (0); - -static inline CUfunction loadKernel( - std::string filePath, - const std::string &funcName, - uint32_t sharedMemBytes, - const std::optional &cubinDir = std::nullopt) { - if (cubinDir) { - std::filesystem::path p1{*cubinDir}; - std::filesystem::path p2{filePath}; - filePath = (p1 / p2.filename()).string(); - } - - CUmodule mod; - CUfunction func; - CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); - CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); - if (sharedMemBytes > 0) { - CUDA_DRIVER_CHECK(cuFuncSetAttribute( - func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - sharedMemBytes - )) - } - return func; -} - -static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { - CUmodule mod; - CUfunction func; - CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); - CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); - if (sharedMemBytes > 0) { - CUDA_DRIVER_CHECK(cuFuncSetAttribute( - func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - sharedMemBytes - )) - } - return func; -} - -static inline void launchKernel( - CUfunction func, - uint32_t gridX, - uint32_t gridY, - uint32_t gridZ, - uint32_t numWarps, - uint32_t sharedMemBytes, - void* args[], - cudaStream_t stream) { - CUDA_DRIVER_CHECK(cuLaunchKernel( - func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr - )); -} -CACHE_TORCH_DTYPE(float32); -CACHE_TORCH_DEVICE(cuda); -CACHE_TORCH_LAYOUT(strided); -namespace torch::aot_inductor { -namespace { -class AOTInductorModelKernels : public AOTInductorModelKernelsBase { - public: - CUfunction triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_10{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_14{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_17{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_21{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_24{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_3{nullptr}; - CUfunction 
triton_poi_fused__native_batch_norm_legit_no_training_6{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_12{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_16{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_19{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_23{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_8{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9{nullptr}; - CUfunction triton_poi_fused_convolution_0{nullptr}; - CUfunction triton_poi_fused_convolution_1{nullptr}; - CUfunction triton_poi_fused_permute_copy_26{nullptr}; -}; -} // namespace - - - -AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) - : AOTInductorModelBase(1, - 1, - 262, - device_str, - std::move(cubin_dir), - true) { - inputs_info_[0].name = "arg262_1"; - constants_info_[0].name = "mv2_features_0_0_weight"; - constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[0].offset = 0; - constants_info_[0].data_size = 3456; - constants_info_[0].from_folded = false; - constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[0].shape = {32, 3, 3, 3}; - constants_info_[0].stride = {27, 9, 3, 1}; - constants_info_[0].layout = static_cast(cached_torch_layout_strided); - constants_info_[0].original_fqn = "mv2.features.0.0.weight"; - constants_info_[1].name = "mv2_features_0_1_weight"; - constants_info_[1].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[1].offset = 0; - constants_info_[1].data_size = 128; - constants_info_[1].from_folded = false; - constants_info_[1].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[1].shape = {32}; - constants_info_[1].stride = {1}; - constants_info_[1].layout = static_cast(cached_torch_layout_strided); - constants_info_[1].original_fqn = "mv2.features.0.1.weight"; - constants_info_[2].name = "mv2_features_0_1_bias"; - constants_info_[2].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[2].offset = 0; - constants_info_[2].data_size = 128; - constants_info_[2].from_folded = false; - constants_info_[2].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[2].shape = {32}; - constants_info_[2].stride = {1}; - constants_info_[2].layout = static_cast(cached_torch_layout_strided); - constants_info_[2].original_fqn = "mv2.features.0.1.bias"; - 
constants_info_[3].name = "mv2_features_1_conv_0_0_weight"; - constants_info_[3].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[3].offset = 0; - constants_info_[3].data_size = 1152; - constants_info_[3].from_folded = false; - constants_info_[3].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[3].shape = {32, 1, 3, 3}; - constants_info_[3].stride = {9, 9, 3, 1}; - constants_info_[3].layout = static_cast(cached_torch_layout_strided); - constants_info_[3].original_fqn = "mv2.features.1.conv.0.0.weight"; - constants_info_[4].name = "mv2_features_1_conv_0_1_weight"; - constants_info_[4].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[4].offset = 0; - constants_info_[4].data_size = 128; - constants_info_[4].from_folded = false; - constants_info_[4].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[4].shape = {32}; - constants_info_[4].stride = {1}; - constants_info_[4].layout = static_cast(cached_torch_layout_strided); - constants_info_[4].original_fqn = "mv2.features.1.conv.0.1.weight"; - constants_info_[5].name = "mv2_features_1_conv_0_1_bias"; - constants_info_[5].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[5].offset = 0; - constants_info_[5].data_size = 128; - constants_info_[5].from_folded = false; - constants_info_[5].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[5].shape = {32}; - constants_info_[5].stride = {1}; - constants_info_[5].layout = static_cast(cached_torch_layout_strided); - constants_info_[5].original_fqn = "mv2.features.1.conv.0.1.bias"; - constants_info_[6].name = "mv2_features_1_conv_1_weight"; - constants_info_[6].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[6].offset = 0; - constants_info_[6].data_size = 2048; - constants_info_[6].from_folded = false; - constants_info_[6].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[6].shape = {16, 32, 1, 1}; - constants_info_[6].stride = {32, 1, 1, 1}; - constants_info_[6].layout = static_cast(cached_torch_layout_strided); - constants_info_[6].original_fqn = "mv2.features.1.conv.1.weight"; - constants_info_[7].name = "mv2_features_1_conv_2_weight"; - constants_info_[7].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[7].offset = 0; - constants_info_[7].data_size = 64; - constants_info_[7].from_folded = false; - constants_info_[7].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[7].shape = {16}; - constants_info_[7].stride = {1}; - constants_info_[7].layout = static_cast(cached_torch_layout_strided); - constants_info_[7].original_fqn = "mv2.features.1.conv.2.weight"; - constants_info_[8].name = "mv2_features_1_conv_2_bias"; - constants_info_[8].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[8].offset = 0; - constants_info_[8].data_size = 64; - constants_info_[8].from_folded = false; - constants_info_[8].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[8].shape = {16}; - constants_info_[8].stride = {1}; - constants_info_[8].layout = static_cast(cached_torch_layout_strided); - constants_info_[8].original_fqn = "mv2.features.1.conv.2.bias"; - constants_info_[9].name = "mv2_features_2_conv_0_0_weight"; - constants_info_[9].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[9].offset = 0; - constants_info_[9].data_size = 6144; - constants_info_[9].from_folded = false; - constants_info_[9].type = 
static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[9].shape = {96, 16, 1, 1}; - constants_info_[9].stride = {16, 1, 1, 1}; - constants_info_[9].layout = static_cast(cached_torch_layout_strided); - constants_info_[9].original_fqn = "mv2.features.2.conv.0.0.weight"; - constants_info_[10].name = "mv2_features_2_conv_0_1_weight"; - constants_info_[10].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[10].offset = 0; - constants_info_[10].data_size = 384; - constants_info_[10].from_folded = false; - constants_info_[10].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[10].shape = {96}; - constants_info_[10].stride = {1}; - constants_info_[10].layout = static_cast(cached_torch_layout_strided); - constants_info_[10].original_fqn = "mv2.features.2.conv.0.1.weight"; - constants_info_[11].name = "mv2_features_2_conv_0_1_bias"; - constants_info_[11].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[11].offset = 0; - constants_info_[11].data_size = 384; - constants_info_[11].from_folded = false; - constants_info_[11].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[11].shape = {96}; - constants_info_[11].stride = {1}; - constants_info_[11].layout = static_cast(cached_torch_layout_strided); - constants_info_[11].original_fqn = "mv2.features.2.conv.0.1.bias"; - constants_info_[12].name = "mv2_features_2_conv_1_0_weight"; - constants_info_[12].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[12].offset = 0; - constants_info_[12].data_size = 3456; - constants_info_[12].from_folded = false; - constants_info_[12].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[12].shape = {96, 1, 3, 3}; - constants_info_[12].stride = {9, 9, 3, 1}; - constants_info_[12].layout = static_cast(cached_torch_layout_strided); - constants_info_[12].original_fqn = "mv2.features.2.conv.1.0.weight"; - constants_info_[13].name = "mv2_features_2_conv_1_1_weight"; - constants_info_[13].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[13].offset = 0; - constants_info_[13].data_size = 384; - constants_info_[13].from_folded = false; - constants_info_[13].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[13].shape = {96}; - constants_info_[13].stride = {1}; - constants_info_[13].layout = static_cast(cached_torch_layout_strided); - constants_info_[13].original_fqn = "mv2.features.2.conv.1.1.weight"; - constants_info_[14].name = "mv2_features_2_conv_1_1_bias"; - constants_info_[14].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[14].offset = 0; - constants_info_[14].data_size = 384; - constants_info_[14].from_folded = false; - constants_info_[14].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[14].shape = {96}; - constants_info_[14].stride = {1}; - constants_info_[14].layout = static_cast(cached_torch_layout_strided); - constants_info_[14].original_fqn = "mv2.features.2.conv.1.1.bias"; - constants_info_[15].name = "mv2_features_2_conv_2_weight"; - constants_info_[15].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[15].offset = 0; - constants_info_[15].data_size = 9216; - constants_info_[15].from_folded = false; - constants_info_[15].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[15].shape = {24, 96, 1, 1}; - constants_info_[15].stride = {96, 1, 1, 1}; - constants_info_[15].layout = 
static_cast(cached_torch_layout_strided); - constants_info_[15].original_fqn = "mv2.features.2.conv.2.weight"; - constants_info_[16].name = "mv2_features_2_conv_3_weight"; - constants_info_[16].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[16].offset = 0; - constants_info_[16].data_size = 96; - constants_info_[16].from_folded = false; - constants_info_[16].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[16].shape = {24}; - constants_info_[16].stride = {1}; - constants_info_[16].layout = static_cast(cached_torch_layout_strided); - constants_info_[16].original_fqn = "mv2.features.2.conv.3.weight"; - constants_info_[17].name = "mv2_features_2_conv_3_bias"; - constants_info_[17].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[17].offset = 0; - constants_info_[17].data_size = 96; - constants_info_[17].from_folded = false; - constants_info_[17].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[17].shape = {24}; - constants_info_[17].stride = {1}; - constants_info_[17].layout = static_cast(cached_torch_layout_strided); - constants_info_[17].original_fqn = "mv2.features.2.conv.3.bias"; - constants_info_[18].name = "mv2_features_3_conv_0_0_weight"; - constants_info_[18].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[18].offset = 0; - constants_info_[18].data_size = 13824; - constants_info_[18].from_folded = false; - constants_info_[18].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[18].shape = {144, 24, 1, 1}; - constants_info_[18].stride = {24, 1, 1, 1}; - constants_info_[18].layout = static_cast(cached_torch_layout_strided); - constants_info_[18].original_fqn = "mv2.features.3.conv.0.0.weight"; - constants_info_[19].name = "mv2_features_3_conv_0_1_weight"; - constants_info_[19].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[19].offset = 0; - constants_info_[19].data_size = 576; - constants_info_[19].from_folded = false; - constants_info_[19].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[19].shape = {144}; - constants_info_[19].stride = {1}; - constants_info_[19].layout = static_cast(cached_torch_layout_strided); - constants_info_[19].original_fqn = "mv2.features.3.conv.0.1.weight"; - constants_info_[20].name = "mv2_features_3_conv_0_1_bias"; - constants_info_[20].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[20].offset = 0; - constants_info_[20].data_size = 576; - constants_info_[20].from_folded = false; - constants_info_[20].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[20].shape = {144}; - constants_info_[20].stride = {1}; - constants_info_[20].layout = static_cast(cached_torch_layout_strided); - constants_info_[20].original_fqn = "mv2.features.3.conv.0.1.bias"; - constants_info_[21].name = "mv2_features_3_conv_1_0_weight"; - constants_info_[21].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[21].offset = 0; - constants_info_[21].data_size = 5184; - constants_info_[21].from_folded = false; - constants_info_[21].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[21].shape = {144, 1, 3, 3}; - constants_info_[21].stride = {9, 9, 3, 1}; - constants_info_[21].layout = static_cast(cached_torch_layout_strided); - constants_info_[21].original_fqn = "mv2.features.3.conv.1.0.weight"; - constants_info_[22].name = "mv2_features_3_conv_1_1_weight"; - constants_info_[22].dtype = 
-    constants_info_[22].offset = 0;
-    constants_info_[22].data_size = 576;
-    constants_info_[22].from_folded = false;
-    constants_info_[22].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[22].shape = {144};
-    constants_info_[22].stride = {1};
-    constants_info_[22].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[22].original_fqn = "mv2.features.3.conv.1.1.weight";
-    constants_info_[23].name = "mv2_features_3_conv_1_1_bias";
-    constants_info_[23].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[23].offset = 0;
-    constants_info_[23].data_size = 576;
-    constants_info_[23].from_folded = false;
-    constants_info_[23].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[23].shape = {144};
-    constants_info_[23].stride = {1};
-    constants_info_[23].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[23].original_fqn = "mv2.features.3.conv.1.1.bias";
-    constants_info_[24].name = "mv2_features_3_conv_2_weight";
-    constants_info_[24].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[24].offset = 0;
-    constants_info_[24].data_size = 13824;
-    constants_info_[24].from_folded = false;
-    constants_info_[24].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[24].shape = {24, 144, 1, 1};
-    constants_info_[24].stride = {144, 1, 1, 1};
-    constants_info_[24].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[24].original_fqn = "mv2.features.3.conv.2.weight";
-    constants_info_[25].name = "mv2_features_3_conv_3_weight";
-    constants_info_[25].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[25].offset = 0;
-    constants_info_[25].data_size = 96;
-    constants_info_[25].from_folded = false;
-    constants_info_[25].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[25].shape = {24};
-    constants_info_[25].stride = {1};
-    constants_info_[25].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[25].original_fqn = "mv2.features.3.conv.3.weight";
-    constants_info_[26].name = "mv2_features_3_conv_3_bias";
-    constants_info_[26].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[26].offset = 0;
-    constants_info_[26].data_size = 96;
-    constants_info_[26].from_folded = false;
-    constants_info_[26].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[26].shape = {24};
-    constants_info_[26].stride = {1};
-    constants_info_[26].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[26].original_fqn = "mv2.features.3.conv.3.bias";
-    constants_info_[27].name = "mv2_features_4_conv_0_0_weight";
-    constants_info_[27].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[27].offset = 0;
-    constants_info_[27].data_size = 13824;
-    constants_info_[27].from_folded = false;
-    constants_info_[27].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[27].shape = {144, 24, 1, 1};
-    constants_info_[27].stride = {24, 1, 1, 1};
-    constants_info_[27].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[27].original_fqn = "mv2.features.4.conv.0.0.weight";
-    constants_info_[28].name = "mv2_features_4_conv_0_1_weight";
-    constants_info_[28].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[28].offset = 0;
-    constants_info_[28].data_size = 576;
-    constants_info_[28].from_folded = false;
-    constants_info_[28].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[28].shape = {144};
-    constants_info_[28].stride = {1};
-    constants_info_[28].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[28].original_fqn = "mv2.features.4.conv.0.1.weight";
-    constants_info_[29].name = "mv2_features_4_conv_0_1_bias";
-    constants_info_[29].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[29].offset = 0;
-    constants_info_[29].data_size = 576;
-    constants_info_[29].from_folded = false;
-    constants_info_[29].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[29].shape = {144};
-    constants_info_[29].stride = {1};
-    constants_info_[29].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[29].original_fqn = "mv2.features.4.conv.0.1.bias";
-    constants_info_[30].name = "mv2_features_4_conv_1_0_weight";
-    constants_info_[30].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[30].offset = 0;
-    constants_info_[30].data_size = 5184;
-    constants_info_[30].from_folded = false;
-    constants_info_[30].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[30].shape = {144, 1, 3, 3};
-    constants_info_[30].stride = {9, 9, 3, 1};
-    constants_info_[30].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[30].original_fqn = "mv2.features.4.conv.1.0.weight";
-    constants_info_[31].name = "mv2_features_4_conv_1_1_weight";
-    constants_info_[31].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[31].offset = 0;
-    constants_info_[31].data_size = 576;
-    constants_info_[31].from_folded = false;
-    constants_info_[31].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[31].shape = {144};
-    constants_info_[31].stride = {1};
-    constants_info_[31].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[31].original_fqn = "mv2.features.4.conv.1.1.weight";
-    constants_info_[32].name = "mv2_features_4_conv_1_1_bias";
-    constants_info_[32].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[32].offset = 0;
-    constants_info_[32].data_size = 576;
-    constants_info_[32].from_folded = false;
-    constants_info_[32].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[32].shape = {144};
-    constants_info_[32].stride = {1};
-    constants_info_[32].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[32].original_fqn = "mv2.features.4.conv.1.1.bias";
-    constants_info_[33].name = "mv2_features_4_conv_2_weight";
-    constants_info_[33].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[33].offset = 0;
-    constants_info_[33].data_size = 18432;
-    constants_info_[33].from_folded = false;
-    constants_info_[33].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[33].shape = {32, 144, 1, 1};
-    constants_info_[33].stride = {144, 1, 1, 1};
-    constants_info_[33].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[33].original_fqn = "mv2.features.4.conv.2.weight";
-    constants_info_[34].name = "mv2_features_4_conv_3_weight";
-    constants_info_[34].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[34].offset = 0;
-    constants_info_[34].data_size = 128;
-    constants_info_[34].from_folded = false;
-    constants_info_[34].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[34].shape = {32};
-    constants_info_[34].stride = {1};
-    constants_info_[34].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[34].original_fqn = "mv2.features.4.conv.3.weight";
-    constants_info_[35].name = "mv2_features_4_conv_3_bias";
-    constants_info_[35].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[35].offset = 0;
-    constants_info_[35].data_size = 128;
-    constants_info_[35].from_folded = false;
-    constants_info_[35].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[35].shape = {32};
-    constants_info_[35].stride = {1};
-    constants_info_[35].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[35].original_fqn = "mv2.features.4.conv.3.bias";
-    constants_info_[36].name = "mv2_features_5_conv_0_0_weight";
-    constants_info_[36].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[36].offset = 0;
-    constants_info_[36].data_size = 24576;
-    constants_info_[36].from_folded = false;
-    constants_info_[36].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[36].shape = {192, 32, 1, 1};
-    constants_info_[36].stride = {32, 1, 1, 1};
-    constants_info_[36].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[36].original_fqn = "mv2.features.5.conv.0.0.weight";
-    constants_info_[37].name = "mv2_features_5_conv_0_1_weight";
-    constants_info_[37].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[37].offset = 0;
-    constants_info_[37].data_size = 768;
-    constants_info_[37].from_folded = false;
-    constants_info_[37].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[37].shape = {192};
-    constants_info_[37].stride = {1};
-    constants_info_[37].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[37].original_fqn = "mv2.features.5.conv.0.1.weight";
-    constants_info_[38].name = "mv2_features_5_conv_0_1_bias";
-    constants_info_[38].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[38].offset = 0;
-    constants_info_[38].data_size = 768;
-    constants_info_[38].from_folded = false;
-    constants_info_[38].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[38].shape = {192};
-    constants_info_[38].stride = {1};
-    constants_info_[38].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[38].original_fqn = "mv2.features.5.conv.0.1.bias";
-    constants_info_[39].name = "mv2_features_5_conv_1_0_weight";
-    constants_info_[39].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[39].offset = 0;
-    constants_info_[39].data_size = 6912;
-    constants_info_[39].from_folded = false;
-    constants_info_[39].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[39].shape = {192, 1, 3, 3};
-    constants_info_[39].stride = {9, 9, 3, 1};
-    constants_info_[39].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[39].original_fqn = "mv2.features.5.conv.1.0.weight";
-    constants_info_[40].name = "mv2_features_5_conv_1_1_weight";
-    constants_info_[40].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[40].offset = 0;
-    constants_info_[40].data_size = 768;
-    constants_info_[40].from_folded = false;
-    constants_info_[40].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[40].shape = {192};
-    constants_info_[40].stride = {1};
-    constants_info_[40].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[40].original_fqn = "mv2.features.5.conv.1.1.weight";
-    constants_info_[41].name = "mv2_features_5_conv_1_1_bias";
-    constants_info_[41].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[41].offset = 0;
-    constants_info_[41].data_size = 768;
-    constants_info_[41].from_folded = false;
-    constants_info_[41].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[41].shape = {192};
-    constants_info_[41].stride = {1};
-    constants_info_[41].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[41].original_fqn = "mv2.features.5.conv.1.1.bias";
-    constants_info_[42].name = "mv2_features_5_conv_2_weight";
-    constants_info_[42].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[42].offset = 0;
-    constants_info_[42].data_size = 24576;
-    constants_info_[42].from_folded = false;
-    constants_info_[42].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[42].shape = {32, 192, 1, 1};
-    constants_info_[42].stride = {192, 1, 1, 1};
-    constants_info_[42].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[42].original_fqn = "mv2.features.5.conv.2.weight";
-    constants_info_[43].name = "mv2_features_5_conv_3_weight";
-    constants_info_[43].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[43].offset = 0;
-    constants_info_[43].data_size = 128;
-    constants_info_[43].from_folded = false;
-    constants_info_[43].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[43].shape = {32};
-    constants_info_[43].stride = {1};
-    constants_info_[43].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[43].original_fqn = "mv2.features.5.conv.3.weight";
-    constants_info_[44].name = "mv2_features_5_conv_3_bias";
-    constants_info_[44].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[44].offset = 0;
-    constants_info_[44].data_size = 128;
-    constants_info_[44].from_folded = false;
-    constants_info_[44].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[44].shape = {32};
-    constants_info_[44].stride = {1};
-    constants_info_[44].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[44].original_fqn = "mv2.features.5.conv.3.bias";
-    constants_info_[45].name = "mv2_features_6_conv_0_0_weight";
-    constants_info_[45].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[45].offset = 0;
-    constants_info_[45].data_size = 24576;
-    constants_info_[45].from_folded = false;
-    constants_info_[45].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[45].shape = {192, 32, 1, 1};
-    constants_info_[45].stride = {32, 1, 1, 1};
-    constants_info_[45].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[45].original_fqn = "mv2.features.6.conv.0.0.weight";
-    constants_info_[46].name = "mv2_features_6_conv_0_1_weight";
-    constants_info_[46].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[46].offset = 0;
-    constants_info_[46].data_size = 768;
-    constants_info_[46].from_folded = false;
-    constants_info_[46].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[46].shape = {192};
-    constants_info_[46].stride = {1};
-    constants_info_[46].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[46].original_fqn = "mv2.features.6.conv.0.1.weight";
-    constants_info_[47].name = "mv2_features_6_conv_0_1_bias";
-    constants_info_[47].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[47].offset = 0;
-    constants_info_[47].data_size = 768;
-    constants_info_[47].from_folded = false;
-    constants_info_[47].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[47].shape = {192};
-    constants_info_[47].stride = {1};
-    constants_info_[47].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[47].original_fqn = "mv2.features.6.conv.0.1.bias";
-    constants_info_[48].name = "mv2_features_6_conv_1_0_weight";
-    constants_info_[48].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[48].offset = 0;
-    constants_info_[48].data_size = 6912;
-    constants_info_[48].from_folded = false;
-    constants_info_[48].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[48].shape = {192, 1, 3, 3};
-    constants_info_[48].stride = {9, 9, 3, 1};
-    constants_info_[48].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[48].original_fqn = "mv2.features.6.conv.1.0.weight";
-    constants_info_[49].name = "mv2_features_6_conv_1_1_weight";
-    constants_info_[49].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[49].offset = 0;
-    constants_info_[49].data_size = 768;
-    constants_info_[49].from_folded = false;
-    constants_info_[49].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[49].shape = {192};
-    constants_info_[49].stride = {1};
-    constants_info_[49].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[49].original_fqn = "mv2.features.6.conv.1.1.weight";
-    constants_info_[50].name = "mv2_features_6_conv_1_1_bias";
-    constants_info_[50].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[50].offset = 0;
-    constants_info_[50].data_size = 768;
-    constants_info_[50].from_folded = false;
-    constants_info_[50].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[50].shape = {192};
-    constants_info_[50].stride = {1};
-    constants_info_[50].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[50].original_fqn = "mv2.features.6.conv.1.1.bias";
-    constants_info_[51].name = "mv2_features_6_conv_2_weight";
-    constants_info_[51].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[51].offset = 0;
-    constants_info_[51].data_size = 24576;
-    constants_info_[51].from_folded = false;
-    constants_info_[51].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[51].shape = {32, 192, 1, 1};
-    constants_info_[51].stride = {192, 1, 1, 1};
-    constants_info_[51].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[51].original_fqn = "mv2.features.6.conv.2.weight";
-    constants_info_[52].name = "mv2_features_6_conv_3_weight";
-    constants_info_[52].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[52].offset = 0;
-    constants_info_[52].data_size = 128;
-    constants_info_[52].from_folded = false;
-    constants_info_[52].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[52].shape = {32};
-    constants_info_[52].stride = {1};
-    constants_info_[52].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[52].original_fqn = "mv2.features.6.conv.3.weight";
-    constants_info_[53].name = "mv2_features_6_conv_3_bias";
-    constants_info_[53].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[53].offset = 0;
-    constants_info_[53].data_size = 128;
-    constants_info_[53].from_folded = false;
-    constants_info_[53].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[53].shape = {32};
-    constants_info_[53].stride = {1};
-    constants_info_[53].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[53].original_fqn = "mv2.features.6.conv.3.bias";
-    constants_info_[54].name = "mv2_features_7_conv_0_0_weight";
-    constants_info_[54].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[54].offset = 0;
-    constants_info_[54].data_size = 24576;
-    constants_info_[54].from_folded = false;
-    constants_info_[54].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[54].shape = {192, 32, 1, 1};
-    constants_info_[54].stride = {32, 1, 1, 1};
-    constants_info_[54].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[54].original_fqn = "mv2.features.7.conv.0.0.weight";
-    constants_info_[55].name = "mv2_features_7_conv_0_1_weight";
-    constants_info_[55].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[55].offset = 0;
-    constants_info_[55].data_size = 768;
-    constants_info_[55].from_folded = false;
-    constants_info_[55].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[55].shape = {192};
-    constants_info_[55].stride = {1};
-    constants_info_[55].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[55].original_fqn = "mv2.features.7.conv.0.1.weight";
-    constants_info_[56].name = "mv2_features_7_conv_0_1_bias";
-    constants_info_[56].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[56].offset = 0;
-    constants_info_[56].data_size = 768;
-    constants_info_[56].from_folded = false;
-    constants_info_[56].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[56].shape = {192};
-    constants_info_[56].stride = {1};
-    constants_info_[56].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[56].original_fqn = "mv2.features.7.conv.0.1.bias";
-    constants_info_[57].name = "mv2_features_7_conv_1_0_weight";
-    constants_info_[57].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[57].offset = 0;
-    constants_info_[57].data_size = 6912;
-    constants_info_[57].from_folded = false;
-    constants_info_[57].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[57].shape = {192, 1, 3, 3};
-    constants_info_[57].stride = {9, 9, 3, 1};
-    constants_info_[57].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[57].original_fqn = "mv2.features.7.conv.1.0.weight";
-    constants_info_[58].name = "mv2_features_7_conv_1_1_weight";
-    constants_info_[58].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[58].offset = 0;
-    constants_info_[58].data_size = 768;
-    constants_info_[58].from_folded = false;
-    constants_info_[58].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[58].shape = {192};
-    constants_info_[58].stride = {1};
-    constants_info_[58].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[58].original_fqn = "mv2.features.7.conv.1.1.weight";
-    constants_info_[59].name = "mv2_features_7_conv_1_1_bias";
-    constants_info_[59].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[59].offset = 0;
-    constants_info_[59].data_size = 768;
-    constants_info_[59].from_folded = false;
-    constants_info_[59].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[59].shape = {192};
-    constants_info_[59].stride = {1};
-    constants_info_[59].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[59].original_fqn = "mv2.features.7.conv.1.1.bias";
-    constants_info_[60].name = "mv2_features_7_conv_2_weight";
-    constants_info_[60].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[60].offset = 0;
-    constants_info_[60].data_size = 49152;
-    constants_info_[60].from_folded = false;
-    constants_info_[60].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[60].shape = {64, 192, 1, 1};
-    constants_info_[60].stride = {192, 1, 1, 1};
-    constants_info_[60].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[60].original_fqn = "mv2.features.7.conv.2.weight";
-    constants_info_[61].name = "mv2_features_7_conv_3_weight";
-    constants_info_[61].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[61].offset = 0;
-    constants_info_[61].data_size = 256;
-    constants_info_[61].from_folded = false;
-    constants_info_[61].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[61].shape = {64};
-    constants_info_[61].stride = {1};
-    constants_info_[61].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[61].original_fqn = "mv2.features.7.conv.3.weight";
-    constants_info_[62].name = "mv2_features_7_conv_3_bias";
-    constants_info_[62].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[62].offset = 0;
-    constants_info_[62].data_size = 256;
-    constants_info_[62].from_folded = false;
-    constants_info_[62].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[62].shape = {64};
-    constants_info_[62].stride = {1};
-    constants_info_[62].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[62].original_fqn = "mv2.features.7.conv.3.bias";
-    constants_info_[63].name = "mv2_features_8_conv_0_0_weight";
-    constants_info_[63].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[63].offset = 0;
-    constants_info_[63].data_size = 98304;
-    constants_info_[63].from_folded = false;
-    constants_info_[63].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[63].shape = {384, 64, 1, 1};
-    constants_info_[63].stride = {64, 1, 1, 1};
-    constants_info_[63].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[63].original_fqn = "mv2.features.8.conv.0.0.weight";
-    constants_info_[64].name = "mv2_features_8_conv_0_1_weight";
-    constants_info_[64].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[64].offset = 0;
-    constants_info_[64].data_size = 1536;
-    constants_info_[64].from_folded = false;
-    constants_info_[64].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[64].shape = {384};
-    constants_info_[64].stride = {1};
-    constants_info_[64].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[64].original_fqn = "mv2.features.8.conv.0.1.weight";
-    constants_info_[65].name = "mv2_features_8_conv_0_1_bias";
-    constants_info_[65].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[65].offset = 0;
-    constants_info_[65].data_size = 1536;
-    constants_info_[65].from_folded = false;
-    constants_info_[65].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[65].shape = {384};
-    constants_info_[65].stride = {1};
-    constants_info_[65].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[65].original_fqn = "mv2.features.8.conv.0.1.bias";
-    constants_info_[66].name = "mv2_features_8_conv_1_0_weight";
-    constants_info_[66].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[66].offset = 0;
-    constants_info_[66].data_size = 13824;
-    constants_info_[66].from_folded = false;
-    constants_info_[66].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[66].shape = {384, 1, 3, 3};
-    constants_info_[66].stride = {9, 9, 3, 1};
-    constants_info_[66].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[66].original_fqn = "mv2.features.8.conv.1.0.weight";
-    constants_info_[67].name = "mv2_features_8_conv_1_1_weight";
-    constants_info_[67].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[67].offset = 0;
-    constants_info_[67].data_size = 1536;
-    constants_info_[67].from_folded = false;
-    constants_info_[67].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[67].shape = {384};
-    constants_info_[67].stride = {1};
-    constants_info_[67].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[67].original_fqn = "mv2.features.8.conv.1.1.weight";
-    constants_info_[68].name = "mv2_features_8_conv_1_1_bias";
-    constants_info_[68].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[68].offset = 0;
-    constants_info_[68].data_size = 1536;
-    constants_info_[68].from_folded = false;
-    constants_info_[68].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[68].shape = {384};
-    constants_info_[68].stride = {1};
-    constants_info_[68].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[68].original_fqn = "mv2.features.8.conv.1.1.bias";
-    constants_info_[69].name = "mv2_features_8_conv_2_weight";
-    constants_info_[69].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[69].offset = 0;
-    constants_info_[69].data_size = 98304;
-    constants_info_[69].from_folded = false;
-    constants_info_[69].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[69].shape = {64, 384, 1, 1};
-    constants_info_[69].stride = {384, 1, 1, 1};
-    constants_info_[69].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[69].original_fqn = "mv2.features.8.conv.2.weight";
-    constants_info_[70].name = "mv2_features_8_conv_3_weight";
-    constants_info_[70].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[70].offset = 0;
-    constants_info_[70].data_size = 256;
-    constants_info_[70].from_folded = false;
-    constants_info_[70].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[70].shape = {64};
-    constants_info_[70].stride = {1};
-    constants_info_[70].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[70].original_fqn = "mv2.features.8.conv.3.weight";
-    constants_info_[71].name = "mv2_features_8_conv_3_bias";
-    constants_info_[71].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[71].offset = 0;
-    constants_info_[71].data_size = 256;
-    constants_info_[71].from_folded = false;
-    constants_info_[71].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[71].shape = {64};
-    constants_info_[71].stride = {1};
-    constants_info_[71].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[71].original_fqn = "mv2.features.8.conv.3.bias";
-    constants_info_[72].name = "mv2_features_9_conv_0_0_weight";
-    constants_info_[72].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[72].offset = 0;
-    constants_info_[72].data_size = 98304;
-    constants_info_[72].from_folded = false;
-    constants_info_[72].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[72].shape = {384, 64, 1, 1};
-    constants_info_[72].stride = {64, 1, 1, 1};
-    constants_info_[72].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[72].original_fqn = "mv2.features.9.conv.0.0.weight";
-    constants_info_[73].name = "mv2_features_9_conv_0_1_weight";
-    constants_info_[73].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[73].offset = 0;
-    constants_info_[73].data_size = 1536;
-    constants_info_[73].from_folded = false;
-    constants_info_[73].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[73].shape = {384};
-    constants_info_[73].stride = {1};
-    constants_info_[73].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[73].original_fqn = "mv2.features.9.conv.0.1.weight";
-    constants_info_[74].name = "mv2_features_9_conv_0_1_bias";
-    constants_info_[74].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[74].offset = 0;
-    constants_info_[74].data_size = 1536;
-    constants_info_[74].from_folded = false;
-    constants_info_[74].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[74].shape = {384};
-    constants_info_[74].stride = {1};
-    constants_info_[74].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[74].original_fqn = "mv2.features.9.conv.0.1.bias";
-    constants_info_[75].name = "mv2_features_9_conv_1_0_weight";
-    constants_info_[75].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[75].offset = 0;
-    constants_info_[75].data_size = 13824;
-    constants_info_[75].from_folded = false;
-    constants_info_[75].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[75].shape = {384, 1, 3, 3};
-    constants_info_[75].stride = {9, 9, 3, 1};
-    constants_info_[75].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[75].original_fqn = "mv2.features.9.conv.1.0.weight";
-    constants_info_[76].name = "mv2_features_9_conv_1_1_weight";
-    constants_info_[76].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[76].offset = 0;
-    constants_info_[76].data_size = 1536;
-    constants_info_[76].from_folded = false;
-    constants_info_[76].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[76].shape = {384};
-    constants_info_[76].stride = {1};
-    constants_info_[76].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[76].original_fqn = "mv2.features.9.conv.1.1.weight";
-    constants_info_[77].name = "mv2_features_9_conv_1_1_bias";
-    constants_info_[77].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[77].offset = 0;
-    constants_info_[77].data_size = 1536;
-    constants_info_[77].from_folded = false;
-    constants_info_[77].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[77].shape = {384};
-    constants_info_[77].stride = {1};
-    constants_info_[77].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[77].original_fqn = "mv2.features.9.conv.1.1.bias";
-    constants_info_[78].name = "mv2_features_9_conv_2_weight";
-    constants_info_[78].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[78].offset = 0;
-    constants_info_[78].data_size = 98304;
-    constants_info_[78].from_folded = false;
-    constants_info_[78].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[78].shape = {64, 384, 1, 1};
-    constants_info_[78].stride = {384, 1, 1, 1};
-    constants_info_[78].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[78].original_fqn = "mv2.features.9.conv.2.weight";
-    constants_info_[79].name = "mv2_features_9_conv_3_weight";
"mv2_features_9_conv_3_weight"; - constants_info_[79].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[79].offset = 0; - constants_info_[79].data_size = 256; - constants_info_[79].from_folded = false; - constants_info_[79].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[79].shape = {64}; - constants_info_[79].stride = {1}; - constants_info_[79].layout = static_cast(cached_torch_layout_strided); - constants_info_[79].original_fqn = "mv2.features.9.conv.3.weight"; - constants_info_[80].name = "mv2_features_9_conv_3_bias"; - constants_info_[80].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[80].offset = 0; - constants_info_[80].data_size = 256; - constants_info_[80].from_folded = false; - constants_info_[80].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[80].shape = {64}; - constants_info_[80].stride = {1}; - constants_info_[80].layout = static_cast(cached_torch_layout_strided); - constants_info_[80].original_fqn = "mv2.features.9.conv.3.bias"; - constants_info_[81].name = "mv2_features_10_conv_0_0_weight"; - constants_info_[81].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[81].offset = 0; - constants_info_[81].data_size = 98304; - constants_info_[81].from_folded = false; - constants_info_[81].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[81].shape = {384, 64, 1, 1}; - constants_info_[81].stride = {64, 1, 1, 1}; - constants_info_[81].layout = static_cast(cached_torch_layout_strided); - constants_info_[81].original_fqn = "mv2.features.10.conv.0.0.weight"; - constants_info_[82].name = "mv2_features_10_conv_0_1_weight"; - constants_info_[82].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[82].offset = 0; - constants_info_[82].data_size = 1536; - constants_info_[82].from_folded = false; - constants_info_[82].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[82].shape = {384}; - constants_info_[82].stride = {1}; - constants_info_[82].layout = static_cast(cached_torch_layout_strided); - constants_info_[82].original_fqn = "mv2.features.10.conv.0.1.weight"; - constants_info_[83].name = "mv2_features_10_conv_0_1_bias"; - constants_info_[83].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[83].offset = 0; - constants_info_[83].data_size = 1536; - constants_info_[83].from_folded = false; - constants_info_[83].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[83].shape = {384}; - constants_info_[83].stride = {1}; - constants_info_[83].layout = static_cast(cached_torch_layout_strided); - constants_info_[83].original_fqn = "mv2.features.10.conv.0.1.bias"; - constants_info_[84].name = "mv2_features_10_conv_1_0_weight"; - constants_info_[84].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[84].offset = 0; - constants_info_[84].data_size = 13824; - constants_info_[84].from_folded = false; - constants_info_[84].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[84].shape = {384, 1, 3, 3}; - constants_info_[84].stride = {9, 9, 3, 1}; - constants_info_[84].layout = static_cast(cached_torch_layout_strided); - constants_info_[84].original_fqn = "mv2.features.10.conv.1.0.weight"; - constants_info_[85].name = "mv2_features_10_conv_1_1_weight"; - constants_info_[85].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[85].offset = 0; - constants_info_[85].data_size = 1536; - 
-    constants_info_[85].from_folded = false;
-    constants_info_[85].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[85].shape = {384};
-    constants_info_[85].stride = {1};
-    constants_info_[85].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[85].original_fqn = "mv2.features.10.conv.1.1.weight";
-    constants_info_[86].name = "mv2_features_10_conv_1_1_bias";
-    constants_info_[86].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[86].offset = 0;
-    constants_info_[86].data_size = 1536;
-    constants_info_[86].from_folded = false;
-    constants_info_[86].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[86].shape = {384};
-    constants_info_[86].stride = {1};
-    constants_info_[86].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[86].original_fqn = "mv2.features.10.conv.1.1.bias";
-    constants_info_[87].name = "mv2_features_10_conv_2_weight";
-    constants_info_[87].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[87].offset = 0;
-    constants_info_[87].data_size = 98304;
-    constants_info_[87].from_folded = false;
-    constants_info_[87].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[87].shape = {64, 384, 1, 1};
-    constants_info_[87].stride = {384, 1, 1, 1};
-    constants_info_[87].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[87].original_fqn = "mv2.features.10.conv.2.weight";
-    constants_info_[88].name = "mv2_features_10_conv_3_weight";
-    constants_info_[88].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[88].offset = 0;
-    constants_info_[88].data_size = 256;
-    constants_info_[88].from_folded = false;
-    constants_info_[88].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[88].shape = {64};
-    constants_info_[88].stride = {1};
-    constants_info_[88].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[88].original_fqn = "mv2.features.10.conv.3.weight";
-    constants_info_[89].name = "mv2_features_10_conv_3_bias";
-    constants_info_[89].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[89].offset = 0;
-    constants_info_[89].data_size = 256;
-    constants_info_[89].from_folded = false;
-    constants_info_[89].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[89].shape = {64};
-    constants_info_[89].stride = {1};
-    constants_info_[89].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[89].original_fqn = "mv2.features.10.conv.3.bias";
-    constants_info_[90].name = "mv2_features_11_conv_0_0_weight";
-    constants_info_[90].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[90].offset = 0;
-    constants_info_[90].data_size = 98304;
-    constants_info_[90].from_folded = false;
-    constants_info_[90].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[90].shape = {384, 64, 1, 1};
-    constants_info_[90].stride = {64, 1, 1, 1};
-    constants_info_[90].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[90].original_fqn = "mv2.features.11.conv.0.0.weight";
-    constants_info_[91].name = "mv2_features_11_conv_0_1_weight";
-    constants_info_[91].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[91].offset = 0;
-    constants_info_[91].data_size = 1536;
-    constants_info_[91].from_folded = false;
-    constants_info_[91].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[91].shape = {384};
-    constants_info_[91].stride = {1};
-    constants_info_[91].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[91].original_fqn = "mv2.features.11.conv.0.1.weight";
-    constants_info_[92].name = "mv2_features_11_conv_0_1_bias";
-    constants_info_[92].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[92].offset = 0;
-    constants_info_[92].data_size = 1536;
-    constants_info_[92].from_folded = false;
-    constants_info_[92].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[92].shape = {384};
-    constants_info_[92].stride = {1};
-    constants_info_[92].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[92].original_fqn = "mv2.features.11.conv.0.1.bias";
-    constants_info_[93].name = "mv2_features_11_conv_1_0_weight";
-    constants_info_[93].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[93].offset = 0;
-    constants_info_[93].data_size = 13824;
-    constants_info_[93].from_folded = false;
-    constants_info_[93].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[93].shape = {384, 1, 3, 3};
-    constants_info_[93].stride = {9, 9, 3, 1};
-    constants_info_[93].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[93].original_fqn = "mv2.features.11.conv.1.0.weight";
-    constants_info_[94].name = "mv2_features_11_conv_1_1_weight";
-    constants_info_[94].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[94].offset = 0;
-    constants_info_[94].data_size = 1536;
-    constants_info_[94].from_folded = false;
-    constants_info_[94].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[94].shape = {384};
-    constants_info_[94].stride = {1};
-    constants_info_[94].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[94].original_fqn = "mv2.features.11.conv.1.1.weight";
-    constants_info_[95].name = "mv2_features_11_conv_1_1_bias";
-    constants_info_[95].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[95].offset = 0;
-    constants_info_[95].data_size = 1536;
-    constants_info_[95].from_folded = false;
-    constants_info_[95].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[95].shape = {384};
-    constants_info_[95].stride = {1};
-    constants_info_[95].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[95].original_fqn = "mv2.features.11.conv.1.1.bias";
-    constants_info_[96].name = "mv2_features_11_conv_2_weight";
-    constants_info_[96].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[96].offset = 0;
-    constants_info_[96].data_size = 147456;
-    constants_info_[96].from_folded = false;
-    constants_info_[96].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[96].shape = {96, 384, 1, 1};
-    constants_info_[96].stride = {384, 1, 1, 1};
-    constants_info_[96].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[96].original_fqn = "mv2.features.11.conv.2.weight";
-    constants_info_[97].name = "mv2_features_11_conv_3_weight";
-    constants_info_[97].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[97].offset = 0;
-    constants_info_[97].data_size = 384;
-    constants_info_[97].from_folded = false;
-    constants_info_[97].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[97].shape = {96};
-    constants_info_[97].stride = {1};
-    constants_info_[97].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[97].original_fqn = "mv2.features.11.conv.3.weight";
-    constants_info_[98].name = "mv2_features_11_conv_3_bias";
-    constants_info_[98].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[98].offset = 0;
-    constants_info_[98].data_size = 384;
-    constants_info_[98].from_folded = false;
-    constants_info_[98].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[98].shape = {96};
-    constants_info_[98].stride = {1};
-    constants_info_[98].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[98].original_fqn = "mv2.features.11.conv.3.bias";
-    constants_info_[99].name = "mv2_features_12_conv_0_0_weight";
-    constants_info_[99].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[99].offset = 0;
-    constants_info_[99].data_size = 221184;
-    constants_info_[99].from_folded = false;
-    constants_info_[99].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[99].shape = {576, 96, 1, 1};
-    constants_info_[99].stride = {96, 1, 1, 1};
-    constants_info_[99].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[99].original_fqn = "mv2.features.12.conv.0.0.weight";
-    constants_info_[100].name = "mv2_features_12_conv_0_1_weight";
-    constants_info_[100].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[100].offset = 0;
-    constants_info_[100].data_size = 2304;
-    constants_info_[100].from_folded = false;
-    constants_info_[100].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[100].shape = {576};
-    constants_info_[100].stride = {1};
-    constants_info_[100].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[100].original_fqn = "mv2.features.12.conv.0.1.weight";
-    constants_info_[101].name = "mv2_features_12_conv_0_1_bias";
-    constants_info_[101].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[101].offset = 0;
-    constants_info_[101].data_size = 2304;
-    constants_info_[101].from_folded = false;
-    constants_info_[101].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[101].shape = {576};
-    constants_info_[101].stride = {1};
-    constants_info_[101].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[101].original_fqn = "mv2.features.12.conv.0.1.bias";
-    constants_info_[102].name = "mv2_features_12_conv_1_0_weight";
-    constants_info_[102].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[102].offset = 0;
-    constants_info_[102].data_size = 20736;
-    constants_info_[102].from_folded = false;
-    constants_info_[102].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[102].shape = {576, 1, 3, 3};
-    constants_info_[102].stride = {9, 9, 3, 1};
-    constants_info_[102].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[102].original_fqn = "mv2.features.12.conv.1.0.weight";
-    constants_info_[103].name = "mv2_features_12_conv_1_1_weight";
-    constants_info_[103].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[103].offset = 0;
-    constants_info_[103].data_size = 2304;
-    constants_info_[103].from_folded = false;
-    constants_info_[103].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[103].shape = {576};
-    constants_info_[103].stride = {1};
-    constants_info_[103].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[103].original_fqn = "mv2.features.12.conv.1.1.weight";
-    constants_info_[104].name = "mv2_features_12_conv_1_1_bias";
-    constants_info_[104].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[104].offset = 0;
-    constants_info_[104].data_size = 2304;
-    constants_info_[104].from_folded = false;
-    constants_info_[104].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[104].shape = {576};
-    constants_info_[104].stride = {1};
-    constants_info_[104].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[104].original_fqn = "mv2.features.12.conv.1.1.bias";
-    constants_info_[105].name = "mv2_features_12_conv_2_weight";
-    constants_info_[105].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[105].offset = 0;
-    constants_info_[105].data_size = 221184;
-    constants_info_[105].from_folded = false;
-    constants_info_[105].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[105].shape = {96, 576, 1, 1};
-    constants_info_[105].stride = {576, 1, 1, 1};
-    constants_info_[105].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[105].original_fqn = "mv2.features.12.conv.2.weight";
-    constants_info_[106].name = "mv2_features_12_conv_3_weight";
-    constants_info_[106].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[106].offset = 0;
-    constants_info_[106].data_size = 384;
-    constants_info_[106].from_folded = false;
-    constants_info_[106].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[106].shape = {96};
-    constants_info_[106].stride = {1};
-    constants_info_[106].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[106].original_fqn = "mv2.features.12.conv.3.weight";
-    constants_info_[107].name = "mv2_features_12_conv_3_bias";
-    constants_info_[107].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[107].offset = 0;
-    constants_info_[107].data_size = 384;
-    constants_info_[107].from_folded = false;
-    constants_info_[107].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[107].shape = {96};
-    constants_info_[107].stride = {1};
-    constants_info_[107].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[107].original_fqn = "mv2.features.12.conv.3.bias";
-    constants_info_[108].name = "mv2_features_13_conv_0_0_weight";
-    constants_info_[108].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[108].offset = 0;
-    constants_info_[108].data_size = 221184;
-    constants_info_[108].from_folded = false;
-    constants_info_[108].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[108].shape = {576, 96, 1, 1};
-    constants_info_[108].stride = {96, 1, 1, 1};
-    constants_info_[108].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[108].original_fqn = "mv2.features.13.conv.0.0.weight";
-    constants_info_[109].name = "mv2_features_13_conv_0_1_weight";
-    constants_info_[109].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[109].offset = 0;
-    constants_info_[109].data_size = 2304;
-    constants_info_[109].from_folded = false;
-    constants_info_[109].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[109].shape = {576};
-    constants_info_[109].stride = {1};
-    constants_info_[109].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[109].original_fqn = "mv2.features.13.conv.0.1.weight";
-    constants_info_[110].name = "mv2_features_13_conv_0_1_bias";
-    constants_info_[110].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[110].offset = 0;
-    constants_info_[110].data_size = 2304;
-    constants_info_[110].from_folded = false;
-    constants_info_[110].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[110].shape = {576};
-    constants_info_[110].stride = {1};
-    constants_info_[110].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[110].original_fqn = "mv2.features.13.conv.0.1.bias";
-    constants_info_[111].name = "mv2_features_13_conv_1_0_weight";
-    constants_info_[111].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[111].offset = 0;
-    constants_info_[111].data_size = 20736;
-    constants_info_[111].from_folded = false;
-    constants_info_[111].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[111].shape = {576, 1, 3, 3};
-    constants_info_[111].stride = {9, 9, 3, 1};
-    constants_info_[111].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[111].original_fqn = "mv2.features.13.conv.1.0.weight";
-    constants_info_[112].name = "mv2_features_13_conv_1_1_weight";
-    constants_info_[112].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[112].offset = 0;
-    constants_info_[112].data_size = 2304;
-    constants_info_[112].from_folded = false;
-    constants_info_[112].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[112].shape = {576};
-    constants_info_[112].stride = {1};
-    constants_info_[112].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[112].original_fqn = "mv2.features.13.conv.1.1.weight";
-    constants_info_[113].name = "mv2_features_13_conv_1_1_bias";
-    constants_info_[113].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[113].offset = 0;
-    constants_info_[113].data_size = 2304;
-    constants_info_[113].from_folded = false;
-    constants_info_[113].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[113].shape = {576};
-    constants_info_[113].stride = {1};
-    constants_info_[113].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[113].original_fqn = "mv2.features.13.conv.1.1.bias";
-    constants_info_[114].name = "mv2_features_13_conv_2_weight";
-    constants_info_[114].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[114].offset = 0;
-    constants_info_[114].data_size = 221184;
-    constants_info_[114].from_folded = false;
-    constants_info_[114].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[114].shape = {96, 576, 1, 1};
-    constants_info_[114].stride = {576, 1, 1, 1};
-    constants_info_[114].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[114].original_fqn = "mv2.features.13.conv.2.weight";
-    constants_info_[115].name = "mv2_features_13_conv_3_weight";
-    constants_info_[115].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[115].offset = 0;
-    constants_info_[115].data_size = 384;
-    constants_info_[115].from_folded = false;
-    constants_info_[115].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[115].shape = {96};
-    constants_info_[115].stride = {1};
-    constants_info_[115].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[115].original_fqn = "mv2.features.13.conv.3.weight";
-    constants_info_[116].name = "mv2_features_13_conv_3_bias";
-    constants_info_[116].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[116].offset = 0;
-    constants_info_[116].data_size = 384;
-    constants_info_[116].from_folded = false;
-    constants_info_[116].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[116].shape = {96};
-    constants_info_[116].stride = {1};
-    constants_info_[116].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[116].original_fqn = "mv2.features.13.conv.3.bias";
-    constants_info_[117].name = "mv2_features_14_conv_0_0_weight";
-    constants_info_[117].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[117].offset = 0;
-    constants_info_[117].data_size = 221184;
-    constants_info_[117].from_folded = false;
-    constants_info_[117].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[117].shape = {576, 96, 1, 1};
-    constants_info_[117].stride = {96, 1, 1, 1};
-    constants_info_[117].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[117].original_fqn = "mv2.features.14.conv.0.0.weight";
-    constants_info_[118].name = "mv2_features_14_conv_0_1_weight";
-    constants_info_[118].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[118].offset = 0;
-    constants_info_[118].data_size = 2304;
-    constants_info_[118].from_folded = false;
-    constants_info_[118].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[118].shape = {576};
-    constants_info_[118].stride = {1};
-    constants_info_[118].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[118].original_fqn = "mv2.features.14.conv.0.1.weight";
-    constants_info_[119].name = "mv2_features_14_conv_0_1_bias";
-    constants_info_[119].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[119].offset = 0;
-    constants_info_[119].data_size = 2304;
-    constants_info_[119].from_folded = false;
-    constants_info_[119].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[119].shape = {576};
-    constants_info_[119].stride = {1};
-    constants_info_[119].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[119].original_fqn = "mv2.features.14.conv.0.1.bias";
-    constants_info_[120].name = "mv2_features_14_conv_1_0_weight";
-    constants_info_[120].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[120].offset = 0;
-    constants_info_[120].data_size = 20736;
-    constants_info_[120].from_folded = false;
-    constants_info_[120].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[120].shape = {576, 1, 3, 3};
-    constants_info_[120].stride = {9, 9, 3, 1};
-    constants_info_[120].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[120].original_fqn = "mv2.features.14.conv.1.0.weight";
-    constants_info_[121].name = "mv2_features_14_conv_1_1_weight";
-    constants_info_[121].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[121].offset = 0;
-    constants_info_[121].data_size = 2304;
-    constants_info_[121].from_folded = false;
-    constants_info_[121].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[121].shape = {576};
-    constants_info_[121].stride = {1};
-    constants_info_[121].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[121].original_fqn = "mv2.features.14.conv.1.1.weight";
-    constants_info_[122].name = "mv2_features_14_conv_1_1_bias";
-    constants_info_[122].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[122].offset = 0;
-    constants_info_[122].data_size = 2304;
-    constants_info_[122].from_folded = false;
-    constants_info_[122].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[122].shape = {576};
-    constants_info_[122].stride = {1};
-    constants_info_[122].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[122].original_fqn = "mv2.features.14.conv.1.1.bias";
-    constants_info_[123].name = "mv2_features_14_conv_2_weight";
-    constants_info_[123].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[123].offset = 0;
-    constants_info_[123].data_size = 368640;
-    constants_info_[123].from_folded = false;
-    constants_info_[123].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[123].shape = {160, 576, 1, 1};
-    constants_info_[123].stride = {576, 1, 1, 1};
-    constants_info_[123].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[123].original_fqn = "mv2.features.14.conv.2.weight";
-    constants_info_[124].name = "mv2_features_14_conv_3_weight";
-    constants_info_[124].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[124].offset = 0;
-    constants_info_[124].data_size = 640;
-    constants_info_[124].from_folded = false;
-    constants_info_[124].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[124].shape = {160};
-    constants_info_[124].stride = {1};
-    constants_info_[124].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[124].original_fqn = "mv2.features.14.conv.3.weight";
-    constants_info_[125].name = "mv2_features_14_conv_3_bias";
-    constants_info_[125].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[125].offset = 0;
-    constants_info_[125].data_size = 640;
-    constants_info_[125].from_folded = false;
-    constants_info_[125].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[125].shape = {160};
-    constants_info_[125].stride = {1};
-    constants_info_[125].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[125].original_fqn = "mv2.features.14.conv.3.bias";
-    constants_info_[126].name = "mv2_features_15_conv_0_0_weight";
-    constants_info_[126].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[126].offset = 0;
-    constants_info_[126].data_size = 614400;
-    constants_info_[126].from_folded = false;
-    constants_info_[126].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[126].shape = {960, 160, 1, 1};
-    constants_info_[126].stride = {160, 1, 1, 1};
-    constants_info_[126].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[126].original_fqn = "mv2.features.15.conv.0.0.weight";
-    constants_info_[127].name = "mv2_features_15_conv_0_1_weight";
-    constants_info_[127].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[127].offset = 0;
-    constants_info_[127].data_size = 3840;
-    constants_info_[127].from_folded = false;
-    constants_info_[127].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[127].shape = {960};
-    constants_info_[127].stride = {1};
-    constants_info_[127].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[127].original_fqn = "mv2.features.15.conv.0.1.weight";
-    constants_info_[128].name = "mv2_features_15_conv_0_1_bias";
-    constants_info_[128].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[128].offset = 0;
-    constants_info_[128].data_size = 3840;
-    constants_info_[128].from_folded = false;
-    constants_info_[128].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[128].shape = {960};
-    constants_info_[128].stride = {1};
-    constants_info_[128].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[128].original_fqn = "mv2.features.15.conv.0.1.bias";
-    constants_info_[129].name = "mv2_features_15_conv_1_0_weight";
"mv2_features_15_conv_1_0_weight"; - constants_info_[129].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[129].offset = 0; - constants_info_[129].data_size = 34560; - constants_info_[129].from_folded = false; - constants_info_[129].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[129].shape = {960, 1, 3, 3}; - constants_info_[129].stride = {9, 9, 3, 1}; - constants_info_[129].layout = static_cast(cached_torch_layout_strided); - constants_info_[129].original_fqn = "mv2.features.15.conv.1.0.weight"; - constants_info_[130].name = "mv2_features_15_conv_1_1_weight"; - constants_info_[130].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[130].offset = 0; - constants_info_[130].data_size = 3840; - constants_info_[130].from_folded = false; - constants_info_[130].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[130].shape = {960}; - constants_info_[130].stride = {1}; - constants_info_[130].layout = static_cast(cached_torch_layout_strided); - constants_info_[130].original_fqn = "mv2.features.15.conv.1.1.weight"; - constants_info_[131].name = "mv2_features_15_conv_1_1_bias"; - constants_info_[131].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[131].offset = 0; - constants_info_[131].data_size = 3840; - constants_info_[131].from_folded = false; - constants_info_[131].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[131].shape = {960}; - constants_info_[131].stride = {1}; - constants_info_[131].layout = static_cast(cached_torch_layout_strided); - constants_info_[131].original_fqn = "mv2.features.15.conv.1.1.bias"; - constants_info_[132].name = "mv2_features_15_conv_2_weight"; - constants_info_[132].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[132].offset = 0; - constants_info_[132].data_size = 614400; - constants_info_[132].from_folded = false; - constants_info_[132].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[132].shape = {160, 960, 1, 1}; - constants_info_[132].stride = {960, 1, 1, 1}; - constants_info_[132].layout = static_cast(cached_torch_layout_strided); - constants_info_[132].original_fqn = "mv2.features.15.conv.2.weight"; - constants_info_[133].name = "mv2_features_15_conv_3_weight"; - constants_info_[133].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[133].offset = 0; - constants_info_[133].data_size = 640; - constants_info_[133].from_folded = false; - constants_info_[133].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[133].shape = {160}; - constants_info_[133].stride = {1}; - constants_info_[133].layout = static_cast(cached_torch_layout_strided); - constants_info_[133].original_fqn = "mv2.features.15.conv.3.weight"; - constants_info_[134].name = "mv2_features_15_conv_3_bias"; - constants_info_[134].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[134].offset = 0; - constants_info_[134].data_size = 640; - constants_info_[134].from_folded = false; - constants_info_[134].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[134].shape = {160}; - constants_info_[134].stride = {1}; - constants_info_[134].layout = static_cast(cached_torch_layout_strided); - constants_info_[134].original_fqn = "mv2.features.15.conv.3.bias"; - constants_info_[135].name = "mv2_features_16_conv_0_0_weight"; - constants_info_[135].dtype = static_cast(cached_torch_dtype_float32); - 
constants_info_[135].offset = 0; - constants_info_[135].data_size = 614400; - constants_info_[135].from_folded = false; - constants_info_[135].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[135].shape = {960, 160, 1, 1}; - constants_info_[135].stride = {160, 1, 1, 1}; - constants_info_[135].layout = static_cast(cached_torch_layout_strided); - constants_info_[135].original_fqn = "mv2.features.16.conv.0.0.weight"; - constants_info_[136].name = "mv2_features_16_conv_0_1_weight"; - constants_info_[136].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[136].offset = 0; - constants_info_[136].data_size = 3840; - constants_info_[136].from_folded = false; - constants_info_[136].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[136].shape = {960}; - constants_info_[136].stride = {1}; - constants_info_[136].layout = static_cast(cached_torch_layout_strided); - constants_info_[136].original_fqn = "mv2.features.16.conv.0.1.weight"; - constants_info_[137].name = "mv2_features_16_conv_0_1_bias"; - constants_info_[137].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[137].offset = 0; - constants_info_[137].data_size = 3840; - constants_info_[137].from_folded = false; - constants_info_[137].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[137].shape = {960}; - constants_info_[137].stride = {1}; - constants_info_[137].layout = static_cast(cached_torch_layout_strided); - constants_info_[137].original_fqn = "mv2.features.16.conv.0.1.bias"; - constants_info_[138].name = "mv2_features_16_conv_1_0_weight"; - constants_info_[138].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[138].offset = 0; - constants_info_[138].data_size = 34560; - constants_info_[138].from_folded = false; - constants_info_[138].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[138].shape = {960, 1, 3, 3}; - constants_info_[138].stride = {9, 9, 3, 1}; - constants_info_[138].layout = static_cast(cached_torch_layout_strided); - constants_info_[138].original_fqn = "mv2.features.16.conv.1.0.weight"; - constants_info_[139].name = "mv2_features_16_conv_1_1_weight"; - constants_info_[139].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[139].offset = 0; - constants_info_[139].data_size = 3840; - constants_info_[139].from_folded = false; - constants_info_[139].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[139].shape = {960}; - constants_info_[139].stride = {1}; - constants_info_[139].layout = static_cast(cached_torch_layout_strided); - constants_info_[139].original_fqn = "mv2.features.16.conv.1.1.weight"; - constants_info_[140].name = "mv2_features_16_conv_1_1_bias"; - constants_info_[140].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[140].offset = 0; - constants_info_[140].data_size = 3840; - constants_info_[140].from_folded = false; - constants_info_[140].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[140].shape = {960}; - constants_info_[140].stride = {1}; - constants_info_[140].layout = static_cast(cached_torch_layout_strided); - constants_info_[140].original_fqn = "mv2.features.16.conv.1.1.bias"; - constants_info_[141].name = "mv2_features_16_conv_2_weight"; - constants_info_[141].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[141].offset = 0; - constants_info_[141].data_size = 614400; - constants_info_[141].from_folded = 
false; - constants_info_[141].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[141].shape = {160, 960, 1, 1}; - constants_info_[141].stride = {960, 1, 1, 1}; - constants_info_[141].layout = static_cast(cached_torch_layout_strided); - constants_info_[141].original_fqn = "mv2.features.16.conv.2.weight"; - constants_info_[142].name = "mv2_features_16_conv_3_weight"; - constants_info_[142].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[142].offset = 0; - constants_info_[142].data_size = 640; - constants_info_[142].from_folded = false; - constants_info_[142].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[142].shape = {160}; - constants_info_[142].stride = {1}; - constants_info_[142].layout = static_cast(cached_torch_layout_strided); - constants_info_[142].original_fqn = "mv2.features.16.conv.3.weight"; - constants_info_[143].name = "mv2_features_16_conv_3_bias"; - constants_info_[143].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[143].offset = 0; - constants_info_[143].data_size = 640; - constants_info_[143].from_folded = false; - constants_info_[143].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[143].shape = {160}; - constants_info_[143].stride = {1}; - constants_info_[143].layout = static_cast(cached_torch_layout_strided); - constants_info_[143].original_fqn = "mv2.features.16.conv.3.bias"; - constants_info_[144].name = "mv2_features_17_conv_0_0_weight"; - constants_info_[144].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[144].offset = 0; - constants_info_[144].data_size = 614400; - constants_info_[144].from_folded = false; - constants_info_[144].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[144].shape = {960, 160, 1, 1}; - constants_info_[144].stride = {160, 1, 1, 1}; - constants_info_[144].layout = static_cast(cached_torch_layout_strided); - constants_info_[144].original_fqn = "mv2.features.17.conv.0.0.weight"; - constants_info_[145].name = "mv2_features_17_conv_0_1_weight"; - constants_info_[145].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[145].offset = 0; - constants_info_[145].data_size = 3840; - constants_info_[145].from_folded = false; - constants_info_[145].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[145].shape = {960}; - constants_info_[145].stride = {1}; - constants_info_[145].layout = static_cast(cached_torch_layout_strided); - constants_info_[145].original_fqn = "mv2.features.17.conv.0.1.weight"; - constants_info_[146].name = "mv2_features_17_conv_0_1_bias"; - constants_info_[146].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[146].offset = 0; - constants_info_[146].data_size = 3840; - constants_info_[146].from_folded = false; - constants_info_[146].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[146].shape = {960}; - constants_info_[146].stride = {1}; - constants_info_[146].layout = static_cast(cached_torch_layout_strided); - constants_info_[146].original_fqn = "mv2.features.17.conv.0.1.bias"; - constants_info_[147].name = "mv2_features_17_conv_1_0_weight"; - constants_info_[147].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[147].offset = 0; - constants_info_[147].data_size = 34560; - constants_info_[147].from_folded = false; - constants_info_[147].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - 
constants_info_[147].shape = {960, 1, 3, 3}; - constants_info_[147].stride = {9, 9, 3, 1}; - constants_info_[147].layout = static_cast(cached_torch_layout_strided); - constants_info_[147].original_fqn = "mv2.features.17.conv.1.0.weight"; - constants_info_[148].name = "mv2_features_17_conv_1_1_weight"; - constants_info_[148].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[148].offset = 0; - constants_info_[148].data_size = 3840; - constants_info_[148].from_folded = false; - constants_info_[148].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[148].shape = {960}; - constants_info_[148].stride = {1}; - constants_info_[148].layout = static_cast(cached_torch_layout_strided); - constants_info_[148].original_fqn = "mv2.features.17.conv.1.1.weight"; - constants_info_[149].name = "mv2_features_17_conv_1_1_bias"; - constants_info_[149].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[149].offset = 0; - constants_info_[149].data_size = 3840; - constants_info_[149].from_folded = false; - constants_info_[149].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[149].shape = {960}; - constants_info_[149].stride = {1}; - constants_info_[149].layout = static_cast(cached_torch_layout_strided); - constants_info_[149].original_fqn = "mv2.features.17.conv.1.1.bias"; - constants_info_[150].name = "mv2_features_17_conv_2_weight"; - constants_info_[150].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[150].offset = 0; - constants_info_[150].data_size = 1228800; - constants_info_[150].from_folded = false; - constants_info_[150].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[150].shape = {320, 960, 1, 1}; - constants_info_[150].stride = {960, 1, 1, 1}; - constants_info_[150].layout = static_cast(cached_torch_layout_strided); - constants_info_[150].original_fqn = "mv2.features.17.conv.2.weight"; - constants_info_[151].name = "mv2_features_17_conv_3_weight"; - constants_info_[151].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[151].offset = 0; - constants_info_[151].data_size = 1280; - constants_info_[151].from_folded = false; - constants_info_[151].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[151].shape = {320}; - constants_info_[151].stride = {1}; - constants_info_[151].layout = static_cast(cached_torch_layout_strided); - constants_info_[151].original_fqn = "mv2.features.17.conv.3.weight"; - constants_info_[152].name = "mv2_features_17_conv_3_bias"; - constants_info_[152].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[152].offset = 0; - constants_info_[152].data_size = 1280; - constants_info_[152].from_folded = false; - constants_info_[152].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[152].shape = {320}; - constants_info_[152].stride = {1}; - constants_info_[152].layout = static_cast(cached_torch_layout_strided); - constants_info_[152].original_fqn = "mv2.features.17.conv.3.bias"; - constants_info_[153].name = "mv2_features_18_0_weight"; - constants_info_[153].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[153].offset = 0; - constants_info_[153].data_size = 1638400; - constants_info_[153].from_folded = false; - constants_info_[153].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[153].shape = {1280, 320, 1, 1}; - constants_info_[153].stride = {320, 1, 1, 1}; - constants_info_[153].layout = 
static_cast(cached_torch_layout_strided); - constants_info_[153].original_fqn = "mv2.features.18.0.weight"; - constants_info_[154].name = "mv2_features_18_1_weight"; - constants_info_[154].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[154].offset = 0; - constants_info_[154].data_size = 5120; - constants_info_[154].from_folded = false; - constants_info_[154].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[154].shape = {1280}; - constants_info_[154].stride = {1}; - constants_info_[154].layout = static_cast(cached_torch_layout_strided); - constants_info_[154].original_fqn = "mv2.features.18.1.weight"; - constants_info_[155].name = "mv2_features_18_1_bias"; - constants_info_[155].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[155].offset = 0; - constants_info_[155].data_size = 5120; - constants_info_[155].from_folded = false; - constants_info_[155].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[155].shape = {1280}; - constants_info_[155].stride = {1}; - constants_info_[155].layout = static_cast(cached_torch_layout_strided); - constants_info_[155].original_fqn = "mv2.features.18.1.bias"; - constants_info_[156].name = "mv2_classifier_1_weight"; - constants_info_[156].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[156].offset = 0; - constants_info_[156].data_size = 5120000; - constants_info_[156].from_folded = false; - constants_info_[156].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[156].shape = {1000, 1280}; - constants_info_[156].stride = {1280, 1}; - constants_info_[156].layout = static_cast(cached_torch_layout_strided); - constants_info_[156].original_fqn = "mv2.classifier.1.weight"; - constants_info_[157].name = "mv2_classifier_1_bias"; - constants_info_[157].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[157].offset = 0; - constants_info_[157].data_size = 4000; - constants_info_[157].from_folded = false; - constants_info_[157].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[157].shape = {1000}; - constants_info_[157].stride = {1}; - constants_info_[157].layout = static_cast(cached_torch_layout_strided); - constants_info_[157].original_fqn = "mv2.classifier.1.bias"; - constants_info_[158].name = "mv2_features_0_1_running_mean"; - constants_info_[158].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[158].offset = 0; - constants_info_[158].data_size = 128; - constants_info_[158].from_folded = false; - constants_info_[158].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[158].shape = {32}; - constants_info_[158].stride = {1}; - constants_info_[158].layout = static_cast(cached_torch_layout_strided); - constants_info_[158].original_fqn = "mv2.features.0.1.running_mean"; - constants_info_[159].name = "mv2_features_0_1_running_var"; - constants_info_[159].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[159].offset = 0; - constants_info_[159].data_size = 128; - constants_info_[159].from_folded = false; - constants_info_[159].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[159].shape = {32}; - constants_info_[159].stride = {1}; - constants_info_[159].layout = static_cast(cached_torch_layout_strided); - constants_info_[159].original_fqn = "mv2.features.0.1.running_var"; - constants_info_[160].name = "mv2_features_1_conv_0_1_running_mean"; - constants_info_[160].dtype = 
static_cast(cached_torch_dtype_float32); - constants_info_[160].offset = 0; - constants_info_[160].data_size = 128; - constants_info_[160].from_folded = false; - constants_info_[160].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[160].shape = {32}; - constants_info_[160].stride = {1}; - constants_info_[160].layout = static_cast(cached_torch_layout_strided); - constants_info_[160].original_fqn = "mv2.features.1.conv.0.1.running_mean"; - constants_info_[161].name = "mv2_features_1_conv_0_1_running_var"; - constants_info_[161].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[161].offset = 0; - constants_info_[161].data_size = 128; - constants_info_[161].from_folded = false; - constants_info_[161].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[161].shape = {32}; - constants_info_[161].stride = {1}; - constants_info_[161].layout = static_cast(cached_torch_layout_strided); - constants_info_[161].original_fqn = "mv2.features.1.conv.0.1.running_var"; - constants_info_[162].name = "mv2_features_1_conv_2_running_mean"; - constants_info_[162].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[162].offset = 0; - constants_info_[162].data_size = 64; - constants_info_[162].from_folded = false; - constants_info_[162].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[162].shape = {16}; - constants_info_[162].stride = {1}; - constants_info_[162].layout = static_cast(cached_torch_layout_strided); - constants_info_[162].original_fqn = "mv2.features.1.conv.2.running_mean"; - constants_info_[163].name = "mv2_features_1_conv_2_running_var"; - constants_info_[163].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[163].offset = 0; - constants_info_[163].data_size = 64; - constants_info_[163].from_folded = false; - constants_info_[163].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[163].shape = {16}; - constants_info_[163].stride = {1}; - constants_info_[163].layout = static_cast(cached_torch_layout_strided); - constants_info_[163].original_fqn = "mv2.features.1.conv.2.running_var"; - constants_info_[164].name = "mv2_features_2_conv_0_1_running_mean"; - constants_info_[164].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[164].offset = 0; - constants_info_[164].data_size = 384; - constants_info_[164].from_folded = false; - constants_info_[164].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[164].shape = {96}; - constants_info_[164].stride = {1}; - constants_info_[164].layout = static_cast(cached_torch_layout_strided); - constants_info_[164].original_fqn = "mv2.features.2.conv.0.1.running_mean"; - constants_info_[165].name = "mv2_features_2_conv_0_1_running_var"; - constants_info_[165].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[165].offset = 0; - constants_info_[165].data_size = 384; - constants_info_[165].from_folded = false; - constants_info_[165].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[165].shape = {96}; - constants_info_[165].stride = {1}; - constants_info_[165].layout = static_cast(cached_torch_layout_strided); - constants_info_[165].original_fqn = "mv2.features.2.conv.0.1.running_var"; - constants_info_[166].name = "mv2_features_2_conv_1_1_running_mean"; - constants_info_[166].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[166].offset = 0; - constants_info_[166].data_size = 384; - 
constants_info_[166].from_folded = false; - constants_info_[166].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[166].shape = {96}; - constants_info_[166].stride = {1}; - constants_info_[166].layout = static_cast(cached_torch_layout_strided); - constants_info_[166].original_fqn = "mv2.features.2.conv.1.1.running_mean"; - constants_info_[167].name = "mv2_features_2_conv_1_1_running_var"; - constants_info_[167].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[167].offset = 0; - constants_info_[167].data_size = 384; - constants_info_[167].from_folded = false; - constants_info_[167].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[167].shape = {96}; - constants_info_[167].stride = {1}; - constants_info_[167].layout = static_cast(cached_torch_layout_strided); - constants_info_[167].original_fqn = "mv2.features.2.conv.1.1.running_var"; - constants_info_[168].name = "mv2_features_2_conv_3_running_mean"; - constants_info_[168].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[168].offset = 0; - constants_info_[168].data_size = 96; - constants_info_[168].from_folded = false; - constants_info_[168].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[168].shape = {24}; - constants_info_[168].stride = {1}; - constants_info_[168].layout = static_cast(cached_torch_layout_strided); - constants_info_[168].original_fqn = "mv2.features.2.conv.3.running_mean"; - constants_info_[169].name = "mv2_features_2_conv_3_running_var"; - constants_info_[169].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[169].offset = 0; - constants_info_[169].data_size = 96; - constants_info_[169].from_folded = false; - constants_info_[169].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[169].shape = {24}; - constants_info_[169].stride = {1}; - constants_info_[169].layout = static_cast(cached_torch_layout_strided); - constants_info_[169].original_fqn = "mv2.features.2.conv.3.running_var"; - constants_info_[170].name = "mv2_features_3_conv_0_1_running_mean"; - constants_info_[170].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[170].offset = 0; - constants_info_[170].data_size = 576; - constants_info_[170].from_folded = false; - constants_info_[170].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[170].shape = {144}; - constants_info_[170].stride = {1}; - constants_info_[170].layout = static_cast(cached_torch_layout_strided); - constants_info_[170].original_fqn = "mv2.features.3.conv.0.1.running_mean"; - constants_info_[171].name = "mv2_features_3_conv_0_1_running_var"; - constants_info_[171].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[171].offset = 0; - constants_info_[171].data_size = 576; - constants_info_[171].from_folded = false; - constants_info_[171].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[171].shape = {144}; - constants_info_[171].stride = {1}; - constants_info_[171].layout = static_cast(cached_torch_layout_strided); - constants_info_[171].original_fqn = "mv2.features.3.conv.0.1.running_var"; - constants_info_[172].name = "mv2_features_3_conv_1_1_running_mean"; - constants_info_[172].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[172].offset = 0; - constants_info_[172].data_size = 576; - constants_info_[172].from_folded = false; - constants_info_[172].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - 
constants_info_[172].shape = {144}; - constants_info_[172].stride = {1}; - constants_info_[172].layout = static_cast(cached_torch_layout_strided); - constants_info_[172].original_fqn = "mv2.features.3.conv.1.1.running_mean"; - constants_info_[173].name = "mv2_features_3_conv_1_1_running_var"; - constants_info_[173].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[173].offset = 0; - constants_info_[173].data_size = 576; - constants_info_[173].from_folded = false; - constants_info_[173].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[173].shape = {144}; - constants_info_[173].stride = {1}; - constants_info_[173].layout = static_cast(cached_torch_layout_strided); - constants_info_[173].original_fqn = "mv2.features.3.conv.1.1.running_var"; - constants_info_[174].name = "mv2_features_3_conv_3_running_mean"; - constants_info_[174].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[174].offset = 0; - constants_info_[174].data_size = 96; - constants_info_[174].from_folded = false; - constants_info_[174].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[174].shape = {24}; - constants_info_[174].stride = {1}; - constants_info_[174].layout = static_cast(cached_torch_layout_strided); - constants_info_[174].original_fqn = "mv2.features.3.conv.3.running_mean"; - constants_info_[175].name = "mv2_features_3_conv_3_running_var"; - constants_info_[175].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[175].offset = 0; - constants_info_[175].data_size = 96; - constants_info_[175].from_folded = false; - constants_info_[175].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[175].shape = {24}; - constants_info_[175].stride = {1}; - constants_info_[175].layout = static_cast(cached_torch_layout_strided); - constants_info_[175].original_fqn = "mv2.features.3.conv.3.running_var"; - constants_info_[176].name = "mv2_features_4_conv_0_1_running_mean"; - constants_info_[176].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[176].offset = 0; - constants_info_[176].data_size = 576; - constants_info_[176].from_folded = false; - constants_info_[176].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[176].shape = {144}; - constants_info_[176].stride = {1}; - constants_info_[176].layout = static_cast(cached_torch_layout_strided); - constants_info_[176].original_fqn = "mv2.features.4.conv.0.1.running_mean"; - constants_info_[177].name = "mv2_features_4_conv_0_1_running_var"; - constants_info_[177].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[177].offset = 0; - constants_info_[177].data_size = 576; - constants_info_[177].from_folded = false; - constants_info_[177].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[177].shape = {144}; - constants_info_[177].stride = {1}; - constants_info_[177].layout = static_cast(cached_torch_layout_strided); - constants_info_[177].original_fqn = "mv2.features.4.conv.0.1.running_var"; - constants_info_[178].name = "mv2_features_4_conv_1_1_running_mean"; - constants_info_[178].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[178].offset = 0; - constants_info_[178].data_size = 576; - constants_info_[178].from_folded = false; - constants_info_[178].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[178].shape = {144}; - constants_info_[178].stride = {1}; - constants_info_[178].layout = 
static_cast(cached_torch_layout_strided); - constants_info_[178].original_fqn = "mv2.features.4.conv.1.1.running_mean"; - constants_info_[179].name = "mv2_features_4_conv_1_1_running_var"; - constants_info_[179].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[179].offset = 0; - constants_info_[179].data_size = 576; - constants_info_[179].from_folded = false; - constants_info_[179].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[179].shape = {144}; - constants_info_[179].stride = {1}; - constants_info_[179].layout = static_cast(cached_torch_layout_strided); - constants_info_[179].original_fqn = "mv2.features.4.conv.1.1.running_var"; - constants_info_[180].name = "mv2_features_4_conv_3_running_mean"; - constants_info_[180].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[180].offset = 0; - constants_info_[180].data_size = 128; - constants_info_[180].from_folded = false; - constants_info_[180].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[180].shape = {32}; - constants_info_[180].stride = {1}; - constants_info_[180].layout = static_cast(cached_torch_layout_strided); - constants_info_[180].original_fqn = "mv2.features.4.conv.3.running_mean"; - constants_info_[181].name = "mv2_features_4_conv_3_running_var"; - constants_info_[181].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[181].offset = 0; - constants_info_[181].data_size = 128; - constants_info_[181].from_folded = false; - constants_info_[181].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[181].shape = {32}; - constants_info_[181].stride = {1}; - constants_info_[181].layout = static_cast(cached_torch_layout_strided); - constants_info_[181].original_fqn = "mv2.features.4.conv.3.running_var"; - constants_info_[182].name = "mv2_features_5_conv_0_1_running_mean"; - constants_info_[182].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[182].offset = 0; - constants_info_[182].data_size = 768; - constants_info_[182].from_folded = false; - constants_info_[182].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[182].shape = {192}; - constants_info_[182].stride = {1}; - constants_info_[182].layout = static_cast(cached_torch_layout_strided); - constants_info_[182].original_fqn = "mv2.features.5.conv.0.1.running_mean"; - constants_info_[183].name = "mv2_features_5_conv_0_1_running_var"; - constants_info_[183].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[183].offset = 0; - constants_info_[183].data_size = 768; - constants_info_[183].from_folded = false; - constants_info_[183].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[183].shape = {192}; - constants_info_[183].stride = {1}; - constants_info_[183].layout = static_cast(cached_torch_layout_strided); - constants_info_[183].original_fqn = "mv2.features.5.conv.0.1.running_var"; - constants_info_[184].name = "mv2_features_5_conv_1_1_running_mean"; - constants_info_[184].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[184].offset = 0; - constants_info_[184].data_size = 768; - constants_info_[184].from_folded = false; - constants_info_[184].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[184].shape = {192}; - constants_info_[184].stride = {1}; - constants_info_[184].layout = static_cast(cached_torch_layout_strided); - constants_info_[184].original_fqn = "mv2.features.5.conv.1.1.running_mean"; - 
constants_info_[185].name = "mv2_features_5_conv_1_1_running_var"; - constants_info_[185].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[185].offset = 0; - constants_info_[185].data_size = 768; - constants_info_[185].from_folded = false; - constants_info_[185].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[185].shape = {192}; - constants_info_[185].stride = {1}; - constants_info_[185].layout = static_cast(cached_torch_layout_strided); - constants_info_[185].original_fqn = "mv2.features.5.conv.1.1.running_var"; - constants_info_[186].name = "mv2_features_5_conv_3_running_mean"; - constants_info_[186].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[186].offset = 0; - constants_info_[186].data_size = 128; - constants_info_[186].from_folded = false; - constants_info_[186].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[186].shape = {32}; - constants_info_[186].stride = {1}; - constants_info_[186].layout = static_cast(cached_torch_layout_strided); - constants_info_[186].original_fqn = "mv2.features.5.conv.3.running_mean"; - constants_info_[187].name = "mv2_features_5_conv_3_running_var"; - constants_info_[187].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[187].offset = 0; - constants_info_[187].data_size = 128; - constants_info_[187].from_folded = false; - constants_info_[187].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[187].shape = {32}; - constants_info_[187].stride = {1}; - constants_info_[187].layout = static_cast(cached_torch_layout_strided); - constants_info_[187].original_fqn = "mv2.features.5.conv.3.running_var"; - constants_info_[188].name = "mv2_features_6_conv_0_1_running_mean"; - constants_info_[188].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[188].offset = 0; - constants_info_[188].data_size = 768; - constants_info_[188].from_folded = false; - constants_info_[188].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[188].shape = {192}; - constants_info_[188].stride = {1}; - constants_info_[188].layout = static_cast(cached_torch_layout_strided); - constants_info_[188].original_fqn = "mv2.features.6.conv.0.1.running_mean"; - constants_info_[189].name = "mv2_features_6_conv_0_1_running_var"; - constants_info_[189].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[189].offset = 0; - constants_info_[189].data_size = 768; - constants_info_[189].from_folded = false; - constants_info_[189].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[189].shape = {192}; - constants_info_[189].stride = {1}; - constants_info_[189].layout = static_cast(cached_torch_layout_strided); - constants_info_[189].original_fqn = "mv2.features.6.conv.0.1.running_var"; - constants_info_[190].name = "mv2_features_6_conv_1_1_running_mean"; - constants_info_[190].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[190].offset = 0; - constants_info_[190].data_size = 768; - constants_info_[190].from_folded = false; - constants_info_[190].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[190].shape = {192}; - constants_info_[190].stride = {1}; - constants_info_[190].layout = static_cast(cached_torch_layout_strided); - constants_info_[190].original_fqn = "mv2.features.6.conv.1.1.running_mean"; - constants_info_[191].name = "mv2_features_6_conv_1_1_running_var"; - constants_info_[191].dtype = 
static_cast(cached_torch_dtype_float32); - constants_info_[191].offset = 0; - constants_info_[191].data_size = 768; - constants_info_[191].from_folded = false; - constants_info_[191].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[191].shape = {192}; - constants_info_[191].stride = {1}; - constants_info_[191].layout = static_cast(cached_torch_layout_strided); - constants_info_[191].original_fqn = "mv2.features.6.conv.1.1.running_var"; - constants_info_[192].name = "mv2_features_6_conv_3_running_mean"; - constants_info_[192].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[192].offset = 0; - constants_info_[192].data_size = 128; - constants_info_[192].from_folded = false; - constants_info_[192].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[192].shape = {32}; - constants_info_[192].stride = {1}; - constants_info_[192].layout = static_cast(cached_torch_layout_strided); - constants_info_[192].original_fqn = "mv2.features.6.conv.3.running_mean"; - constants_info_[193].name = "mv2_features_6_conv_3_running_var"; - constants_info_[193].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[193].offset = 0; - constants_info_[193].data_size = 128; - constants_info_[193].from_folded = false; - constants_info_[193].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[193].shape = {32}; - constants_info_[193].stride = {1}; - constants_info_[193].layout = static_cast(cached_torch_layout_strided); - constants_info_[193].original_fqn = "mv2.features.6.conv.3.running_var"; - constants_info_[194].name = "mv2_features_7_conv_0_1_running_mean"; - constants_info_[194].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[194].offset = 0; - constants_info_[194].data_size = 768; - constants_info_[194].from_folded = false; - constants_info_[194].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[194].shape = {192}; - constants_info_[194].stride = {1}; - constants_info_[194].layout = static_cast(cached_torch_layout_strided); - constants_info_[194].original_fqn = "mv2.features.7.conv.0.1.running_mean"; - constants_info_[195].name = "mv2_features_7_conv_0_1_running_var"; - constants_info_[195].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[195].offset = 0; - constants_info_[195].data_size = 768; - constants_info_[195].from_folded = false; - constants_info_[195].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[195].shape = {192}; - constants_info_[195].stride = {1}; - constants_info_[195].layout = static_cast(cached_torch_layout_strided); - constants_info_[195].original_fqn = "mv2.features.7.conv.0.1.running_var"; - constants_info_[196].name = "mv2_features_7_conv_1_1_running_mean"; - constants_info_[196].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[196].offset = 0; - constants_info_[196].data_size = 768; - constants_info_[196].from_folded = false; - constants_info_[196].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[196].shape = {192}; - constants_info_[196].stride = {1}; - constants_info_[196].layout = static_cast(cached_torch_layout_strided); - constants_info_[196].original_fqn = "mv2.features.7.conv.1.1.running_mean"; - constants_info_[197].name = "mv2_features_7_conv_1_1_running_var"; - constants_info_[197].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[197].offset = 0; - constants_info_[197].data_size = 768; - 
constants_info_[197].from_folded = false; - constants_info_[197].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[197].shape = {192}; - constants_info_[197].stride = {1}; - constants_info_[197].layout = static_cast(cached_torch_layout_strided); - constants_info_[197].original_fqn = "mv2.features.7.conv.1.1.running_var"; - constants_info_[198].name = "mv2_features_7_conv_3_running_mean"; - constants_info_[198].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[198].offset = 0; - constants_info_[198].data_size = 256; - constants_info_[198].from_folded = false; - constants_info_[198].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[198].shape = {64}; - constants_info_[198].stride = {1}; - constants_info_[198].layout = static_cast(cached_torch_layout_strided); - constants_info_[198].original_fqn = "mv2.features.7.conv.3.running_mean"; - constants_info_[199].name = "mv2_features_7_conv_3_running_var"; - constants_info_[199].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[199].offset = 0; - constants_info_[199].data_size = 256; - constants_info_[199].from_folded = false; - constants_info_[199].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[199].shape = {64}; - constants_info_[199].stride = {1}; - constants_info_[199].layout = static_cast(cached_torch_layout_strided); - constants_info_[199].original_fqn = "mv2.features.7.conv.3.running_var"; - constants_info_[200].name = "mv2_features_8_conv_0_1_running_mean"; - constants_info_[200].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[200].offset = 0; - constants_info_[200].data_size = 1536; - constants_info_[200].from_folded = false; - constants_info_[200].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[200].shape = {384}; - constants_info_[200].stride = {1}; - constants_info_[200].layout = static_cast(cached_torch_layout_strided); - constants_info_[200].original_fqn = "mv2.features.8.conv.0.1.running_mean"; - constants_info_[201].name = "mv2_features_8_conv_0_1_running_var"; - constants_info_[201].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[201].offset = 0; - constants_info_[201].data_size = 1536; - constants_info_[201].from_folded = false; - constants_info_[201].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[201].shape = {384}; - constants_info_[201].stride = {1}; - constants_info_[201].layout = static_cast(cached_torch_layout_strided); - constants_info_[201].original_fqn = "mv2.features.8.conv.0.1.running_var"; - constants_info_[202].name = "mv2_features_8_conv_1_1_running_mean"; - constants_info_[202].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[202].offset = 0; - constants_info_[202].data_size = 1536; - constants_info_[202].from_folded = false; - constants_info_[202].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[202].shape = {384}; - constants_info_[202].stride = {1}; - constants_info_[202].layout = static_cast(cached_torch_layout_strided); - constants_info_[202].original_fqn = "mv2.features.8.conv.1.1.running_mean"; - constants_info_[203].name = "mv2_features_8_conv_1_1_running_var"; - constants_info_[203].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[203].offset = 0; - constants_info_[203].data_size = 1536; - constants_info_[203].from_folded = false; - constants_info_[203].type = 
static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[203].shape = {384}; - constants_info_[203].stride = {1}; - constants_info_[203].layout = static_cast(cached_torch_layout_strided); - constants_info_[203].original_fqn = "mv2.features.8.conv.1.1.running_var"; - constants_info_[204].name = "mv2_features_8_conv_3_running_mean"; - constants_info_[204].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[204].offset = 0; - constants_info_[204].data_size = 256; - constants_info_[204].from_folded = false; - constants_info_[204].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[204].shape = {64}; - constants_info_[204].stride = {1}; - constants_info_[204].layout = static_cast(cached_torch_layout_strided); - constants_info_[204].original_fqn = "mv2.features.8.conv.3.running_mean"; - constants_info_[205].name = "mv2_features_8_conv_3_running_var"; - constants_info_[205].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[205].offset = 0; - constants_info_[205].data_size = 256; - constants_info_[205].from_folded = false; - constants_info_[205].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[205].shape = {64}; - constants_info_[205].stride = {1}; - constants_info_[205].layout = static_cast(cached_torch_layout_strided); - constants_info_[205].original_fqn = "mv2.features.8.conv.3.running_var"; - constants_info_[206].name = "mv2_features_9_conv_0_1_running_mean"; - constants_info_[206].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[206].offset = 0; - constants_info_[206].data_size = 1536; - constants_info_[206].from_folded = false; - constants_info_[206].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[206].shape = {384}; - constants_info_[206].stride = {1}; - constants_info_[206].layout = static_cast(cached_torch_layout_strided); - constants_info_[206].original_fqn = "mv2.features.9.conv.0.1.running_mean"; - constants_info_[207].name = "mv2_features_9_conv_0_1_running_var"; - constants_info_[207].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[207].offset = 0; - constants_info_[207].data_size = 1536; - constants_info_[207].from_folded = false; - constants_info_[207].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[207].shape = {384}; - constants_info_[207].stride = {1}; - constants_info_[207].layout = static_cast(cached_torch_layout_strided); - constants_info_[207].original_fqn = "mv2.features.9.conv.0.1.running_var"; - constants_info_[208].name = "mv2_features_9_conv_1_1_running_mean"; - constants_info_[208].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[208].offset = 0; - constants_info_[208].data_size = 1536; - constants_info_[208].from_folded = false; - constants_info_[208].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[208].shape = {384}; - constants_info_[208].stride = {1}; - constants_info_[208].layout = static_cast(cached_torch_layout_strided); - constants_info_[208].original_fqn = "mv2.features.9.conv.1.1.running_mean"; - constants_info_[209].name = "mv2_features_9_conv_1_1_running_var"; - constants_info_[209].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[209].offset = 0; - constants_info_[209].data_size = 1536; - constants_info_[209].from_folded = false; - constants_info_[209].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[209].shape = {384}; - 
constants_info_[209].stride = {1}; - constants_info_[209].layout = static_cast(cached_torch_layout_strided); - constants_info_[209].original_fqn = "mv2.features.9.conv.1.1.running_var"; - constants_info_[210].name = "mv2_features_9_conv_3_running_mean"; - constants_info_[210].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[210].offset = 0; - constants_info_[210].data_size = 256; - constants_info_[210].from_folded = false; - constants_info_[210].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[210].shape = {64}; - constants_info_[210].stride = {1}; - constants_info_[210].layout = static_cast(cached_torch_layout_strided); - constants_info_[210].original_fqn = "mv2.features.9.conv.3.running_mean"; - constants_info_[211].name = "mv2_features_9_conv_3_running_var"; - constants_info_[211].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[211].offset = 0; - constants_info_[211].data_size = 256; - constants_info_[211].from_folded = false; - constants_info_[211].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[211].shape = {64}; - constants_info_[211].stride = {1}; - constants_info_[211].layout = static_cast(cached_torch_layout_strided); - constants_info_[211].original_fqn = "mv2.features.9.conv.3.running_var"; - constants_info_[212].name = "mv2_features_10_conv_0_1_running_mean"; - constants_info_[212].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[212].offset = 0; - constants_info_[212].data_size = 1536; - constants_info_[212].from_folded = false; - constants_info_[212].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[212].shape = {384}; - constants_info_[212].stride = {1}; - constants_info_[212].layout = static_cast(cached_torch_layout_strided); - constants_info_[212].original_fqn = "mv2.features.10.conv.0.1.running_mean"; - constants_info_[213].name = "mv2_features_10_conv_0_1_running_var"; - constants_info_[213].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[213].offset = 0; - constants_info_[213].data_size = 1536; - constants_info_[213].from_folded = false; - constants_info_[213].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[213].shape = {384}; - constants_info_[213].stride = {1}; - constants_info_[213].layout = static_cast(cached_torch_layout_strided); - constants_info_[213].original_fqn = "mv2.features.10.conv.0.1.running_var"; - constants_info_[214].name = "mv2_features_10_conv_1_1_running_mean"; - constants_info_[214].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[214].offset = 0; - constants_info_[214].data_size = 1536; - constants_info_[214].from_folded = false; - constants_info_[214].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[214].shape = {384}; - constants_info_[214].stride = {1}; - constants_info_[214].layout = static_cast(cached_torch_layout_strided); - constants_info_[214].original_fqn = "mv2.features.10.conv.1.1.running_mean"; - constants_info_[215].name = "mv2_features_10_conv_1_1_running_var"; - constants_info_[215].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[215].offset = 0; - constants_info_[215].data_size = 1536; - constants_info_[215].from_folded = false; - constants_info_[215].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[215].shape = {384}; - constants_info_[215].stride = {1}; - constants_info_[215].layout = static_cast(cached_torch_layout_strided); - 
constants_info_[215].original_fqn = "mv2.features.10.conv.1.1.running_var"; - constants_info_[216].name = "mv2_features_10_conv_3_running_mean"; - constants_info_[216].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[216].offset = 0; - constants_info_[216].data_size = 256; - constants_info_[216].from_folded = false; - constants_info_[216].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[216].shape = {64}; - constants_info_[216].stride = {1}; - constants_info_[216].layout = static_cast(cached_torch_layout_strided); - constants_info_[216].original_fqn = "mv2.features.10.conv.3.running_mean"; - constants_info_[217].name = "mv2_features_10_conv_3_running_var"; - constants_info_[217].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[217].offset = 0; - constants_info_[217].data_size = 256; - constants_info_[217].from_folded = false; - constants_info_[217].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[217].shape = {64}; - constants_info_[217].stride = {1}; - constants_info_[217].layout = static_cast(cached_torch_layout_strided); - constants_info_[217].original_fqn = "mv2.features.10.conv.3.running_var"; - constants_info_[218].name = "mv2_features_11_conv_0_1_running_mean"; - constants_info_[218].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[218].offset = 0; - constants_info_[218].data_size = 1536; - constants_info_[218].from_folded = false; - constants_info_[218].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[218].shape = {384}; - constants_info_[218].stride = {1}; - constants_info_[218].layout = static_cast(cached_torch_layout_strided); - constants_info_[218].original_fqn = "mv2.features.11.conv.0.1.running_mean"; - constants_info_[219].name = "mv2_features_11_conv_0_1_running_var"; - constants_info_[219].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[219].offset = 0; - constants_info_[219].data_size = 1536; - constants_info_[219].from_folded = false; - constants_info_[219].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[219].shape = {384}; - constants_info_[219].stride = {1}; - constants_info_[219].layout = static_cast(cached_torch_layout_strided); - constants_info_[219].original_fqn = "mv2.features.11.conv.0.1.running_var"; - constants_info_[220].name = "mv2_features_11_conv_1_1_running_mean"; - constants_info_[220].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[220].offset = 0; - constants_info_[220].data_size = 1536; - constants_info_[220].from_folded = false; - constants_info_[220].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[220].shape = {384}; - constants_info_[220].stride = {1}; - constants_info_[220].layout = static_cast(cached_torch_layout_strided); - constants_info_[220].original_fqn = "mv2.features.11.conv.1.1.running_mean"; - constants_info_[221].name = "mv2_features_11_conv_1_1_running_var"; - constants_info_[221].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[221].offset = 0; - constants_info_[221].data_size = 1536; - constants_info_[221].from_folded = false; - constants_info_[221].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[221].shape = {384}; - constants_info_[221].stride = {1}; - constants_info_[221].layout = static_cast(cached_torch_layout_strided); - constants_info_[221].original_fqn = "mv2.features.11.conv.1.1.running_var"; - constants_info_[222].name = 
"mv2_features_11_conv_3_running_mean"; - constants_info_[222].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[222].offset = 0; - constants_info_[222].data_size = 384; - constants_info_[222].from_folded = false; - constants_info_[222].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[222].shape = {96}; - constants_info_[222].stride = {1}; - constants_info_[222].layout = static_cast(cached_torch_layout_strided); - constants_info_[222].original_fqn = "mv2.features.11.conv.3.running_mean"; - constants_info_[223].name = "mv2_features_11_conv_3_running_var"; - constants_info_[223].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[223].offset = 0; - constants_info_[223].data_size = 384; - constants_info_[223].from_folded = false; - constants_info_[223].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[223].shape = {96}; - constants_info_[223].stride = {1}; - constants_info_[223].layout = static_cast(cached_torch_layout_strided); - constants_info_[223].original_fqn = "mv2.features.11.conv.3.running_var"; - constants_info_[224].name = "mv2_features_12_conv_0_1_running_mean"; - constants_info_[224].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[224].offset = 0; - constants_info_[224].data_size = 2304; - constants_info_[224].from_folded = false; - constants_info_[224].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[224].shape = {576}; - constants_info_[224].stride = {1}; - constants_info_[224].layout = static_cast(cached_torch_layout_strided); - constants_info_[224].original_fqn = "mv2.features.12.conv.0.1.running_mean"; - constants_info_[225].name = "mv2_features_12_conv_0_1_running_var"; - constants_info_[225].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[225].offset = 0; - constants_info_[225].data_size = 2304; - constants_info_[225].from_folded = false; - constants_info_[225].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[225].shape = {576}; - constants_info_[225].stride = {1}; - constants_info_[225].layout = static_cast(cached_torch_layout_strided); - constants_info_[225].original_fqn = "mv2.features.12.conv.0.1.running_var"; - constants_info_[226].name = "mv2_features_12_conv_1_1_running_mean"; - constants_info_[226].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[226].offset = 0; - constants_info_[226].data_size = 2304; - constants_info_[226].from_folded = false; - constants_info_[226].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[226].shape = {576}; - constants_info_[226].stride = {1}; - constants_info_[226].layout = static_cast(cached_torch_layout_strided); - constants_info_[226].original_fqn = "mv2.features.12.conv.1.1.running_mean"; - constants_info_[227].name = "mv2_features_12_conv_1_1_running_var"; - constants_info_[227].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[227].offset = 0; - constants_info_[227].data_size = 2304; - constants_info_[227].from_folded = false; - constants_info_[227].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[227].shape = {576}; - constants_info_[227].stride = {1}; - constants_info_[227].layout = static_cast(cached_torch_layout_strided); - constants_info_[227].original_fqn = "mv2.features.12.conv.1.1.running_var"; - constants_info_[228].name = "mv2_features_12_conv_3_running_mean"; - constants_info_[228].dtype = static_cast(cached_torch_dtype_float32); 
-    constants_info_[228].offset = 0;
-    constants_info_[228].data_size = 384;
-    constants_info_[228].from_folded = false;
-    constants_info_[228].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[228].shape = {96};
-    constants_info_[228].stride = {1};
-    constants_info_[228].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[228].original_fqn = "mv2.features.12.conv.3.running_mean";
-    constants_info_[229].name = "mv2_features_12_conv_3_running_var";
-    constants_info_[229].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[229].offset = 0;
-    constants_info_[229].data_size = 384;
-    constants_info_[229].from_folded = false;
-    constants_info_[229].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[229].shape = {96};
-    constants_info_[229].stride = {1};
-    constants_info_[229].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[229].original_fqn = "mv2.features.12.conv.3.running_var";
-    constants_info_[230].name = "mv2_features_13_conv_0_1_running_mean";
-    constants_info_[230].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[230].offset = 0;
-    constants_info_[230].data_size = 2304;
-    constants_info_[230].from_folded = false;
-    constants_info_[230].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[230].shape = {576};
-    constants_info_[230].stride = {1};
-    constants_info_[230].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[230].original_fqn = "mv2.features.13.conv.0.1.running_mean";
-    constants_info_[231].name = "mv2_features_13_conv_0_1_running_var";
-    constants_info_[231].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[231].offset = 0;
-    constants_info_[231].data_size = 2304;
-    constants_info_[231].from_folded = false;
-    constants_info_[231].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[231].shape = {576};
-    constants_info_[231].stride = {1};
-    constants_info_[231].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[231].original_fqn = "mv2.features.13.conv.0.1.running_var";
-    constants_info_[232].name = "mv2_features_13_conv_1_1_running_mean";
-    constants_info_[232].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[232].offset = 0;
-    constants_info_[232].data_size = 2304;
-    constants_info_[232].from_folded = false;
-    constants_info_[232].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[232].shape = {576};
-    constants_info_[232].stride = {1};
-    constants_info_[232].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[232].original_fqn = "mv2.features.13.conv.1.1.running_mean";
-    constants_info_[233].name = "mv2_features_13_conv_1_1_running_var";
-    constants_info_[233].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[233].offset = 0;
-    constants_info_[233].data_size = 2304;
-    constants_info_[233].from_folded = false;
-    constants_info_[233].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[233].shape = {576};
-    constants_info_[233].stride = {1};
-    constants_info_[233].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[233].original_fqn = "mv2.features.13.conv.1.1.running_var";
-    constants_info_[234].name = "mv2_features_13_conv_3_running_mean";
-    constants_info_[234].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[234].offset = 0;
-    constants_info_[234].data_size = 384;
-    constants_info_[234].from_folded = false;
-    constants_info_[234].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[234].shape = {96};
-    constants_info_[234].stride = {1};
-    constants_info_[234].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[234].original_fqn = "mv2.features.13.conv.3.running_mean";
-    constants_info_[235].name = "mv2_features_13_conv_3_running_var";
-    constants_info_[235].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[235].offset = 0;
-    constants_info_[235].data_size = 384;
-    constants_info_[235].from_folded = false;
-    constants_info_[235].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[235].shape = {96};
-    constants_info_[235].stride = {1};
-    constants_info_[235].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[235].original_fqn = "mv2.features.13.conv.3.running_var";
-    constants_info_[236].name = "mv2_features_14_conv_0_1_running_mean";
-    constants_info_[236].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[236].offset = 0;
-    constants_info_[236].data_size = 2304;
-    constants_info_[236].from_folded = false;
-    constants_info_[236].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[236].shape = {576};
-    constants_info_[236].stride = {1};
-    constants_info_[236].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[236].original_fqn = "mv2.features.14.conv.0.1.running_mean";
-    constants_info_[237].name = "mv2_features_14_conv_0_1_running_var";
-    constants_info_[237].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[237].offset = 0;
-    constants_info_[237].data_size = 2304;
-    constants_info_[237].from_folded = false;
-    constants_info_[237].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[237].shape = {576};
-    constants_info_[237].stride = {1};
-    constants_info_[237].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[237].original_fqn = "mv2.features.14.conv.0.1.running_var";
-    constants_info_[238].name = "mv2_features_14_conv_1_1_running_mean";
-    constants_info_[238].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[238].offset = 0;
-    constants_info_[238].data_size = 2304;
-    constants_info_[238].from_folded = false;
-    constants_info_[238].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[238].shape = {576};
-    constants_info_[238].stride = {1};
-    constants_info_[238].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[238].original_fqn = "mv2.features.14.conv.1.1.running_mean";
-    constants_info_[239].name = "mv2_features_14_conv_1_1_running_var";
-    constants_info_[239].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[239].offset = 0;
-    constants_info_[239].data_size = 2304;
-    constants_info_[239].from_folded = false;
-    constants_info_[239].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[239].shape = {576};
-    constants_info_[239].stride = {1};
-    constants_info_[239].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[239].original_fqn = "mv2.features.14.conv.1.1.running_var";
-    constants_info_[240].name = "mv2_features_14_conv_3_running_mean";
-    constants_info_[240].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[240].offset = 0;
-    constants_info_[240].data_size = 640;
-    constants_info_[240].from_folded = false;
-    constants_info_[240].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[240].shape = {160};
-    constants_info_[240].stride = {1};
-    constants_info_[240].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[240].original_fqn = "mv2.features.14.conv.3.running_mean";
-    constants_info_[241].name = "mv2_features_14_conv_3_running_var";
-    constants_info_[241].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[241].offset = 0;
-    constants_info_[241].data_size = 640;
-    constants_info_[241].from_folded = false;
-    constants_info_[241].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[241].shape = {160};
-    constants_info_[241].stride = {1};
-    constants_info_[241].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[241].original_fqn = "mv2.features.14.conv.3.running_var";
-    constants_info_[242].name = "mv2_features_15_conv_0_1_running_mean";
-    constants_info_[242].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[242].offset = 0;
-    constants_info_[242].data_size = 3840;
-    constants_info_[242].from_folded = false;
-    constants_info_[242].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[242].shape = {960};
-    constants_info_[242].stride = {1};
-    constants_info_[242].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[242].original_fqn = "mv2.features.15.conv.0.1.running_mean";
-    constants_info_[243].name = "mv2_features_15_conv_0_1_running_var";
-    constants_info_[243].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[243].offset = 0;
-    constants_info_[243].data_size = 3840;
-    constants_info_[243].from_folded = false;
-    constants_info_[243].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[243].shape = {960};
-    constants_info_[243].stride = {1};
-    constants_info_[243].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[243].original_fqn = "mv2.features.15.conv.0.1.running_var";
-    constants_info_[244].name = "mv2_features_15_conv_1_1_running_mean";
-    constants_info_[244].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[244].offset = 0;
-    constants_info_[244].data_size = 3840;
-    constants_info_[244].from_folded = false;
-    constants_info_[244].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[244].shape = {960};
-    constants_info_[244].stride = {1};
-    constants_info_[244].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[244].original_fqn = "mv2.features.15.conv.1.1.running_mean";
-    constants_info_[245].name = "mv2_features_15_conv_1_1_running_var";
-    constants_info_[245].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[245].offset = 0;
-    constants_info_[245].data_size = 3840;
-    constants_info_[245].from_folded = false;
-    constants_info_[245].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[245].shape = {960};
-    constants_info_[245].stride = {1};
-    constants_info_[245].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[245].original_fqn = "mv2.features.15.conv.1.1.running_var";
-    constants_info_[246].name = "mv2_features_15_conv_3_running_mean";
-    constants_info_[246].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[246].offset = 0;
-    constants_info_[246].data_size = 640;
-    constants_info_[246].from_folded = false;
-    constants_info_[246].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[246].shape = {160};
-    constants_info_[246].stride = {1};
-    constants_info_[246].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[246].original_fqn = "mv2.features.15.conv.3.running_mean";
-    constants_info_[247].name = "mv2_features_15_conv_3_running_var";
-    constants_info_[247].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[247].offset = 0;
-    constants_info_[247].data_size = 640;
-    constants_info_[247].from_folded = false;
-    constants_info_[247].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[247].shape = {160};
-    constants_info_[247].stride = {1};
-    constants_info_[247].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[247].original_fqn = "mv2.features.15.conv.3.running_var";
-    constants_info_[248].name = "mv2_features_16_conv_0_1_running_mean";
-    constants_info_[248].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[248].offset = 0;
-    constants_info_[248].data_size = 3840;
-    constants_info_[248].from_folded = false;
-    constants_info_[248].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[248].shape = {960};
-    constants_info_[248].stride = {1};
-    constants_info_[248].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[248].original_fqn = "mv2.features.16.conv.0.1.running_mean";
-    constants_info_[249].name = "mv2_features_16_conv_0_1_running_var";
-    constants_info_[249].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[249].offset = 0;
-    constants_info_[249].data_size = 3840;
-    constants_info_[249].from_folded = false;
-    constants_info_[249].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[249].shape = {960};
-    constants_info_[249].stride = {1};
-    constants_info_[249].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[249].original_fqn = "mv2.features.16.conv.0.1.running_var";
-    constants_info_[250].name = "mv2_features_16_conv_1_1_running_mean";
-    constants_info_[250].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[250].offset = 0;
-    constants_info_[250].data_size = 3840;
-    constants_info_[250].from_folded = false;
-    constants_info_[250].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[250].shape = {960};
-    constants_info_[250].stride = {1};
-    constants_info_[250].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[250].original_fqn = "mv2.features.16.conv.1.1.running_mean";
-    constants_info_[251].name = "mv2_features_16_conv_1_1_running_var";
-    constants_info_[251].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[251].offset = 0;
-    constants_info_[251].data_size = 3840;
-    constants_info_[251].from_folded = false;
-    constants_info_[251].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[251].shape = {960};
-    constants_info_[251].stride = {1};
-    constants_info_[251].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[251].original_fqn = "mv2.features.16.conv.1.1.running_var";
-    constants_info_[252].name = "mv2_features_16_conv_3_running_mean";
-    constants_info_[252].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[252].offset = 0;
-    constants_info_[252].data_size = 640;
-    constants_info_[252].from_folded = false;
-    constants_info_[252].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[252].shape = {160};
-    constants_info_[252].stride = {1};
-    constants_info_[252].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[252].original_fqn = "mv2.features.16.conv.3.running_mean";
-    constants_info_[253].name = "mv2_features_16_conv_3_running_var";
-    constants_info_[253].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[253].offset = 0;
-    constants_info_[253].data_size = 640;
-    constants_info_[253].from_folded = false;
-    constants_info_[253].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[253].shape = {160};
-    constants_info_[253].stride = {1};
-    constants_info_[253].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[253].original_fqn = "mv2.features.16.conv.3.running_var";
-    constants_info_[254].name = "mv2_features_17_conv_0_1_running_mean";
-    constants_info_[254].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[254].offset = 0;
-    constants_info_[254].data_size = 3840;
-    constants_info_[254].from_folded = false;
-    constants_info_[254].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[254].shape = {960};
-    constants_info_[254].stride = {1};
-    constants_info_[254].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[254].original_fqn = "mv2.features.17.conv.0.1.running_mean";
-    constants_info_[255].name = "mv2_features_17_conv_0_1_running_var";
-    constants_info_[255].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[255].offset = 0;
-    constants_info_[255].data_size = 3840;
-    constants_info_[255].from_folded = false;
-    constants_info_[255].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[255].shape = {960};
-    constants_info_[255].stride = {1};
-    constants_info_[255].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[255].original_fqn = "mv2.features.17.conv.0.1.running_var";
-    constants_info_[256].name = "mv2_features_17_conv_1_1_running_mean";
-    constants_info_[256].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[256].offset = 0;
-    constants_info_[256].data_size = 3840;
-    constants_info_[256].from_folded = false;
-    constants_info_[256].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[256].shape = {960};
-    constants_info_[256].stride = {1};
-    constants_info_[256].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[256].original_fqn = "mv2.features.17.conv.1.1.running_mean";
-    constants_info_[257].name = "mv2_features_17_conv_1_1_running_var";
-    constants_info_[257].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[257].offset = 0;
-    constants_info_[257].data_size = 3840;
-    constants_info_[257].from_folded = false;
-    constants_info_[257].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[257].shape = {960};
-    constants_info_[257].stride = {1};
-    constants_info_[257].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[257].original_fqn = "mv2.features.17.conv.1.1.running_var";
-    constants_info_[258].name = "mv2_features_17_conv_3_running_mean";
-    constants_info_[258].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[258].offset = 0;
-    constants_info_[258].data_size = 1280;
-    constants_info_[258].from_folded = false;
-    constants_info_[258].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[258].shape = {320};
-    constants_info_[258].stride = {1};
-    constants_info_[258].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[258].original_fqn = "mv2.features.17.conv.3.running_mean";
-    constants_info_[259].name = "mv2_features_17_conv_3_running_var";
-    constants_info_[259].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[259].offset = 0;
-    constants_info_[259].data_size = 1280;
-    constants_info_[259].from_folded = false;
-    constants_info_[259].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[259].shape = {320};
-    constants_info_[259].stride = {1};
-    constants_info_[259].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[259].original_fqn = "mv2.features.17.conv.3.running_var";
-    constants_info_[260].name = "mv2_features_18_1_running_mean";
-    constants_info_[260].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[260].offset = 0;
-    constants_info_[260].data_size = 5120;
-    constants_info_[260].from_folded = false;
-    constants_info_[260].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[260].shape = {1280};
-    constants_info_[260].stride = {1};
-    constants_info_[260].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[260].original_fqn = "mv2.features.18.1.running_mean";
-    constants_info_[261].name = "mv2_features_18_1_running_var";
-    constants_info_[261].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[261].offset = 0;
-    constants_info_[261].data_size = 5120;
-    constants_info_[261].from_folded = false;
-    constants_info_[261].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[261].shape = {1280};
-    constants_info_[261].stride = {1};
-    constants_info_[261].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[261].original_fqn = "mv2.features.18.1.running_var";
-    update_constants_map(std::move(constants_map));
-    update_constants_array(std::move(constants_array));
-    in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
-    out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])";
-    outputs_info_[0].name = "output0";
-    this->kernels_ = std::make_unique<AOTInductorModelKernels>();
-}
-
-std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
-    DeviceStreamType stream,
-    AOTIProxyExecutorHandle proxy_executor,
-    bool initialization
-) {
-
-    if (!initialization) {
-        std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
-                  << "aot_inductor.use_runtime_constant_folding=False\n";
-    }
-    return {};
-}
-} // namespace torch::aot_inductor
-using namespace torch::aot_inductor;
-
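The deleted block above is the tail of the generated constructor: one `constants_info_[i]` record per folded MobileNetV2 buffer (batch-norm running stats here), followed by the pytree in/out specs and kernel-struct allocation. As a minimal Python sketch (not the AOTInductor API; the `ConstantInfo` record below is a hypothetical mirror), the bookkeeping per constant is:

```python
# Hypothetical mirror of one constants_info_[i] entry, to make the fields concrete.
import math
from dataclasses import dataclass

@dataclass
class ConstantInfo:
    name: str           # sanitized symbol used inside the generated .so
    original_fqn: str   # fully-qualified name in the eager nn.Module
    shape: tuple        # e.g. (96,) for a BatchNorm running stat
    stride: tuple = (1,)
    dtype: str = "float32"
    offset: int = 0
    from_folded: bool = False

    @property
    def data_size(self) -> int:
        return 4 * math.prod(self.shape)  # float32: 4 bytes per element

c = ConstantInfo(
    name="mv2_features_12_conv_3_running_mean",
    original_fqn="mv2.features.12.conv.3.running_mean",
    shape=(96,),
)
assert c.data_size == 384  # matches constants_info_[228].data_size above
```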
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_0(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_0', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 4, 'x': 65536}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 451584, 'x': 602112}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 3
-        xnumel = 50176
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x1 = xindex
-        y0 = yindex
-        tmp0 = tl.load(in_ptr0 + (x1 + 50176*y0), xmask & ymask, eviction_policy='evict_last')
-        tl.store(out_ptr0 + (y0 + 3*x1), tmp0, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
-    uint32_t grid_1 = ((ynumel + (4 - 1)) / (4));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_0 == nullptr) {
-        kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cxzopurug2u2kff3zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin", "triton_poi_fused_convolution_0", 4160, cubin_dir_);
-    }
-    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_2 = ynumel;
-    int var_3 = xnumel;
-    CUdeviceptr global_scratch_4 = 0;
-    void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4};
-    launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4160, kernel_args_, stream_);
-}
-
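Every wrapper in this file follows the same launch pattern: ceiling-division grids per axis, an early return when any axis is empty, and a lazily loaded, cached cubin handle. A small sketch of that arithmetic (the sizes are the ones baked into `triton_poi_fused_convolution_0`; `ceil_div` is an illustrative helper, not a symbol from the generated code):

```python
# Sketch of the grid computation in call_triton_poi_fused_convolution_0.
def ceil_div(n: int, block: int) -> int:
    return (n + block - 1) // block  # same as ((n + (block - 1)) / block) in C++

ynumel, xnumel = 3, 50176            # constants specialized into the kernel
grid = (ceil_div(xnumel, 256), ceil_div(ynumel, 4), 1)
assert grid == (196, 1, 1)
if 0 in grid:
    pass  # the C++ wrapper returns early instead of launching an empty grid
```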
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_1(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_1', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 128, 'x': 16}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6912, 'x': 3456}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 96
-        xnumel = 9
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x2 = xindex
-        y3 = yindex
-        y0 = (yindex % 3)
-        y1 = yindex // 3
-        tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last')
-        tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (16 - 1)) / (16));
-    uint32_t grid_1 = ((ynumel + (64 - 1)) / (64));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_1 == nullptr) {
-        kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cwvumepeeo7fjwjgwncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin", "triton_poi_fused_convolution_1", 4352, cubin_dir_);
-    }
-    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_6 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_7 = ynumel;
-    int var_8 = xnumel;
-    CUdeviceptr global_scratch_9 = 0;
-    void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9};
-    launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_);
-}
-
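These two `convolution` kernels are pure layout shuffles: `_0` interleaves the input image's channels (making channel the innermost axis), and `_1` does the same for the first conv's weight. Reading the store index `y0 + 3*x2 + 27*y1` as a reshape, this is an OIHW-to-OHWI permutation; a numpy sketch (assuming a (32, 3, 3, 3) first-layer weight, which matches ynumel=96 and xnumel=9) that reproduces the index math:

```python
# Sketch: the index arithmetic of triton_poi_fused_convolution_1, in numpy.
import numpy as np

w = np.arange(32 * 3 * 3 * 3, dtype=np.float32).reshape(32, 3, 3, 3)  # OIHW
flat_in = w.reshape(96, 9)           # rows y3 = o*3 + i, cols x2 = kh*3 + kw

out = np.empty(32 * 27, dtype=np.float32)
for y3 in range(96):
    for x2 in range(9):
        y0, y1 = y3 % 3, y3 // 3     # y0: input channel, y1: output channel
        out[y0 + 3 * x2 + 27 * y1] = flat_in[y3, x2]   # the kernel's store index

# The result is exactly the channels-last (OHWI) permutation of the weight.
assert np.array_equal(out.reshape(32, 3, 3, 3), w.transpose(0, 2, 3, 1))
```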
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 524288},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 4817408}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 401408
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = tl.full([XBLOCK], True, tl.int1)
-        x2 = xindex
-        x0 = (xindex % 32)
-        tmp0 = tl.load(in_out_ptr0 + (x2), None)
-        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tmp16 = 0.0
-        tmp17 = triton_helpers.maximum(tmp15, tmp16)
-        tmp18 = 6.0
-        tmp19 = triton_helpers.minimum(tmp17, tmp18)
-        tl.store(in_out_ptr0 + (x2), tmp19, None)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2 = loadKernel("/home/gasoonjia/executorch/c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2", 0, cubin_dir_);
-    }
-    CUdeviceptr var_10 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_11 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_12 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_13 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_14 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_15 = xnumel;
-    CUdeviceptr global_scratch_16 = 0;
-    void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &var_14, &var_15, &global_scratch_16};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
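The `tmp0..tmp19` chain above is inference-mode batch normalization from running stats followed by a hardtanh clamp to [0, 6] (ReLU6); `x0 = xindex % 32` selects the per-channel parameter because channel is the innermost axis after the layout shuffle. A numpy sketch of the same arithmetic (shapes chosen to match xnumel = 401408 = 12544 x 32; parameter values are placeholders):

```python
# Sketch of the math in ..._batch_norm_legit_no_training_hardtanh_2.
import numpy as np

def bn_relu6(x, mean, var, gamma, beta, eps=1e-05):
    inv_std = 1.0 / np.sqrt(var + eps)           # tmp4..tmp8 in the kernel
    y = (x - mean) * inv_std * gamma + beta      # tmp2, tmp11, tmp13, tmp15
    return np.clip(y, 0.0, 6.0)                  # tmp16..tmp19 (hardtanh)

C = 32
x = np.random.randn(12544, C).astype(np.float32)     # 401408 elements
mean, var = np.zeros(C, np.float32), np.ones(C, np.float32)
gamma, beta = np.ones(C, np.float32), np.zeros(C, np.float32)

out = bn_relu6(x, mean, var, gamma, beta)            # broadcast over axis 1
assert out.min() >= 0.0 and out.max() <= 6.0
```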
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_3(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_3', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 262144},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_3', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 2408704}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_3(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 200704
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = tl.full([XBLOCK], True, tl.int1)
-        x2 = xindex
-        x0 = (xindex % 16)
-        tmp0 = tl.load(in_out_ptr0 + (x2), None)
-        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tl.store(in_out_ptr0 + (x2), tmp15, None)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3 = loadKernel("/home/gasoonjia/executorch/cgpouheql4rpwtcaretoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_3", 0, cubin_dir_);
-    }
-    CUdeviceptr var_17 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_18 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_19 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_20 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_21 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_22 = xnumel;
-    CUdeviceptr global_scratch_23 = 0;
-    void* kernel_args_[] = {&var_17, &var_18, &var_19, &var_20, &var_21, &var_22, &global_scratch_23};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
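The `_3` variant stops at `tmp15`: it is the same normalization with no clamp. That matches MobileNetV2's inverted-residual design, where the 1x1 projection conv is followed by a linear bottleneck (no activation). A two-line sketch to make the contrast concrete (illustrative values, not kernel data):

```python
# Sketch: the only difference from the hardtanh variants is the missing clamp.
import numpy as np

def bn_linear(x, mean, var, gamma, beta, eps=1e-05):
    return (x - mean) / np.sqrt(var + eps) * gamma + beta  # no np.clip here

y = bn_linear(np.array([-3.0, 8.0], np.float32), 0.0, 1.0, 1.0, 0.0)
assert y.min() < 0.0 and y.max() > 6.0   # values outside [0, 6] pass through
```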
"triton_poi_fused__native_batch_norm_legit_no_training_3", 0, cubin_dir_); - } - CUdeviceptr var_17 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_18 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_19 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_20 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_21 = reinterpret_cast(in_ptr3.data_ptr()); - int var_22 = xnumel; - CUdeviceptr global_scratch_23 = 0; - void* kernel_args_[] = {&var_17, &var_18, &var_19, &var_20, &var_21, &var_22, &global_scratch_23}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 2097152}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 14452224}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 1204224 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = tl.full([XBLOCK], True, tl.int1) - x2 = xindex - x0 = (xindex % 96) - tmp0 = tl.load(in_out_ptr0 + (x2), None) - tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), None, 
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 524288},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 3614208}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 301056
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 96)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tmp16 = 0.0
-        tmp17 = triton_helpers.maximum(tmp15, tmp16)
-        tmp18 = 6.0
-        tmp19 = triton_helpers.minimum(tmp17, tmp18)
-        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5 = loadKernel("/home/gasoonjia/executorch/c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5", 0, cubin_dir_);
-    }
-    CUdeviceptr var_31 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_32 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_33 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_34 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_35 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_36 = xnumel;
-    CUdeviceptr global_scratch_37 = 0;
-    void* kernel_args_[] = {&var_31, &var_32, &var_33, &var_34, &var_35, &var_36, &global_scratch_37};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
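Note the difference between the `_5` kernel and `_2`/`_4`: here `xmask = xindex < xnumel` is a real tail guard, while the earlier kernels replace it with a constant-true vector when the compiler decides no tail handling is needed for the block sizes it will pick. A sketch of what the mask buys (sizes below are illustrative, not from these kernels):

```python
# Sketch: the last program instance may cover indices past xnumel;
# masked loads/stores keep those lanes inert.
import numpy as np

xnumel, XBLOCK = 1000, 512                 # deliberately non-divisible
grid_0 = (xnumel + XBLOCK - 1) // XBLOCK   # ceil-div, as in the C++ wrapper
for pid in range(grid_0):
    xindex = pid * XBLOCK + np.arange(XBLOCK)
    xmask = xindex < xnumel                # False for the 24 overhanging lanes
    assert xmask.all() or pid == grid_0 - 1
```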
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_6(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_6', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 131072},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_6', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 903552}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_6(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 75264
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 24)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6 = loadKernel("/home/gasoonjia/executorch/ckneyyhrfy6dkwkb6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_6", 0, cubin_dir_);
-    }
-    CUdeviceptr var_38 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_39 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_40 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_41 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_42 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_43 = xnumel;
-    CUdeviceptr global_scratch_44 = 0;
-    void* kernel_args_[] = {&var_38, &var_39, &var_40, &var_41, &var_42, &var_43, &global_scratch_44};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 524288},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 5421312}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 451584
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 144)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tmp16 = 0.0
-        tmp17 = triton_helpers.maximum(tmp15, tmp16)
-        tmp18 = 6.0
-        tmp19 = triton_helpers.minimum(tmp17, tmp18)
-        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7 = loadKernel("/home/gasoonjia/executorch/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7", 0, cubin_dir_);
-    }
-    CUdeviceptr var_45 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_46 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_47 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_48 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_49 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_50 = xnumel;
-    CUdeviceptr global_scratch_51 = 0;
-    void* kernel_args_[] = {&var_45, &var_46, &var_47, &var_48, &var_49, &var_50, &global_scratch_51};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_8(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    const in_ptr4_type_& in_ptr4,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_8', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 131072},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_8', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1204608}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_add_8(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 75264
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 24)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x2), xmask)
-        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tmp1 - tmp2
-        tmp5 = 1e-05
-        tmp6 = tmp4 + tmp5
-        tmp7 = libdevice.sqrt(tmp6)
-        tmp8 = tl.full([1], 1, tl.int32)
-        tmp9 = (tmp8 / tmp7)
-        tmp10 = 1.0
-        tmp11 = tmp9 * tmp10
-        tmp12 = tmp3 * tmp11
-        tmp14 = tmp12 * tmp13
-        tmp16 = tmp14 + tmp15
-        tmp17 = tmp0 + tmp16
-        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8 = loadKernel("/home/gasoonjia/executorch/cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_8", 0, cubin_dir_);
-    }
-    CUdeviceptr var_52 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_53 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_54 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_55 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_56 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    CUdeviceptr var_57 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
-    int var_58 = xnumel;
-    CUdeviceptr global_scratch_59 = 0;
-    void* kernel_args_[] = {&var_52, &var_53, &var_54, &var_55, &var_56, &var_57, &var_58, &global_scratch_59};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
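The `add_8` kernel is the inverted-residual skip connection fused into the normalization: `in_out_ptr0` already holds the identity branch, `in_ptr0` holds the projection output, and `tmp17 = tmp0 + tmp16` adds the normalized projection in place. A numpy sketch of the same dataflow (placeholder parameter values; 75264 = 3136 x 24 matches `x0 = xindex % 24`):

```python
# Sketch of ..._batch_norm_legit_no_training_add_8: identity + BN(projection).
import numpy as np

def bn(x, mean, var, gamma, beta, eps=1e-05):
    return (x - mean) / np.sqrt(var + eps) * gamma + beta

C = 24
identity = np.random.randn(3136, C).astype(np.float32)   # what in_out_ptr0 holds
proj_out = np.random.randn(3136, C).astype(np.float32)   # what in_ptr0 holds
mean, var = np.zeros(C, np.float32), np.ones(C, np.float32)
gamma, beta = np.ones(C, np.float32), np.zeros(C, np.float32)

identity += bn(proj_out, mean, var, gamma, beta)          # tmp17 = tmp0 + tmp16
```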
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 131072},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1357056}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 112896
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 144)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tmp16 = 0.0
-        tmp17 = triton_helpers.maximum(tmp15, tmp16)
-        tmp18 = 6.0
-        tmp19 = triton_helpers.minimum(tmp17, tmp18)
-        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9 = loadKernel("/home/gasoonjia/executorch/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9", 0, cubin_dir_);
-    }
-    CUdeviceptr var_60 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_61 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_62 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_63 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_64 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_65 = xnumel;
-    CUdeviceptr global_scratch_66 = 0;
-    void* kernel_args_[] = {&var_60, &var_61, &var_62, &var_63, &var_64, &var_65, &global_scratch_66};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_10(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_10', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 32768},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 301568}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 25088
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 32)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 = loadKernel("/home/gasoonjia/executorch/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_10", 0, cubin_dir_);
-    }
-    CUdeviceptr var_67 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_68 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_69 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_70 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_71 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_72 = xnumel;
-    CUdeviceptr global_scratch_73 = 0;
-    void* kernel_args_[] = {&var_67, &var_68, &var_69, &var_70, &var_71, &var_72, &global_scratch_73};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
-}
-
torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 301568}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 25088 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 32) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tl.store(in_out_ptr0 + (x2), tmp15, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 = loadKernel("/home/gasoonjia/executorch/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_10", 0, cubin_dir_); - } - CUdeviceptr var_67 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_68 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_69 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_70 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_71 = reinterpret_cast(in_ptr3.data_ptr()); - int var_72 = xnumel; - CUdeviceptr global_scratch_73 = 0; - void* kernel_args_[] = {&var_67, &var_68, &var_69, &var_70, &var_71, &var_72, 
&global_scratch_73}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 262144}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1809408}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 150528 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 192) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ 
- uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11 = loadKernel("/home/gasoonjia/executorch/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11", 0, cubin_dir_); - } - CUdeviceptr var_74 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_75 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_76 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_77 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_78 = reinterpret_cast(in_ptr3.data_ptr()); - int var_79 = xnumel; - CUdeviceptr global_scratch_80 = 0; - void* kernel_args_[] = {&var_74, &var_75, &var_76, &var_77, &var_78, &var_79, &global_scratch_80}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_12( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - const in_ptr4_type_& in_ptr4, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_12', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_12', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 401920}}, - min_elem_per_thread=0 - ) - @triton.jit - def 
triton_poi_fused__native_batch_norm_legit_no_training_add_12(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr): - xnumel = 25088 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 32) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x2), xmask) - tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tmp1 - tmp2 - tmp5 = 1e-05 - tmp6 = tmp4 + tmp5 - tmp7 = libdevice.sqrt(tmp6) - tmp8 = tl.full([1], 1, tl.int32) - tmp9 = (tmp8 / tmp7) - tmp10 = 1.0 - tmp11 = tmp9 * tmp10 - tmp12 = tmp3 * tmp11 - tmp14 = tmp12 * tmp13 - tmp16 = tmp14 + tmp15 - tmp17 = tmp0 + tmp16 - tl.store(in_out_ptr0 + (x2), tmp17, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12 = loadKernel("/home/gasoonjia/executorch/c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_12", 0, cubin_dir_); - } - CUdeviceptr var_81 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_82 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_83 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_84 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_85 = reinterpret_cast(in_ptr3.data_ptr()); - CUdeviceptr var_86 = reinterpret_cast(in_ptr4.data_ptr()); - int var_87 = xnumel; - CUdeviceptr global_scratch_88 = 0; - void* kernel_args_[] = {&var_81, &var_82, &var_83, &var_84, &var_85, &var_86, &var_87, &global_scratch_88}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 65536}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): 
[['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 454656}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 37632 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 192) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13 = loadKernel("/home/gasoonjia/executorch/cxn357cdpjzfyhgfzkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13", 0, cubin_dir_); - } - CUdeviceptr var_89 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_90 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_91 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_92 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_93 = reinterpret_cast(in_ptr3.data_ptr()); - int var_94 = xnumel; - CUdeviceptr global_scratch_95 = 0; - void* kernel_args_[] = {&var_89, &var_90, &var_91, &var_92, &var_93, &var_94, &global_scratch_95}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_14( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& 
kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_14', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 16384}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_14', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 151552}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_14(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 12544 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 64) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tl.store(in_out_ptr0 + (x2), tmp15, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14 = loadKernel("/home/gasoonjia/executorch/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_14", 0, cubin_dir_); - } - CUdeviceptr var_96 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_97 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_98 = 
reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_99 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_100 = reinterpret_cast(in_ptr3.data_ptr()); - int var_101 = xnumel; - CUdeviceptr global_scratch_102 = 0; - void* kernel_args_[] = {&var_96, &var_97, &var_98, &var_99, &var_100, &var_101, &global_scratch_102}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 131072}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 909312}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 75264 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 384) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) 
- tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15 = loadKernel("/home/gasoonjia/executorch/caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15", 0, cubin_dir_); - } - CUdeviceptr var_103 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_104 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_105 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_106 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_107 = reinterpret_cast(in_ptr3.data_ptr()); - int var_108 = xnumel; - CUdeviceptr global_scratch_109 = 0; - void* kernel_args_[] = {&var_103, &var_104, &var_105, &var_106, &var_107, &var_108, &global_scratch_109}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_16( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - const in_ptr4_type_& in_ptr4, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_16', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 16384}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_16', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': 
None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 201728}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_add_16(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr): - xnumel = 12544 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 64) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x2), xmask) - tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tmp1 - tmp2 - tmp5 = 1e-05 - tmp6 = tmp4 + tmp5 - tmp7 = libdevice.sqrt(tmp6) - tmp8 = tl.full([1], 1, tl.int32) - tmp9 = (tmp8 / tmp7) - tmp10 = 1.0 - tmp11 = tmp9 * tmp10 - tmp12 = tmp3 * tmp11 - tmp14 = tmp12 * tmp13 - tmp16 = tmp14 + tmp15 - tmp17 = tmp0 + tmp16 - tl.store(in_out_ptr0 + (x2), tmp17, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16 = loadKernel("/home/gasoonjia/executorch/cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_16", 0, cubin_dir_); - } - CUdeviceptr var_110 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_111 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_112 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_113 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_114 = reinterpret_cast(in_ptr3.data_ptr()); - CUdeviceptr var_115 = reinterpret_cast(in_ptr4.data_ptr()); - int var_116 = xnumel; - CUdeviceptr global_scratch_117 = 0; - void* kernel_args_[] = {&var_110, &var_111, &var_112, &var_113, &var_114, &var_115, &var_116, &global_scratch_117}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_17( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_17', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': 
'*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_17', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 227328}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_17(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 18816 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 96) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tl.store(in_out_ptr0 + (x2), tmp15, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17 = loadKernel("/home/gasoonjia/executorch/ctc4njxfwewhkkjkreaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_17", 0, cubin_dir_); - } - CUdeviceptr var_118 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_119 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_120 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_121 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_122 = reinterpret_cast(in_ptr3.data_ptr()); - int var_123 = xnumel; - CUdeviceptr global_scratch_124 = 0; - void* kernel_args_[] = {&var_118, &var_119, &var_120, &var_121, &var_122, &var_123, &global_scratch_124}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const 
in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 131072}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1363968}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 112896 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 576) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18 = 
loadKernel("/home/gasoonjia/executorch/cklg2ezqvtkbhlekhvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18", 0, cubin_dir_); - } - CUdeviceptr var_125 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_126 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_127 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_128 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_129 = reinterpret_cast(in_ptr3.data_ptr()); - int var_130 = xnumel; - CUdeviceptr global_scratch_131 = 0; - void* kernel_args_[] = {&var_125, &var_126, &var_127, &var_128, &var_129, &var_130, &global_scratch_131}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_19( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - const in_ptr4_type_& in_ptr4, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_19', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_19', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 302592}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_add_19(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr): - xnumel = 18816 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 96) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) 
- tmp1 = tl.load(in_ptr0 + (x2), xmask) - tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tmp1 - tmp2 - tmp5 = 1e-05 - tmp6 = tmp4 + tmp5 - tmp7 = libdevice.sqrt(tmp6) - tmp8 = tl.full([1], 1, tl.int32) - tmp9 = (tmp8 / tmp7) - tmp10 = 1.0 - tmp11 = tmp9 * tmp10 - tmp12 = tmp3 * tmp11 - tmp14 = tmp12 * tmp13 - tmp16 = tmp14 + tmp15 - tmp17 = tmp0 + tmp16 - tl.store(in_out_ptr0 + (x2), tmp17, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19 = loadKernel("/home/gasoonjia/executorch/c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_19", 0, cubin_dir_); - } - CUdeviceptr var_132 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_133 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_134 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_135 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_136 = reinterpret_cast(in_ptr3.data_ptr()); - CUdeviceptr var_137 = reinterpret_cast(in_ptr4.data_ptr()); - int var_138 = xnumel; - CUdeviceptr global_scratch_139 = 0; - void* kernel_args_[] = {&var_132, &var_133, &var_134, &var_135, &var_136, &var_137, &var_138, &global_scratch_139}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20', 
'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 347904}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 28224 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 576) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20 = loadKernel("/home/gasoonjia/executorch/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20", 0, cubin_dir_); - } - CUdeviceptr var_140 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_141 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_142 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_143 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_144 = reinterpret_cast(in_ptr3.data_ptr()); - int var_145 = xnumel; - CUdeviceptr global_scratch_146 = 0; - void* kernel_args_[] = {&var_140, &var_141, &var_142, &var_143, &var_144, &var_145, &global_scratch_146}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_21( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_21', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import 
libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 8192}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_21', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 96640}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_21(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 7840 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 160) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tl.store(in_out_ptr0 + (x2), tmp15, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21 = loadKernel("/home/gasoonjia/executorch/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_21", 0, cubin_dir_); - } - CUdeviceptr var_147 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_148 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_149 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_150 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_151 = reinterpret_cast(in_ptr3.data_ptr()); - int var_152 = xnumel; - CUdeviceptr global_scratch_153 = 0; - void* kernel_args_[] = {&var_147, &var_148, &var_149, &var_150, &var_151, &var_152, &global_scratch_153}; - 
launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 65536}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 579840}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 47040 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 960) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = 
((xnumel + (512 - 1)) / (512)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22 = loadKernel("/home/gasoonjia/executorch/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22", 0, cubin_dir_); - } - CUdeviceptr var_154 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_155 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_156 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_157 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_158 = reinterpret_cast(in_ptr3.data_ptr()); - int var_159 = xnumel; - CUdeviceptr global_scratch_160 = 0; - void* kernel_args_[] = {&var_154, &var_155, &var_156, &var_157, &var_158, &var_159, &global_scratch_160}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_23( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - const in_ptr4_type_& in_ptr4, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_23', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 8192}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_23', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 128000}}, - min_elem_per_thread=0 - ) - @triton.jit - def 
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(
- const in_out_ptr0_type_& in_out_ptr0,
- const in_ptr0_type_& in_ptr0,
- const in_ptr1_type_& in_ptr1,
- const in_ptr2_type_& in_ptr2,
- const in_ptr3_type_& in_ptr3,
- const in_ptr4_type_& in_ptr4,
- int64_t xnumel,
- int32_t device_idx_,
- cudaStream_t stream_,
- kernels_type_& kernels_,
- const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
- /*
- async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_23', '''
- import triton
- import triton.language as tl
-
- from torch._inductor.runtime import triton_helpers, triton_heuristics
- from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
- from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
- triton_helpers.set_driver_to_gpu()
-
- @triton_heuristics.pointwise(
- size_hints={'x': 8192},
- filename=__file__,
- triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
- inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_23', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 128000}},
- min_elem_per_thread=0
- )
- @triton.jit
- def triton_poi_fused__native_batch_norm_legit_no_training_add_23(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
- xnumel = 7840
- xoffset = tl.program_id(0) * XBLOCK
- xindex = xoffset + tl.arange(0, XBLOCK)[:]
- xmask = xindex < xnumel
- x2 = xindex
- x0 = (xindex % 160)
- tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
- tmp1 = tl.load(in_ptr0 + (x2), xmask)
- tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
- tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
- tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
- tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
- tmp3 = tmp1 - tmp2
- tmp5 = 1e-05
- tmp6 = tmp4 + tmp5
- tmp7 = libdevice.sqrt(tmp6)
- tmp8 = tl.full([1], 1, tl.int32)
- tmp9 = (tmp8 / tmp7)
- tmp10 = 1.0
- tmp11 = tmp9 * tmp10
- tmp12 = tmp3 * tmp11
- tmp14 = tmp12 * tmp13
- tmp16 = tmp14 + tmp15
- tmp17 = tmp0 + tmp16
- tl.store(in_out_ptr0 + (x2), tmp17, xmask)
- ''', device_str='cuda')
- */
- uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
- uint32_t grid_1 = 1;
- uint32_t grid_2 = 1;
- if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
- if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23 == nullptr) {
- kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23 = loadKernel("/home/gasoonjia/executorch/c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_23", 0, cubin_dir_);
- }
- CUdeviceptr var_161 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
- CUdeviceptr var_162 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
- CUdeviceptr var_163 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
- CUdeviceptr var_164 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
- CUdeviceptr var_165 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
- CUdeviceptr var_166 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
- int var_167 = xnumel;
- CUdeviceptr global_scratch_168 = 0;
- void* kernel_args_[] = {&var_161, &var_162, &var_163, &var_164, &var_165, &var_166, &var_167, &global_scratch_168};
- launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
-}
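The `*_add_23` variant fuses the same normalization with a residual connection: the batch-normed branch (`in_ptr0`) is added to the skip tensor already held in `in_out_ptr0`, with no clamp. A scalar sketch under the same assumptions as above (names are illustrative):

    // out = skip + ((x - mean) / sqrt(var + eps)) * gamma + beta
    #include <cmath>

    static float bn_residual_add(float skip, float x, float mean, float var,
                                 float gamma, float beta) {
        const float eps = 1e-05f;  // tmp5 in the Triton source
        return skip + (x - mean) / std::sqrt(var + eps) * gamma + beta;
    }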
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_24(
- const in_out_ptr0_type_& in_out_ptr0,
- const in_ptr0_type_& in_ptr0,
- const in_ptr1_type_& in_ptr1,
- const in_ptr2_type_& in_ptr2,
- const in_ptr3_type_& in_ptr3,
- int64_t xnumel,
- int32_t device_idx_,
- cudaStream_t stream_,
- kernels_type_& kernels_,
- const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
- /*
- async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_24', '''
- import triton
- import triton.language as tl
-
- from torch._inductor.runtime import triton_helpers, triton_heuristics
- from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
- from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
- triton_helpers.set_driver_to_gpu()
-
- @triton_heuristics.pointwise(
- size_hints={'x': 16384},
- filename=__file__,
- triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
- inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_24', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 193280}},
- min_elem_per_thread=0
- )
- @triton.jit
- def triton_poi_fused__native_batch_norm_legit_no_training_24(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
- xnumel = 15680
- xoffset = tl.program_id(0) * XBLOCK
- xindex = xoffset + tl.arange(0, XBLOCK)[:]
- xmask = xindex < xnumel
- x2 = xindex
- x0 = (xindex % 320)
- tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
- tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
- tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
- tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
- tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
- tmp2 = tmp0 - tmp1
- tmp4 = 1e-05
- tmp5 = tmp3 + tmp4
- tmp6 = libdevice.sqrt(tmp5)
- tmp7 = tl.full([1], 1, tl.int32)
- tmp8 = (tmp7 / tmp6)
- tmp9 = 1.0
- tmp10 = tmp8 * tmp9
- tmp11 = tmp2 * tmp10
- tmp13 = tmp11 * tmp12
- tmp15 = tmp13 + tmp14
- tl.store(in_out_ptr0 + (x2), tmp15, xmask)
- ''', device_str='cuda')
- */
- uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
- uint32_t grid_1 = 1;
- uint32_t grid_2 = 1;
- if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
- if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24 == nullptr) {
- kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24 = loadKernel("/home/gasoonjia/executorch/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_24", 0, cubin_dir_);
- }
- CUdeviceptr var_169 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
- CUdeviceptr var_170 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
- CUdeviceptr var_171 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
- CUdeviceptr var_172 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
- CUdeviceptr var_173 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
- int var_174 = xnumel;
- CUdeviceptr global_scratch_175 = 0;
- void* kernel_args_[] = {&var_169, &var_170, &var_171, &var_172, &var_173, &var_174, &global_scratch_175};
- launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
-}
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
-static inline void call_triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(
- const in_out_ptr0_type_& in_out_ptr0,
- const in_ptr0_type_& in_ptr0,
- const in_ptr1_type_& in_ptr1,
- const in_ptr2_type_& in_ptr2,
- const in_ptr3_type_& in_ptr3,
- const in_ptr4_type_& in_ptr4,
- int64_t xnumel,
- int64_t r0_numel,
- int32_t device_idx_,
- cudaStream_t stream_,
- kernels_type_& kernels_,
- const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
- /*
- async_compile.triton('triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25', '''
- import triton
- import triton.language as tl
-
- from torch._inductor.runtime import triton_helpers, triton_heuristics
- from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
- from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
- triton_helpers.set_driver_to_gpu()
-
- @triton_heuristics.persistent_reduction(
- size_hints={'x': 2048, 'r0_': 64},
- reduction_hint=ReductionHint.OUTER,
- filename=__file__,
- triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
- inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 1, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 281600, 'r0_': 0}}
- )
- @triton.jit
- def triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr):
- xnumel = 1280
- r0_numel = 49
- R0_BLOCK: tl.constexpr = 64
- rnumel = r0_numel
- RBLOCK: tl.constexpr = R0_BLOCK
- xoffset = tl.program_id(0) * XBLOCK
- xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
- xmask = xindex < xnumel
- r0_index = tl.arange(0, R0_BLOCK)[None, :]
- r0_offset = 0
- r0_mask = r0_index < r0_numel
- roffset = r0_offset
- rindex = r0_index
- r0_1 = r0_index
- x0 = xindex
- tmp0 = tl.load(in_ptr0 + (x0 + 1280*r0_1), r0_mask & xmask, other=0.0)
- tmp1 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
- tmp3 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
- tmp12 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
- tmp14 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
- tmp2 = tmp0 - tmp1
- tmp4 = 1e-05
- tmp5 = tmp3 + tmp4
- tmp6 = libdevice.sqrt(tmp5)
- tmp7 = tl.full([1, 1], 1, tl.int32)
- tmp8 = (tmp7 / tmp6)
- tmp9 = 1.0
- tmp10 = tmp8 * tmp9
- tmp11 = tmp2 * tmp10
- tmp13 = tmp11 * tmp12
- tmp15 = tmp13 + tmp14
- tmp16 = 0.0
- tmp17 = triton_helpers.maximum(tmp15, tmp16)
- tmp18 = 6.0
- tmp19 = triton_helpers.minimum(tmp17, tmp18)
- tmp20 = tl.broadcast_to(tmp19, [XBLOCK, R0_BLOCK])
- tmp22 = tl.where(r0_mask & xmask, tmp20, 0)
- tmp23 = tl.sum(tmp22, 1)[:, None]
- tmp24 = 49.0
- tmp25 = (tmp23 / tmp24)
- tl.debug_barrier()
- tl.store(in_out_ptr0 + (x0), tmp25, xmask)
- ''', device_str='cuda')
- */
- uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
- uint32_t grid_1 = 1;
- uint32_t grid_2 = 1;
- if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
- if (kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25 == nullptr) {
- kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25 = loadKernel("/home/gasoonjia/executorch/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin", "triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25", 1024, cubin_dir_);
- }
- CUdeviceptr var_176 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
- CUdeviceptr var_177 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
- CUdeviceptr var_178 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
- CUdeviceptr var_179 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
- CUdeviceptr var_180 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
- CUdeviceptr var_181 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
- int var_182 = xnumel;
- int var_183 = r0_numel;
- CUdeviceptr global_scratch_184 = 0;
- void* kernel_args_[] = {&var_176, &var_177, &var_178, &var_179, &var_180, &var_181, &var_182, &var_183, &global_scratch_184};
- launchKernel(kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25, grid_0, grid_1, grid_2, 8, 1024, kernel_args_, stream_);
-}
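Kernel 25 is the one persistent reduction in this file: for each of the 1280 channels it applies the batch-norm transform plus ReLU6 to the 49 values of the final 7x7 feature map and averages them, i.e. MobileNetV2's global average pool before the classifier. A scalar sketch of the same computation, assuming the `x0 + 1280*r0_1` layout from the Triton source (names are illustrative):

    #include <algorithm>
    #include <cmath>

    // out[c] = mean over 49 positions of clamp(bn(in[r][c]), 0, 6)
    static void bn_hardtanh_mean(const float* in /* [49][1280] */, float* out,
                                 const float* mean, const float* var,
                                 const float* gamma, const float* beta) {
        for (int c = 0; c < 1280; ++c) {
            float acc = 0.0f;
            for (int r = 0; r < 49; ++r) {
                const float y = (in[r * 1280 + c] - mean[c])
                                / std::sqrt(var[c] + 1e-05f) * gamma[c] + beta[c];
                acc += std::min(std::max(y, 0.0f), 6.0f);
            }
            out[c] = acc / 49.0f;
        }
    }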
-
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_permute_copy_26(
- const in_ptr0_type_& in_ptr0,
- const out_ptr0_type_& out_ptr0,
- int64_t xnumel,
- int32_t device_idx_,
- cudaStream_t stream_,
- kernels_type_& kernels_,
- const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
- /*
- async_compile.triton('triton_poi_fused_permute_copy_26', '''
- import triton
- import triton.language as tl
-
- from torch._inductor.runtime import triton_helpers, triton_heuristics
- from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
- from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
- triton_helpers.set_driver_to_gpu()
-
- @triton_heuristics.pointwise(
- size_hints={'x': 2097152},
- filename=__file__,
- triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
- inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_permute_copy_26', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 15360000}},
- min_elem_per_thread=0
- )
- @triton.jit
- def triton_poi_fused_permute_copy_26(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
- xnumel = 1280000
- xoffset = tl.program_id(0) * XBLOCK
- xindex = xoffset + tl.arange(0, XBLOCK)[:]
- xmask = xindex < xnumel
- x0 = xindex
- tmp0 = tl.load(in_ptr0 + (x0), xmask)
- tl.store(out_ptr0 + (x0), tmp0, xmask)
- ''', device_str='cuda')
- */
- uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
- uint32_t grid_1 = 1;
- uint32_t grid_2 = 1;
- if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
- if (kernels_.triton_poi_fused_permute_copy_26 == nullptr) {
- kernels_.triton_poi_fused_permute_copy_26 = loadKernel("/home/gasoonjia/executorch/czj7vvfy745m4rwqvkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin", "triton_poi_fused_permute_copy_26", 0, cubin_dir_);
- }
- CUdeviceptr var_185 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
- CUdeviceptr var_186 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
- int var_187 = xnumel;
- CUdeviceptr global_scratch_188 = 0;
- void* kernel_args_[] = {&var_185, &var_186, &var_187, &global_scratch_188};
- launchKernel(kernels_.triton_poi_fused_permute_copy_26, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
-}
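Every wrapper above follows the same lazy-loading pattern: the first call loads the kernel's .cubin and caches the function handle in `kernels_`; later calls only compute the 1-D grid, `ceil(xnumel / XBLOCK)`, and launch. A reduced sketch of roughly what the `loadKernel`/`launchKernel` helpers amount to, written directly against the CUDA driver API (error handling elided; the helper names and num-warps values come from the calls above, everything else is illustrative):

    #include <cuda.h>
    #include <cstdint>

    // Load a compiled .cubin once and cache the function handle.
    static CUfunction get_kernel(CUfunction& cached, const char* cubin_path,
                                 const char* name) {
        if (cached == nullptr) {
            CUmodule mod;
            cuModuleLoad(&mod, cubin_path);
            cuModuleGetFunction(&cached, mod, name);
        }
        return cached;
    }

    // 1-D launch: grid_0 = ceil(xnumel / XBLOCK); the block is 32 threads per
    // warp times num_warps (4 for the pointwise kernels, 8 for the reduction).
    static void launch_1d(CUfunction f, uint32_t grid_0, unsigned num_warps,
                          unsigned shared_bytes, void** args, CUstream stream) {
        cuLaunchKernel(f, grid_0, 1, 1, 32u * num_warps, 1, 1,
                       shared_bytes, stream, args, nullptr);
    }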
-
-namespace torch::aot_inductor {
-
-void AOTInductorModel::_const_run_impl(
- std::vector<AtenTensorHandle>& output_handles,
- DeviceStreamType stream,
- AOTIProxyExecutorHandle proxy_executor
-) {}
-
-AOTI_NOINLINE static void check_input_0(
- AtenTensorHandle* input_handles
-) {
- ConstantHandle arg262_1 = ConstantHandle(input_handles[0]);
- int32_t arg262_1_dtype;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg262_1, &arg262_1_dtype));
-
- int32_t arg262_1_expected_dtype = aoti_torch_dtype_float32();
- if (arg262_1_expected_dtype != arg262_1_dtype) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dtype, "
- << "expected: " << arg262_1_expected_dtype << "(at::kFloat), "
- << "but got: " << arg262_1_dtype << "\n";
- throw std::runtime_error(ss.str());
- }
- auto arg262_1_size = arg262_1.sizes();
-
- if (1 != arg262_1_size[0]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dim value at 0, "
- << "expected: 1, " << "but got: " << arg262_1_size[0]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (3 != arg262_1_size[1]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dim value at 1, "
- << "expected: 3, " << "but got: " << arg262_1_size[1]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (224 != arg262_1_size[2]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dim value at 2, "
- << "expected: 224, " << "but got: " << arg262_1_size[2]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (224 != arg262_1_size[3]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dim value at 3, "
- << "expected: 224, " << "but got: " << arg262_1_size[3]
- << "\n";
- throw std::runtime_error(ss.str());
- }
- auto arg262_1_stride = arg262_1.strides();
-
- if (150528 != arg262_1_stride[0]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched stride value at 0, "
- << "expected: 150528, " << "but got: " << arg262_1_stride[0]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (50176 != arg262_1_stride[1]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched stride value at 1, "
- << "expected: 50176, " << "but got: " << arg262_1_stride[1]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (224 != arg262_1_stride[2]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched stride value at 2, "
- << "expected: 224, " << "but got: " << arg262_1_stride[2]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (1 != arg262_1_stride[3]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched stride value at 3, "
- << "expected: 1, " << "but got: " << arg262_1_stride[3]
- << "\n";
- throw std::runtime_error(ss.str());
- }
- int32_t arg262_1_device_type;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg262_1, &arg262_1_device_type));
-
- int32_t arg262_1_expected_device_type = 1;
- if (arg262_1_expected_device_type != arg262_1_device_type) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched device type, "
- << "expected: " << arg262_1_expected_device_type << "1(cuda), "
- << "but got: " << arg262_1_device_type << "\n";
- throw std::runtime_error(ss.str());
- }
-}
-
-static bool _check_aoti_runtime_check_inputs_env() {
- const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS");
- const static bool result = env_var_value != nullptr && env_var_value[0] != '0';
- return result;
-}
-
-AOTI_NOINLINE static void __check_inputs_outputs(
- AtenTensorHandle* input_handles,
- AtenTensorHandle* output_handles) {
- if (!_check_aoti_runtime_check_inputs_env()){
- return;
- }
- check_input_0(input_handles);
-}
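The shape/stride/dtype/device validation above is compiled in but off by default: `__check_inputs_outputs` returns immediately unless the `AOTI_RUNTIME_CHECK_INPUTS` environment variable is set to a value whose first character is not '0'. A hypothetical host-side snippet to opt in:

    #include <cstdlib>

    int main() {
        // Enable the generated input checks; any value not starting with '0' works.
        setenv("AOTI_RUNTIME_CHECK_INPUTS", "1", /*overwrite=*/1);
        // ... load and run the AOTI model; input 0 must now be a contiguous
        // float32 CUDA tensor of shape [1, 3, 224, 224] or run_impl throws.
        return 0;
    }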
-
-void AOTInductorModel::run_impl(
- AtenTensorHandle*
- input_handles, // array of input AtenTensorHandle; handles
- // are stolen; the array itself is borrowed
- AtenTensorHandle*
- output_handles, // array for writing output AtenTensorHandle; handles
- // will be stolen by the caller; the array itself is
- // borrowed
- DeviceStreamType stream,
- AOTIProxyExecutorHandle proxy_executor
-) {
- __check_inputs_outputs(input_handles, output_handles);
-
- auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1);
- auto arg262_1 = std::move(inputs[0]);
- [[maybe_unused]] auto& mv2_features_0_0_weight = constants_->at(0);
- [[maybe_unused]] auto& mv2_features_0_1_weight = constants_->at(1);
- [[maybe_unused]] auto& mv2_features_0_1_bias = constants_->at(2);
- [[maybe_unused]] auto& mv2_features_1_conv_0_0_weight = constants_->at(3);
- [[maybe_unused]] auto& mv2_features_1_conv_0_1_weight = constants_->at(4);
- [[maybe_unused]] auto& mv2_features_1_conv_0_1_bias = constants_->at(5);
- [[maybe_unused]] auto& mv2_features_1_conv_1_weight = constants_->at(6);
- [[maybe_unused]] auto& mv2_features_1_conv_2_weight = constants_->at(7);
- [[maybe_unused]] auto& mv2_features_1_conv_2_bias = constants_->at(8);
- [[maybe_unused]] auto& mv2_features_2_conv_0_0_weight = constants_->at(9);
- [[maybe_unused]] auto& mv2_features_2_conv_0_1_weight = constants_->at(10);
- [[maybe_unused]] auto& mv2_features_2_conv_0_1_bias = constants_->at(11);
- [[maybe_unused]] auto& mv2_features_2_conv_1_0_weight = constants_->at(12);
- [[maybe_unused]] auto& mv2_features_2_conv_1_1_weight = constants_->at(13);
- [[maybe_unused]] auto& mv2_features_2_conv_1_1_bias = constants_->at(14);
- [[maybe_unused]] auto& mv2_features_2_conv_2_weight = constants_->at(15);
- [[maybe_unused]] auto& mv2_features_2_conv_3_weight = constants_->at(16);
- [[maybe_unused]] auto& mv2_features_2_conv_3_bias = constants_->at(17);
- [[maybe_unused]] auto& mv2_features_3_conv_0_0_weight = constants_->at(18);
- [[maybe_unused]] auto& mv2_features_3_conv_0_1_weight = constants_->at(19);
- [[maybe_unused]] auto& mv2_features_3_conv_0_1_bias = constants_->at(20);
- [[maybe_unused]] auto& mv2_features_3_conv_1_0_weight = constants_->at(21);
- [[maybe_unused]] auto& mv2_features_3_conv_1_1_weight = constants_->at(22);
- [[maybe_unused]] auto& mv2_features_3_conv_1_1_bias = constants_->at(23);
- [[maybe_unused]] auto& mv2_features_3_conv_2_weight = constants_->at(24);
- [[maybe_unused]] auto& mv2_features_3_conv_3_weight = constants_->at(25);
- [[maybe_unused]] auto& mv2_features_3_conv_3_bias = constants_->at(26);
- [[maybe_unused]] auto& mv2_features_4_conv_0_0_weight = constants_->at(27);
- [[maybe_unused]] auto& mv2_features_4_conv_0_1_weight = constants_->at(28);
- [[maybe_unused]] auto& mv2_features_4_conv_0_1_bias = constants_->at(29);
- [[maybe_unused]] auto& mv2_features_4_conv_1_0_weight = constants_->at(30);
- [[maybe_unused]] auto& mv2_features_4_conv_1_1_weight = constants_->at(31);
- [[maybe_unused]] auto& mv2_features_4_conv_1_1_bias = constants_->at(32);
- [[maybe_unused]] auto& mv2_features_4_conv_2_weight = constants_->at(33);
- [[maybe_unused]] auto& mv2_features_4_conv_3_weight = constants_->at(34);
- [[maybe_unused]] auto& mv2_features_4_conv_3_bias = constants_->at(35);
- [[maybe_unused]] auto& mv2_features_5_conv_0_0_weight = constants_->at(36);
- [[maybe_unused]] auto& mv2_features_5_conv_0_1_weight = constants_->at(37);
- [[maybe_unused]] auto& mv2_features_5_conv_0_1_bias = constants_->at(38);
- [[maybe_unused]] auto& mv2_features_5_conv_1_0_weight = constants_->at(39);
- [[maybe_unused]] auto& mv2_features_5_conv_1_1_weight = constants_->at(40);
- [[maybe_unused]] auto& mv2_features_5_conv_1_1_bias = constants_->at(41);
- [[maybe_unused]] auto& mv2_features_5_conv_2_weight = constants_->at(42);
- [[maybe_unused]] auto& mv2_features_5_conv_3_weight = constants_->at(43);
- [[maybe_unused]] auto& mv2_features_5_conv_3_bias = constants_->at(44);
- [[maybe_unused]] auto& mv2_features_6_conv_0_0_weight = constants_->at(45);
- [[maybe_unused]] auto& mv2_features_6_conv_0_1_weight = constants_->at(46);
- [[maybe_unused]] auto& mv2_features_6_conv_0_1_bias = constants_->at(47);
- [[maybe_unused]] auto& mv2_features_6_conv_1_0_weight = constants_->at(48);
- [[maybe_unused]] auto& mv2_features_6_conv_1_1_weight = constants_->at(49);
- [[maybe_unused]] auto& mv2_features_6_conv_1_1_bias = constants_->at(50);
- [[maybe_unused]] auto& mv2_features_6_conv_2_weight = constants_->at(51);
- [[maybe_unused]] auto& mv2_features_6_conv_3_weight = constants_->at(52);
- [[maybe_unused]] auto& mv2_features_6_conv_3_bias = constants_->at(53);
- [[maybe_unused]] auto& mv2_features_7_conv_0_0_weight = constants_->at(54);
- [[maybe_unused]] auto& mv2_features_7_conv_0_1_weight = constants_->at(55);
- [[maybe_unused]] auto& mv2_features_7_conv_0_1_bias = constants_->at(56);
- [[maybe_unused]] auto& mv2_features_7_conv_1_0_weight = constants_->at(57);
- [[maybe_unused]] auto& mv2_features_7_conv_1_1_weight = constants_->at(58);
- [[maybe_unused]] auto& mv2_features_7_conv_1_1_bias = constants_->at(59);
- [[maybe_unused]] auto& mv2_features_7_conv_2_weight = constants_->at(60);
- [[maybe_unused]] auto& mv2_features_7_conv_3_weight = constants_->at(61);
- [[maybe_unused]] auto& mv2_features_7_conv_3_bias = constants_->at(62);
- [[maybe_unused]] auto& mv2_features_8_conv_0_0_weight = constants_->at(63);
- [[maybe_unused]] auto& mv2_features_8_conv_0_1_weight = constants_->at(64);
- [[maybe_unused]] auto& mv2_features_8_conv_0_1_bias = constants_->at(65);
- [[maybe_unused]] auto& mv2_features_8_conv_1_0_weight = constants_->at(66);
- [[maybe_unused]] auto& mv2_features_8_conv_1_1_weight = constants_->at(67);
- [[maybe_unused]] auto& mv2_features_8_conv_1_1_bias = constants_->at(68);
- [[maybe_unused]] auto& mv2_features_8_conv_2_weight = constants_->at(69);
- [[maybe_unused]] auto& mv2_features_8_conv_3_weight = constants_->at(70);
- [[maybe_unused]] auto& mv2_features_8_conv_3_bias = constants_->at(71);
- [[maybe_unused]] auto& mv2_features_9_conv_0_0_weight = constants_->at(72);
- [[maybe_unused]] auto& mv2_features_9_conv_0_1_weight = constants_->at(73);
- [[maybe_unused]] auto& mv2_features_9_conv_0_1_bias = constants_->at(74);
- [[maybe_unused]] auto& mv2_features_9_conv_1_0_weight = constants_->at(75);
- [[maybe_unused]] auto& mv2_features_9_conv_1_1_weight = constants_->at(76);
- [[maybe_unused]] auto& mv2_features_9_conv_1_1_bias = constants_->at(77);
- [[maybe_unused]] auto& mv2_features_9_conv_2_weight = constants_->at(78);
- [[maybe_unused]] auto& mv2_features_9_conv_3_weight = constants_->at(79);
- [[maybe_unused]] auto& mv2_features_9_conv_3_bias = constants_->at(80);
- [[maybe_unused]] auto& mv2_features_10_conv_0_0_weight = constants_->at(81);
- [[maybe_unused]] auto& mv2_features_10_conv_0_1_weight = constants_->at(82);
- [[maybe_unused]] auto& mv2_features_10_conv_0_1_bias = constants_->at(83);
- [[maybe_unused]] auto& mv2_features_10_conv_1_0_weight = constants_->at(84);
- [[maybe_unused]] auto& mv2_features_10_conv_1_1_weight = constants_->at(85);
- [[maybe_unused]] auto& mv2_features_10_conv_1_1_bias = constants_->at(86);
- [[maybe_unused]] auto& mv2_features_10_conv_2_weight = constants_->at(87);
- [[maybe_unused]] auto& mv2_features_10_conv_3_weight = constants_->at(88);
- [[maybe_unused]] auto& mv2_features_10_conv_3_bias = constants_->at(89);
- [[maybe_unused]] auto& mv2_features_11_conv_0_0_weight = constants_->at(90);
- [[maybe_unused]] auto& mv2_features_11_conv_0_1_weight = constants_->at(91);
- [[maybe_unused]] auto& mv2_features_11_conv_0_1_bias = constants_->at(92);
- [[maybe_unused]] auto& mv2_features_11_conv_1_0_weight = constants_->at(93);
- [[maybe_unused]] auto& mv2_features_11_conv_1_1_weight = constants_->at(94);
- [[maybe_unused]] auto& mv2_features_11_conv_1_1_bias = constants_->at(95);
- [[maybe_unused]] auto& mv2_features_11_conv_2_weight = constants_->at(96);
- [[maybe_unused]] auto& mv2_features_11_conv_3_weight = constants_->at(97);
- [[maybe_unused]] auto& mv2_features_11_conv_3_bias = constants_->at(98);
- [[maybe_unused]] auto& mv2_features_12_conv_0_0_weight = constants_->at(99);
- [[maybe_unused]] auto& mv2_features_12_conv_0_1_weight = constants_->at(100);
- [[maybe_unused]] auto& mv2_features_12_conv_0_1_bias = constants_->at(101);
- [[maybe_unused]] auto& mv2_features_12_conv_1_0_weight = constants_->at(102);
- [[maybe_unused]] auto& mv2_features_12_conv_1_1_weight = constants_->at(103);
- [[maybe_unused]] auto& mv2_features_12_conv_1_1_bias = constants_->at(104);
- [[maybe_unused]] auto& mv2_features_12_conv_2_weight = constants_->at(105);
- [[maybe_unused]] auto& mv2_features_12_conv_3_weight = constants_->at(106);
- [[maybe_unused]] auto& mv2_features_12_conv_3_bias = constants_->at(107);
- [[maybe_unused]] auto& mv2_features_13_conv_0_0_weight = constants_->at(108);
- [[maybe_unused]] auto& mv2_features_13_conv_0_1_weight = constants_->at(109);
- [[maybe_unused]] auto& mv2_features_13_conv_0_1_bias = constants_->at(110);
- [[maybe_unused]] auto& mv2_features_13_conv_1_0_weight = constants_->at(111);
- [[maybe_unused]] auto& mv2_features_13_conv_1_1_weight = constants_->at(112);
- [[maybe_unused]] auto& mv2_features_13_conv_1_1_bias = constants_->at(113);
- [[maybe_unused]] auto& mv2_features_13_conv_2_weight = constants_->at(114);
- [[maybe_unused]] auto& mv2_features_13_conv_3_weight = constants_->at(115);
- [[maybe_unused]] auto& mv2_features_13_conv_3_bias = constants_->at(116);
- [[maybe_unused]] auto& mv2_features_14_conv_0_0_weight = constants_->at(117);
- [[maybe_unused]] auto& mv2_features_14_conv_0_1_weight = constants_->at(118);
- [[maybe_unused]] auto& mv2_features_14_conv_0_1_bias = constants_->at(119);
- [[maybe_unused]] auto& mv2_features_14_conv_1_0_weight = constants_->at(120);
- [[maybe_unused]] auto& mv2_features_14_conv_1_1_weight = constants_->at(121);
- [[maybe_unused]] auto& mv2_features_14_conv_1_1_bias = constants_->at(122);
- [[maybe_unused]] auto& mv2_features_14_conv_2_weight = constants_->at(123);
- [[maybe_unused]] auto& mv2_features_14_conv_3_weight = constants_->at(124);
- [[maybe_unused]] auto& mv2_features_14_conv_3_bias = constants_->at(125);
- [[maybe_unused]] auto& mv2_features_15_conv_0_0_weight = constants_->at(126);
- [[maybe_unused]] auto& mv2_features_15_conv_0_1_weight = constants_->at(127);
- [[maybe_unused]] auto& mv2_features_15_conv_0_1_bias = constants_->at(128);
- [[maybe_unused]] auto& mv2_features_15_conv_1_0_weight = constants_->at(129);
- [[maybe_unused]] auto& mv2_features_15_conv_1_1_weight = constants_->at(130);
- [[maybe_unused]] auto& mv2_features_15_conv_1_1_bias = constants_->at(131);
- [[maybe_unused]] auto& mv2_features_15_conv_2_weight = constants_->at(132);
- [[maybe_unused]] auto& mv2_features_15_conv_3_weight = constants_->at(133);
- [[maybe_unused]] auto& mv2_features_15_conv_3_bias = constants_->at(134);
- [[maybe_unused]] auto& mv2_features_16_conv_0_0_weight = constants_->at(135);
- [[maybe_unused]] auto& mv2_features_16_conv_0_1_weight = constants_->at(136);
- [[maybe_unused]] auto& mv2_features_16_conv_0_1_bias = constants_->at(137);
- [[maybe_unused]] auto& mv2_features_16_conv_1_0_weight = constants_->at(138);
- [[maybe_unused]] auto& mv2_features_16_conv_1_1_weight = constants_->at(139);
- [[maybe_unused]] auto& mv2_features_16_conv_1_1_bias = constants_->at(140);
- [[maybe_unused]] auto& mv2_features_16_conv_2_weight = constants_->at(141);
- [[maybe_unused]] auto& mv2_features_16_conv_3_weight = constants_->at(142);
- [[maybe_unused]] auto& mv2_features_16_conv_3_bias = constants_->at(143);
- [[maybe_unused]] auto& mv2_features_17_conv_0_0_weight = constants_->at(144);
- [[maybe_unused]] auto& mv2_features_17_conv_0_1_weight = constants_->at(145);
- [[maybe_unused]] auto& mv2_features_17_conv_0_1_bias = constants_->at(146);
- [[maybe_unused]] auto& mv2_features_17_conv_1_0_weight = constants_->at(147);
- [[maybe_unused]] auto& mv2_features_17_conv_1_1_weight = constants_->at(148);
- [[maybe_unused]] auto& mv2_features_17_conv_1_1_bias = constants_->at(149);
- [[maybe_unused]] auto& mv2_features_17_conv_2_weight = constants_->at(150);
- [[maybe_unused]] auto& mv2_features_17_conv_3_weight = constants_->at(151);
- [[maybe_unused]] auto& mv2_features_17_conv_3_bias = constants_->at(152);
- [[maybe_unused]] auto& mv2_features_18_0_weight = constants_->at(153);
- [[maybe_unused]] auto& mv2_features_18_1_weight = constants_->at(154);
- [[maybe_unused]] auto& mv2_features_18_1_bias = constants_->at(155);
- [[maybe_unused]] auto& mv2_classifier_1_weight = constants_->at(156);
- [[maybe_unused]] auto& mv2_classifier_1_bias = constants_->at(157);
- [[maybe_unused]] auto& mv2_features_0_1_running_mean = constants_->at(158);
- [[maybe_unused]] auto& mv2_features_0_1_running_var = constants_->at(159);
- [[maybe_unused]] auto& mv2_features_1_conv_0_1_running_mean = constants_->at(160);
- [[maybe_unused]] auto& mv2_features_1_conv_0_1_running_var = constants_->at(161);
- [[maybe_unused]] auto& mv2_features_1_conv_2_running_mean = constants_->at(162);
- [[maybe_unused]] auto& mv2_features_1_conv_2_running_var = constants_->at(163);
- [[maybe_unused]] auto& mv2_features_2_conv_0_1_running_mean = constants_->at(164);
- [[maybe_unused]] auto& mv2_features_2_conv_0_1_running_var = constants_->at(165);
- [[maybe_unused]] auto& mv2_features_2_conv_1_1_running_mean = constants_->at(166);
- [[maybe_unused]] auto& mv2_features_2_conv_1_1_running_var = constants_->at(167);
- [[maybe_unused]] auto& mv2_features_2_conv_3_running_mean = constants_->at(168);
- [[maybe_unused]] auto& mv2_features_2_conv_3_running_var = constants_->at(169);
- [[maybe_unused]] auto& mv2_features_3_conv_0_1_running_mean = constants_->at(170);
- [[maybe_unused]] auto& mv2_features_3_conv_0_1_running_var = constants_->at(171);
- [[maybe_unused]] auto& mv2_features_3_conv_1_1_running_mean = constants_->at(172);
- [[maybe_unused]] auto& mv2_features_3_conv_1_1_running_var = constants_->at(173);
- [[maybe_unused]] auto& mv2_features_3_conv_3_running_mean = constants_->at(174);
- [[maybe_unused]] auto& mv2_features_3_conv_3_running_var = constants_->at(175);
- [[maybe_unused]] auto& mv2_features_4_conv_0_1_running_mean = constants_->at(176);
- [[maybe_unused]] auto& mv2_features_4_conv_0_1_running_var = constants_->at(177);
- [[maybe_unused]] auto& mv2_features_4_conv_1_1_running_mean = constants_->at(178);
- [[maybe_unused]] auto& mv2_features_4_conv_1_1_running_var = constants_->at(179);
- [[maybe_unused]] auto& mv2_features_4_conv_3_running_mean = constants_->at(180);
- [[maybe_unused]] auto& mv2_features_4_conv_3_running_var = constants_->at(181);
- [[maybe_unused]] auto& mv2_features_5_conv_0_1_running_mean = constants_->at(182);
- [[maybe_unused]] auto& mv2_features_5_conv_0_1_running_var = constants_->at(183);
- [[maybe_unused]] auto& mv2_features_5_conv_1_1_running_mean = constants_->at(184);
- [[maybe_unused]] auto& mv2_features_5_conv_1_1_running_var = constants_->at(185);
- [[maybe_unused]] auto& mv2_features_5_conv_3_running_mean = constants_->at(186);
- [[maybe_unused]] auto& mv2_features_5_conv_3_running_var = constants_->at(187);
- [[maybe_unused]] auto& mv2_features_6_conv_0_1_running_mean = constants_->at(188);
- [[maybe_unused]] auto& mv2_features_6_conv_0_1_running_var = constants_->at(189);
- [[maybe_unused]] auto& mv2_features_6_conv_1_1_running_mean = constants_->at(190);
- [[maybe_unused]] auto& mv2_features_6_conv_1_1_running_var = constants_->at(191);
- [[maybe_unused]] auto& mv2_features_6_conv_3_running_mean = constants_->at(192);
- [[maybe_unused]] auto& mv2_features_6_conv_3_running_var = constants_->at(193);
- [[maybe_unused]] auto& mv2_features_7_conv_0_1_running_mean = constants_->at(194);
- [[maybe_unused]] auto& mv2_features_7_conv_0_1_running_var = constants_->at(195);
- [[maybe_unused]] auto& mv2_features_7_conv_1_1_running_mean = constants_->at(196);
- [[maybe_unused]] auto& mv2_features_7_conv_1_1_running_var = constants_->at(197);
- [[maybe_unused]] auto& mv2_features_7_conv_3_running_mean = constants_->at(198);
- [[maybe_unused]] auto& mv2_features_7_conv_3_running_var = constants_->at(199);
- [[maybe_unused]] auto& mv2_features_8_conv_0_1_running_mean = constants_->at(200);
- [[maybe_unused]] auto& mv2_features_8_conv_0_1_running_var = constants_->at(201);
- [[maybe_unused]] auto& mv2_features_8_conv_1_1_running_mean = constants_->at(202);
- [[maybe_unused]] auto& mv2_features_8_conv_1_1_running_var = constants_->at(203);
- [[maybe_unused]] auto& mv2_features_8_conv_3_running_mean = constants_->at(204);
- [[maybe_unused]] auto& mv2_features_8_conv_3_running_var = constants_->at(205);
- [[maybe_unused]] auto& mv2_features_9_conv_0_1_running_mean = constants_->at(206);
- [[maybe_unused]] auto& mv2_features_9_conv_0_1_running_var = constants_->at(207);
- [[maybe_unused]] auto& mv2_features_9_conv_1_1_running_mean = constants_->at(208);
- [[maybe_unused]] auto& mv2_features_9_conv_1_1_running_var = constants_->at(209);
- [[maybe_unused]] auto& mv2_features_9_conv_3_running_mean = constants_->at(210);
- [[maybe_unused]] auto& mv2_features_9_conv_3_running_var = constants_->at(211);
- [[maybe_unused]] auto& mv2_features_10_conv_0_1_running_mean = constants_->at(212);
- [[maybe_unused]] auto& mv2_features_10_conv_0_1_running_var = constants_->at(213);
- [[maybe_unused]] auto& mv2_features_10_conv_1_1_running_mean = constants_->at(214);
- [[maybe_unused]] auto& mv2_features_10_conv_1_1_running_var = constants_->at(215);
- [[maybe_unused]] auto& mv2_features_10_conv_3_running_mean = constants_->at(216);
- [[maybe_unused]] auto& mv2_features_10_conv_3_running_var = constants_->at(217);
- [[maybe_unused]] auto& mv2_features_11_conv_0_1_running_mean = constants_->at(218);
- [[maybe_unused]] auto& mv2_features_11_conv_0_1_running_var = constants_->at(219);
- [[maybe_unused]] auto& mv2_features_11_conv_1_1_running_mean = constants_->at(220);
- [[maybe_unused]] auto& mv2_features_11_conv_1_1_running_var = constants_->at(221);
- [[maybe_unused]] auto& mv2_features_11_conv_3_running_mean = constants_->at(222);
- [[maybe_unused]] auto& mv2_features_11_conv_3_running_var = constants_->at(223);
- [[maybe_unused]] auto& mv2_features_12_conv_0_1_running_mean = constants_->at(224);
- [[maybe_unused]] auto& mv2_features_12_conv_0_1_running_var = constants_->at(225);
- [[maybe_unused]] auto& mv2_features_12_conv_1_1_running_mean = constants_->at(226);
- [[maybe_unused]] auto& mv2_features_12_conv_1_1_running_var = constants_->at(227);
- [[maybe_unused]] auto& mv2_features_12_conv_3_running_mean = constants_->at(228);
- [[maybe_unused]] auto& mv2_features_12_conv_3_running_var = constants_->at(229);
- [[maybe_unused]] auto& mv2_features_13_conv_0_1_running_mean = constants_->at(230);
- [[maybe_unused]] auto& mv2_features_13_conv_0_1_running_var = constants_->at(231);
- [[maybe_unused]] auto& mv2_features_13_conv_1_1_running_mean = constants_->at(232);
- [[maybe_unused]] auto& mv2_features_13_conv_1_1_running_var = constants_->at(233);
- [[maybe_unused]] auto& mv2_features_13_conv_3_running_mean = constants_->at(234);
- [[maybe_unused]] auto& mv2_features_13_conv_3_running_var = constants_->at(235);
- [[maybe_unused]] auto& mv2_features_14_conv_0_1_running_mean = constants_->at(236);
- [[maybe_unused]] auto& mv2_features_14_conv_0_1_running_var = constants_->at(237);
- [[maybe_unused]] auto& mv2_features_14_conv_1_1_running_mean = constants_->at(238);
- [[maybe_unused]] auto& mv2_features_14_conv_1_1_running_var = constants_->at(239);
- [[maybe_unused]] auto& mv2_features_14_conv_3_running_mean = constants_->at(240);
- [[maybe_unused]] auto& mv2_features_14_conv_3_running_var = constants_->at(241);
- [[maybe_unused]] auto& mv2_features_15_conv_0_1_running_mean = constants_->at(242);
- [[maybe_unused]] auto& mv2_features_15_conv_0_1_running_var = constants_->at(243);
- [[maybe_unused]] auto& mv2_features_15_conv_1_1_running_mean = constants_->at(244);
- [[maybe_unused]] auto& mv2_features_15_conv_1_1_running_var = constants_->at(245);
- [[maybe_unused]] auto& mv2_features_15_conv_3_running_mean = constants_->at(246);
- [[maybe_unused]] auto& mv2_features_15_conv_3_running_var = constants_->at(247);
- [[maybe_unused]] auto& mv2_features_16_conv_0_1_running_mean = constants_->at(248);
- [[maybe_unused]] auto& mv2_features_16_conv_0_1_running_var = constants_->at(249);
- [[maybe_unused]] auto& mv2_features_16_conv_1_1_running_mean = constants_->at(250);
- [[maybe_unused]] auto& mv2_features_16_conv_1_1_running_var = constants_->at(251);
- [[maybe_unused]] auto& mv2_features_16_conv_3_running_mean = constants_->at(252);
- [[maybe_unused]] auto& mv2_features_16_conv_3_running_var = constants_->at(253);
- [[maybe_unused]] auto& mv2_features_17_conv_0_1_running_mean = constants_->at(254);
- [[maybe_unused]] auto& mv2_features_17_conv_0_1_running_var = constants_->at(255);
- [[maybe_unused]] auto& mv2_features_17_conv_1_1_running_mean = constants_->at(256);
- [[maybe_unused]] auto& mv2_features_17_conv_1_1_running_var = constants_->at(257);
- [[maybe_unused]] auto& mv2_features_17_conv_3_running_mean = constants_->at(258);
- [[maybe_unused]] auto& mv2_features_17_conv_3_running_var = constants_->at(259);
- [[maybe_unused]] auto& mv2_features_18_1_running_mean = constants_->at(260);
- [[maybe_unused]] auto& mv2_features_18_1_running_var = constants_->at(261);
-
- if ((long(arg262_1.data_ptr()) & (16 -1)) != 0) {
- AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit.");
- AtenTensorHandle arg262_1_aligned;
- aoti_torch_clone_preserve_strides(arg262_1, &arg262_1_aligned);
- arg262_1 = std::move(RAIIAtenTensorHandle(arg262_1_aligned));
- }
- inputs.clear();
- [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
-
- AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
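`run_impl` also re-checks the 16-byte alignment the kernels were compiled to assume; a misaligned input is cloned (strides preserved) instead of rejected, trading one copy for correctness. The test is just a pointer mask, as in this sketch:

    #include <cstdint>

    // True when p meets the 16-byte alignment the compiled kernels assume.
    static bool is_16_byte_aligned(const void* p) {
        return (reinterpret_cast<std::uintptr_t>(p) & (16 - 1)) == 0;
    }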
- static constexpr int64_t int_array_0[] = {1L, 3L, 224L, 224L};
- static constexpr int64_t int_array_1[] = {150528L, 1L, 672L, 3L};
- AtenTensorHandle buf0_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
- RAIIAtenTensorHandle buf0(buf0_handle);
- // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
- call_triton_poi_fused_convolution_0(arg262_1, buf0, 3L, 50176L, this->device_idx_, stream, kernels, this->cubin_dir_);
- arg262_1.reset();
- static constexpr int64_t int_array_2[] = {32L, 3L, 3L, 3L};
- static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L};
- AtenTensorHandle buf1_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle));
- RAIIAtenTensorHandle buf1(buf1_handle);
- // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
- call_triton_poi_fused_convolution_1(mv2_features_0_0_weight, buf1, 96L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
- AtenTensorHandle buf2_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle));
- RAIIAtenTensorHandle buf2(buf2_handle);
- buf0.reset();
- buf1.reset();
- auto buf3 = std::move(buf2); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default, aten_hardtanh_default], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(buf3, mv2_features_0_1_running_mean, mv2_features_0_1_running_var, mv2_features_0_1_weight, mv2_features_0_1_bias, 401408L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default, aten_hardtanh_default, aten_convolution_default_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf4_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf3, mv2_features_1_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 32L, &buf4_handle));
- RAIIAtenTensorHandle buf4(buf4_handle);
- buf3.reset();
- auto buf5 = std::move(buf4); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_1, aten_hardtanh_default_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(buf5, mv2_features_1_conv_0_1_running_mean, mv2_features_1_conv_0_1_running_var, mv2_features_1_conv_0_1_weight, mv2_features_1_conv_0_1_bias, 401408L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_1, aten_hardtanh_default_1, aten_convolution_default_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf6_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf5, mv2_features_1_conv_1_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf6_handle));
- RAIIAtenTensorHandle buf6(buf6_handle);
- buf5.reset();
- auto buf7 = std::move(buf6); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_2], Original ATen: [aten._native_batch_norm_legit_no_training]
- call_triton_poi_fused__native_batch_norm_legit_no_training_3(buf7, mv2_features_1_conv_2_running_mean, mv2_features_1_conv_2_running_var, mv2_features_1_conv_2_weight, mv2_features_1_conv_2_bias, 200704L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_2, aten_convolution_default_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.convolution]
- AtenTensorHandle buf8_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf7, mv2_features_2_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf8_handle));
- RAIIAtenTensorHandle buf8(buf8_handle);
- buf7.reset();
- auto buf9 = std::move(buf8); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(buf9, mv2_features_2_conv_0_1_running_mean, mv2_features_2_conv_0_1_running_var, mv2_features_2_conv_0_1_weight, mv2_features_2_conv_0_1_bias, 1204224L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2, aten_convolution_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf10_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf9, mv2_features_2_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 96L, &buf10_handle));
- RAIIAtenTensorHandle buf10(buf10_handle);
- buf9.reset();
- auto buf11 = std::move(buf10); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_4, aten_hardtanh_default_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(buf11, mv2_features_2_conv_1_1_running_mean, mv2_features_2_conv_1_1_running_var, mv2_features_2_conv_1_1_weight, mv2_features_2_conv_1_1_bias, 301056L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_4, aten_hardtanh_default_3, aten_convolution_default_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf12_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf11, mv2_features_2_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf12_handle));
- RAIIAtenTensorHandle buf12(buf12_handle);
- buf11.reset();
- auto buf13 = std::move(buf12); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_5], Original ATen: [aten._native_batch_norm_legit_no_training]
- call_triton_poi_fused__native_batch_norm_legit_no_training_6(buf13, mv2_features_2_conv_3_running_mean, mv2_features_2_conv_3_running_var, mv2_features_2_conv_3_weight, mv2_features_2_conv_3_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten_convolution_default_6], Original ATen: [aten.convolution]
- AtenTensorHandle buf14_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf13, mv2_features_3_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf14_handle));
- RAIIAtenTensorHandle buf14(buf14_handle);
- auto buf15 = std::move(buf14); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_6, aten_hardtanh_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf15, mv2_features_3_conv_0_1_running_mean, mv2_features_3_conv_0_1_running_var, mv2_features_3_conv_0_1_weight, mv2_features_3_conv_0_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_6, aten_hardtanh_default_4, aten_convolution_default_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf16_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf15, mv2_features_3_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 144L, &buf16_handle));
- RAIIAtenTensorHandle buf16(buf16_handle);
- buf15.reset();
- auto buf17 = std::move(buf16); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_7, aten_hardtanh_default_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf17, mv2_features_3_conv_1_1_running_mean, mv2_features_3_conv_1_1_running_var, mv2_features_3_conv_1_1_weight, mv2_features_3_conv_1_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_7, aten_hardtanh_default_5, aten_convolution_default_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf18_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf17, mv2_features_3_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf18_handle));
- RAIIAtenTensorHandle buf18(buf18_handle);
- buf17.reset();
- auto buf19 = std::move(buf13); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_8, aten_add_tensor], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add]
- call_triton_poi_fused__native_batch_norm_legit_no_training_add_8(buf19, buf18, mv2_features_3_conv_3_running_mean, mv2_features_3_conv_3_running_var, mv2_features_3_conv_3_weight, mv2_features_3_conv_3_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_);
- buf18.reset();
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_8, aten_add_tensor, aten_convolution_default_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution]
- AtenTensorHandle buf20_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf19, mv2_features_4_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf20_handle));
- RAIIAtenTensorHandle buf20(buf20_handle);
- buf19.reset();
- auto buf21 = std::move(buf20); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_9, aten_hardtanh_default_6], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf21, mv2_features_4_conv_0_1_running_mean, mv2_features_4_conv_0_1_running_var, mv2_features_4_conv_0_1_weight, mv2_features_4_conv_0_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_9, aten_hardtanh_default_6, aten_convolution_default_10], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf22_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf21, mv2_features_4_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 144L, &buf22_handle));
- RAIIAtenTensorHandle buf22(buf22_handle);
- buf21.reset();
- auto buf23 = std::move(buf22); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_10, aten_hardtanh_default_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(buf23, mv2_features_4_conv_1_1_running_mean, mv2_features_4_conv_1_1_running_var, mv2_features_4_conv_1_1_weight, mv2_features_4_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_10, aten_hardtanh_default_7, aten_convolution_default_11], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf24_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf23, mv2_features_4_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf24_handle));
- RAIIAtenTensorHandle buf24(buf24_handle);
- buf23.reset();
- auto buf25 = std::move(buf24); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_11], Original ATen: [aten._native_batch_norm_legit_no_training]
- call_triton_poi_fused__native_batch_norm_legit_no_training_10(buf25, mv2_features_4_conv_3_running_mean, mv2_features_4_conv_3_running_var, mv2_features_4_conv_3_weight, mv2_features_4_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten_convolution_default_12], Original ATen: [aten.convolution]
- AtenTensorHandle buf26_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf25, mv2_features_5_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf26_handle));
- RAIIAtenTensorHandle buf26(buf26_handle);
- auto buf27 = std::move(buf26); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_12, aten_hardtanh_default_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf27, mv2_features_5_conv_0_1_running_mean, mv2_features_5_conv_0_1_running_var, mv2_features_5_conv_0_1_weight, mv2_features_5_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_12, aten_hardtanh_default_8, aten_convolution_default_13], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf28_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf27, mv2_features_5_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf28_handle));
- RAIIAtenTensorHandle buf28(buf28_handle);
- buf27.reset();
- auto buf29 = std::move(buf28); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_13, aten_hardtanh_default_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf29, mv2_features_5_conv_1_1_running_mean, mv2_features_5_conv_1_1_running_var, mv2_features_5_conv_1_1_weight, mv2_features_5_conv_1_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_13, aten_hardtanh_default_9, aten_convolution_default_14], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf30_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf29, mv2_features_5_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf30_handle));
- RAIIAtenTensorHandle buf30(buf30_handle);
- buf29.reset();
- auto buf31 = std::move(buf25); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_14, aten_add_tensor_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add]
- call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(buf31, buf30, mv2_features_5_conv_3_running_mean, mv2_features_5_conv_3_running_var, mv2_features_5_conv_3_weight, mv2_features_5_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_);
- buf30.reset();
- // Topologically Sorted Source Nodes: [aten_convolution_default_15], Original ATen: [aten.convolution]
- AtenTensorHandle buf32_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf31, mv2_features_6_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf32_handle));
- RAIIAtenTensorHandle buf32(buf32_handle);
- auto buf33 = std::move(buf32); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_15, aten_hardtanh_default_10], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf33, mv2_features_6_conv_0_1_running_mean, mv2_features_6_conv_0_1_running_var, mv2_features_6_conv_0_1_weight, mv2_features_6_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_15, aten_hardtanh_default_10, aten_convolution_default_16], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf34_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf33, mv2_features_6_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf34_handle));
- RAIIAtenTensorHandle buf34(buf34_handle);
- buf33.reset();
- auto buf35 = std::move(buf34); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_16, aten_hardtanh_default_11], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf35, mv2_features_6_conv_1_1_running_mean, mv2_features_6_conv_1_1_running_var, mv2_features_6_conv_1_1_weight, mv2_features_6_conv_1_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_16, aten_hardtanh_default_11, aten_convolution_default_17], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf36_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf35, mv2_features_6_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf36_handle));
- RAIIAtenTensorHandle buf36(buf36_handle);
- buf35.reset();
- auto buf37 = std::move(buf31); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_17, aten_add_tensor_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add]
- call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(buf37, buf36, mv2_features_6_conv_3_running_mean, mv2_features_6_conv_3_running_var, mv2_features_6_conv_3_weight, mv2_features_6_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_);
- buf36.reset();
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_17, aten_add_tensor_2, aten_convolution_default_18], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution]
- AtenTensorHandle buf38_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf37, mv2_features_7_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf38_handle));
- RAIIAtenTensorHandle buf38(buf38_handle);
- buf37.reset();
- auto buf39 = std::move(buf38); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_18, aten_hardtanh_default_12], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf39, mv2_features_7_conv_0_1_running_mean, mv2_features_7_conv_0_1_running_var, mv2_features_7_conv_0_1_weight, mv2_features_7_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_18, aten_hardtanh_default_12, aten_convolution_default_19], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf40_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf39, mv2_features_7_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf40_handle));
- RAIIAtenTensorHandle buf40(buf40_handle);
- buf39.reset();
- auto buf41 = std::move(buf40); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_19, aten_hardtanh_default_13], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(buf41, mv2_features_7_conv_1_1_running_mean, mv2_features_7_conv_1_1_running_var, mv2_features_7_conv_1_1_weight, mv2_features_7_conv_1_1_bias, 37632L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_19, aten_hardtanh_default_13, aten_convolution_default_20], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf42_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf41, mv2_features_7_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf42_handle));
- RAIIAtenTensorHandle buf42(buf42_handle);
- buf41.reset();
- auto buf43 = std::move(buf42); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_20], Original ATen: [aten._native_batch_norm_legit_no_training]
- call_triton_poi_fused__native_batch_norm_legit_no_training_14(buf43, mv2_features_7_conv_3_running_mean, mv2_features_7_conv_3_running_var, mv2_features_7_conv_3_weight, mv2_features_7_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten_convolution_default_21], Original ATen: [aten.convolution]
- AtenTensorHandle buf44_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf43, mv2_features_8_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf44_handle));
- RAIIAtenTensorHandle buf44(buf44_handle);
- auto buf45 = std::move(buf44); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_21, aten_hardtanh_default_14], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf45, mv2_features_8_conv_0_1_running_mean, mv2_features_8_conv_0_1_running_var, mv2_features_8_conv_0_1_weight, mv2_features_8_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_21, aten_hardtanh_default_14, aten_convolution_default_22], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf46_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf45, mv2_features_8_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf46_handle));
- RAIIAtenTensorHandle buf46(buf46_handle);
- buf45.reset();
- auto buf47 = std::move(buf46); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_22, aten_hardtanh_default_15], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf47, mv2_features_8_conv_1_1_running_mean, mv2_features_8_conv_1_1_running_var, mv2_features_8_conv_1_1_weight, mv2_features_8_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_22, aten_hardtanh_default_15, aten_convolution_default_23], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf48_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf47, mv2_features_8_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf48_handle));
- RAIIAtenTensorHandle buf48(buf48_handle);
- buf47.reset();
- auto buf49 = std::move(buf43); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_23, aten_add_tensor_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add]
- call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf49, buf48, mv2_features_8_conv_3_running_mean, mv2_features_8_conv_3_running_var, 
mv2_features_8_conv_3_weight, mv2_features_8_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf48.reset(); - // Topologically Sorted Source Nodes: [aten_convolution_default_24], Original ATen: [aten.convolution] - AtenTensorHandle buf50_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf49, mv2_features_9_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf50_handle)); - RAIIAtenTensorHandle buf50(buf50_handle); - auto buf51 = std::move(buf50); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_24, aten_hardtanh_default_16], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf51, mv2_features_9_conv_0_1_running_mean, mv2_features_9_conv_0_1_running_var, mv2_features_9_conv_0_1_weight, mv2_features_9_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_24, aten_hardtanh_default_16, aten_convolution_default_25], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf52_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf51, mv2_features_9_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf52_handle)); - RAIIAtenTensorHandle buf52(buf52_handle); - buf51.reset(); - auto buf53 = std::move(buf52); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_25, aten_hardtanh_default_17], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf53, mv2_features_9_conv_1_1_running_mean, mv2_features_9_conv_1_1_running_var, mv2_features_9_conv_1_1_weight, mv2_features_9_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_25, aten_hardtanh_default_17, aten_convolution_default_26], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf54_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf53, mv2_features_9_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf54_handle)); - RAIIAtenTensorHandle buf54(buf54_handle); - buf53.reset(); - auto buf55 = std::move(buf49); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_26, aten_add_tensor_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf55, buf54, mv2_features_9_conv_3_running_mean, mv2_features_9_conv_3_running_var, mv2_features_9_conv_3_weight, mv2_features_9_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf54.reset(); - // Topologically Sorted Source Nodes: [aten_convolution_default_27], Original ATen: [aten.convolution] - AtenTensorHandle buf56_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf55, 
mv2_features_10_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf56_handle)); - RAIIAtenTensorHandle buf56(buf56_handle); - auto buf57 = std::move(buf56); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_27, aten_hardtanh_default_18], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf57, mv2_features_10_conv_0_1_running_mean, mv2_features_10_conv_0_1_running_var, mv2_features_10_conv_0_1_weight, mv2_features_10_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_27, aten_hardtanh_default_18, aten_convolution_default_28], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf58_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf57, mv2_features_10_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf58_handle)); - RAIIAtenTensorHandle buf58(buf58_handle); - buf57.reset(); - auto buf59 = std::move(buf58); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_28, aten_hardtanh_default_19], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf59, mv2_features_10_conv_1_1_running_mean, mv2_features_10_conv_1_1_running_var, mv2_features_10_conv_1_1_weight, mv2_features_10_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_28, aten_hardtanh_default_19, aten_convolution_default_29], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf60_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf59, mv2_features_10_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf60_handle)); - RAIIAtenTensorHandle buf60(buf60_handle); - buf59.reset(); - auto buf61 = std::move(buf55); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_29, aten_add_tensor_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf61, buf60, mv2_features_10_conv_3_running_mean, mv2_features_10_conv_3_running_var, mv2_features_10_conv_3_weight, mv2_features_10_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf60.reset(); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_29, aten_add_tensor_5, aten_convolution_default_30], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] - AtenTensorHandle buf62_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf61, mv2_features_11_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf62_handle)); - RAIIAtenTensorHandle 
buf62(buf62_handle); - buf61.reset(); - auto buf63 = std::move(buf62); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_30, aten_hardtanh_default_20], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf63, mv2_features_11_conv_0_1_running_mean, mv2_features_11_conv_0_1_running_var, mv2_features_11_conv_0_1_weight, mv2_features_11_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_30, aten_hardtanh_default_20, aten_convolution_default_31], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf64_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf63, mv2_features_11_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf64_handle)); - RAIIAtenTensorHandle buf64(buf64_handle); - buf63.reset(); - auto buf65 = std::move(buf64); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_31, aten_hardtanh_default_21], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf65, mv2_features_11_conv_1_1_running_mean, mv2_features_11_conv_1_1_running_var, mv2_features_11_conv_1_1_weight, mv2_features_11_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_31, aten_hardtanh_default_21, aten_convolution_default_32], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf66_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf65, mv2_features_11_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf66_handle)); - RAIIAtenTensorHandle buf66(buf66_handle); - buf65.reset(); - auto buf67 = std::move(buf66); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_32], Original ATen: [aten._native_batch_norm_legit_no_training] - call_triton_poi_fused__native_batch_norm_legit_no_training_17(buf67, mv2_features_11_conv_3_running_mean, mv2_features_11_conv_3_running_var, mv2_features_11_conv_3_weight, mv2_features_11_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten_convolution_default_33], Original ATen: [aten.convolution] - AtenTensorHandle buf68_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf67, mv2_features_12_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf68_handle)); - RAIIAtenTensorHandle buf68(buf68_handle); - auto buf69 = std::move(buf68); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_33, aten_hardtanh_default_22], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf69, mv2_features_12_conv_0_1_running_mean, 
mv2_features_12_conv_0_1_running_var, mv2_features_12_conv_0_1_weight, mv2_features_12_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_33, aten_hardtanh_default_22, aten_convolution_default_34], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf70_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf69, mv2_features_12_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf70_handle)); - RAIIAtenTensorHandle buf70(buf70_handle); - buf69.reset(); - auto buf71 = std::move(buf70); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_34, aten_hardtanh_default_23], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf71, mv2_features_12_conv_1_1_running_mean, mv2_features_12_conv_1_1_running_var, mv2_features_12_conv_1_1_weight, mv2_features_12_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_34, aten_hardtanh_default_23, aten_convolution_default_35], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf72_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf71, mv2_features_12_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf72_handle)); - RAIIAtenTensorHandle buf72(buf72_handle); - buf71.reset(); - auto buf73 = std::move(buf67); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_35, aten_add_tensor_6], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_19(buf73, buf72, mv2_features_12_conv_3_running_mean, mv2_features_12_conv_3_running_var, mv2_features_12_conv_3_weight, mv2_features_12_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf72.reset(); - // Topologically Sorted Source Nodes: [aten_convolution_default_36], Original ATen: [aten.convolution] - AtenTensorHandle buf74_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf73, mv2_features_13_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf74_handle)); - RAIIAtenTensorHandle buf74(buf74_handle); - auto buf75 = std::move(buf74); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_36, aten_hardtanh_default_24], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf75, mv2_features_13_conv_0_1_running_mean, mv2_features_13_conv_0_1_running_var, mv2_features_13_conv_0_1_weight, mv2_features_13_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_36, aten_hardtanh_default_24, aten_convolution_default_37], Original ATen: 
[aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf76_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf75, mv2_features_13_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf76_handle)); - RAIIAtenTensorHandle buf76(buf76_handle); - buf75.reset(); - auto buf77 = std::move(buf76); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_37, aten_hardtanh_default_25], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf77, mv2_features_13_conv_1_1_running_mean, mv2_features_13_conv_1_1_running_var, mv2_features_13_conv_1_1_weight, mv2_features_13_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_37, aten_hardtanh_default_25, aten_convolution_default_38], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf78_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf77, mv2_features_13_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf78_handle)); - RAIIAtenTensorHandle buf78(buf78_handle); - buf77.reset(); - auto buf79 = std::move(buf73); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_38, aten_add_tensor_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_19(buf79, buf78, mv2_features_13_conv_3_running_mean, mv2_features_13_conv_3_running_var, mv2_features_13_conv_3_weight, mv2_features_13_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf78.reset(); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_38, aten_add_tensor_7, aten_convolution_default_39], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] - AtenTensorHandle buf80_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf79, mv2_features_14_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf80_handle)); - RAIIAtenTensorHandle buf80(buf80_handle); - buf79.reset(); - auto buf81 = std::move(buf80); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_39, aten_hardtanh_default_26], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf81, mv2_features_14_conv_0_1_running_mean, mv2_features_14_conv_0_1_running_var, mv2_features_14_conv_0_1_weight, mv2_features_14_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_39, aten_hardtanh_default_26, aten_convolution_default_40], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf82_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf81, mv2_features_14_conv_1_0_weight, 
nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf82_handle)); - RAIIAtenTensorHandle buf82(buf82_handle); - buf81.reset(); - auto buf83 = std::move(buf82); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_40, aten_hardtanh_default_27], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(buf83, mv2_features_14_conv_1_1_running_mean, mv2_features_14_conv_1_1_running_var, mv2_features_14_conv_1_1_weight, mv2_features_14_conv_1_1_bias, 28224L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_40, aten_hardtanh_default_27, aten_convolution_default_41], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf84_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf83, mv2_features_14_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf84_handle)); - RAIIAtenTensorHandle buf84(buf84_handle); - buf83.reset(); - auto buf85 = std::move(buf84); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_41], Original ATen: [aten._native_batch_norm_legit_no_training] - call_triton_poi_fused__native_batch_norm_legit_no_training_21(buf85, mv2_features_14_conv_3_running_mean, mv2_features_14_conv_3_running_var, mv2_features_14_conv_3_weight, mv2_features_14_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten_convolution_default_42], Original ATen: [aten.convolution] - AtenTensorHandle buf86_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf85, mv2_features_15_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf86_handle)); - RAIIAtenTensorHandle buf86(buf86_handle); - auto buf87 = std::move(buf86); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_42, aten_hardtanh_default_28], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf87, mv2_features_15_conv_0_1_running_mean, mv2_features_15_conv_0_1_running_var, mv2_features_15_conv_0_1_weight, mv2_features_15_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_42, aten_hardtanh_default_28, aten_convolution_default_43], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf88_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf87, mv2_features_15_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf88_handle)); - RAIIAtenTensorHandle buf88(buf88_handle); - buf87.reset(); - auto buf89 = std::move(buf88); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_43, aten_hardtanh_default_29], Original ATen: 
[aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf89, mv2_features_15_conv_1_1_running_mean, mv2_features_15_conv_1_1_running_var, mv2_features_15_conv_1_1_weight, mv2_features_15_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_43, aten_hardtanh_default_29, aten_convolution_default_44], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf90_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf89, mv2_features_15_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf90_handle)); - RAIIAtenTensorHandle buf90(buf90_handle); - buf89.reset(); - auto buf91 = std::move(buf85); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_44, aten_add_tensor_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(buf91, buf90, mv2_features_15_conv_3_running_mean, mv2_features_15_conv_3_running_var, mv2_features_15_conv_3_weight, mv2_features_15_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf90.reset(); - // Topologically Sorted Source Nodes: [aten_convolution_default_45], Original ATen: [aten.convolution] - AtenTensorHandle buf92_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf91, mv2_features_16_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf92_handle)); - RAIIAtenTensorHandle buf92(buf92_handle); - auto buf93 = std::move(buf92); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_45, aten_hardtanh_default_30], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf93, mv2_features_16_conv_0_1_running_mean, mv2_features_16_conv_0_1_running_var, mv2_features_16_conv_0_1_weight, mv2_features_16_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_45, aten_hardtanh_default_30, aten_convolution_default_46], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf94_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf93, mv2_features_16_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf94_handle)); - RAIIAtenTensorHandle buf94(buf94_handle); - buf93.reset(); - auto buf95 = std::move(buf94); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_46, aten_hardtanh_default_31], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf95, mv2_features_16_conv_1_1_running_mean, mv2_features_16_conv_1_1_running_var, mv2_features_16_conv_1_1_weight, mv2_features_16_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // 
Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_46, aten_hardtanh_default_31, aten_convolution_default_47], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf96_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf95, mv2_features_16_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf96_handle)); - RAIIAtenTensorHandle buf96(buf96_handle); - buf95.reset(); - auto buf97 = std::move(buf91); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_47, aten_add_tensor_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(buf97, buf96, mv2_features_16_conv_3_running_mean, mv2_features_16_conv_3_running_var, mv2_features_16_conv_3_weight, mv2_features_16_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf96.reset(); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_47, aten_add_tensor_9, aten_convolution_default_48], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] - AtenTensorHandle buf98_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf97, mv2_features_17_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf98_handle)); - RAIIAtenTensorHandle buf98(buf98_handle); - buf97.reset(); - auto buf99 = std::move(buf98); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_48, aten_hardtanh_default_32], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf99, mv2_features_17_conv_0_1_running_mean, mv2_features_17_conv_0_1_running_var, mv2_features_17_conv_0_1_weight, mv2_features_17_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_48, aten_hardtanh_default_32, aten_convolution_default_49], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf100_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf99, mv2_features_17_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf100_handle)); - RAIIAtenTensorHandle buf100(buf100_handle); - buf99.reset(); - auto buf101 = std::move(buf100); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf101, mv2_features_17_conv_1_1_running_mean, mv2_features_17_conv_1_1_running_var, mv2_features_17_conv_1_1_weight, mv2_features_17_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33, aten_convolution_default_50], Original ATen: [aten._native_batch_norm_legit_no_training, 
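[Editor's note: every inverted-residual block deleted above follows the same generated pattern: a convolution through the libtorch C shim, a fused batch-norm/hardtanh Triton kernel applied in place, then storage recycling via RAII handles and std::move. A minimal hand-written sketch of that pattern, with hypothetical buf_in/weight names and assuming the AOTI runtime headers that declare AtenTensorHandle, RAIIAtenTensorHandle, and aoti_torch_cuda_convolution are in scope:

    // Sketch only; mirrors the codegen pattern, not part of this patch.
    AtenTensorHandle out_handle;                    // raw handle filled by the C shim
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(
        buf_in, weight, /*bias=*/nullptr,
        std::array{1L, 1L}.cbegin(), 2,             // stride
        std::array{0L, 0L}.cbegin(), 2,             // padding
        std::array{1L, 1L}.cbegin(), 2,             // dilation
        /*transposed=*/0, std::array{0L, 0L}.cbegin(), 2,
        /*groups=*/1L, &out_handle));
    RAIIAtenTensorHandle buf_out(out_handle);       // frees the tensor on scope exit
    buf_in.reset();                                 // drop the conv input immediately
    auto buf_next = std::move(buf_out);             // reuse storage for the in-place fused kernel
]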
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33, aten_convolution_default_50], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
-    AtenTensorHandle buf102_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf101, mv2_features_17_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf102_handle));
-    RAIIAtenTensorHandle buf102(buf102_handle);
-    buf101.reset();
-    auto buf103 = std::move(buf102); // reuse
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_50], Original ATen: [aten._native_batch_norm_legit_no_training]
-    call_triton_poi_fused__native_batch_norm_legit_no_training_24(buf103, mv2_features_17_conv_3_running_mean, mv2_features_17_conv_3_running_var, mv2_features_17_conv_3_weight, mv2_features_17_conv_3_bias, 15680L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_50, aten_convolution_default_51], Original ATen: [aten._native_batch_norm_legit_no_training, aten.convolution]
-    AtenTensorHandle buf104_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf103, mv2_features_18_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf104_handle));
-    RAIIAtenTensorHandle buf104(buf104_handle);
-    buf103.reset();
-    static constexpr int64_t int_array_4[] = {1L, 1280L, 1L, 1L};
-    static constexpr int64_t int_array_5[] = {1280L, 1L, 1280L, 1280L};
-    AtenTensorHandle buf105_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf105_handle));
-    RAIIAtenTensorHandle buf105(buf105_handle);
-    static constexpr int64_t int_array_6[] = {1280L, 1L, 1L, 1L};
-    auto buf106 = wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(buf105, 4, int_array_4, int_array_6, 0L)); buf105.reset(); // reuse
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_51, aten_hardtanh_default_34, aten_mean_dim], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.mean]
-    call_triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(buf106, buf104, mv2_features_18_1_running_mean, mv2_features_18_1_running_var, mv2_features_18_1_weight, mv2_features_18_1_bias, 1280L, 49L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    buf104.reset();
-    static constexpr int64_t int_array_7[] = {1280L, 1000L};
-    static constexpr int64_t int_array_8[] = {1L, 1280L};
-    AtenTensorHandle buf107_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_7, int_array_8, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf107_handle));
-    RAIIAtenTensorHandle buf107(buf107_handle);
-    // Topologically Sorted Source Nodes: [aten_permute_copy_default], Original ATen: [aten.permute_copy]
-    call_triton_poi_fused_permute_copy_26(mv2_classifier_1_weight, buf107, 1280000L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    static constexpr int64_t int_array_9[] = {1L, 1000L};
-    static constexpr int64_t int_array_10[] = {1000L, 1L};
-    AtenTensorHandle buf108_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_9, int_array_10, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf108_handle));
-    RAIIAtenTensorHandle buf108(buf108_handle);
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_51, aten_hardtanh_default_34, aten_mean_dim, aten_view_copy_default, aten_permute_copy_default, aten_addmm_default], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.mean, aten.view_copy, aten.permute_copy, aten.addmm]
-    static constexpr int64_t int_array_11[] = {0L, 1L};
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_addmm_out(buf108, mv2_classifier_1_bias, wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(buf106, 2, int_array_8, int_array_11, 0L)), buf107, 1L, 1L));
-    buf106.reset();
-    buf107.reset();
-    output_handles[0] = buf108.release();
-} // AOTInductorModel::run_impl
-} // namespace torch::aot_inductor
-
-
-
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7/clbguuj2vb7nlf7qm72hrkynyiorwc3udkaj656f3v5xcdaoib67.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
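[Editor's note: a minimal sketch of how a host program could drive the aoti.so produced by the link command above, using the container C API that the deleted wrapper defines. Assumes a single-input/single-output model, that `input` is an AtenTensorHandle created through the same aoti_torch_* shim, and that the AOTI runtime interface header providing these declarations is included:

    // Sketch only, not part of this patch.
    AOTInductorModelContainerHandle container = nullptr;
    if (AOTInductorModelContainerCreateWithDevice(
            &container, /*num_models=*/1, "cuda", /*cubin_dir=*/nullptr) !=
        AOTI_RUNTIME_SUCCESS) {
      return; // creation failed; error already printed to stderr
    }
    AtenTensorHandle inputs[1] = {input};    // input handles are stolen by the call
    AtenTensorHandle outputs[1] = {nullptr}; // filled with a handle the caller owns
    AOTInductorModelContainerRun(
        container, inputs, 1, outputs, 1,
        /*stream_handle=*/nullptr, /*proxy_executor_handle=*/nullptr);
    AOTInductorModelContainerDelete(container);
]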
diff --git a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin b/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin
deleted file mode 100644
index 5098c505ebb138fa361a0971a8c7d89086e12f3b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11320
[11320 bytes of base85-encoded cubin payload omitted: binary build artifact deleted by this patch]

diff --git a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
deleted file mode 100644
index 90c865f5f5e..00000000000
--- a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
+++ /dev/null
@@ -1,965 +0,0 @@
-
-#include
-// Definition of AOTI runtime interface functions
-
-#include
-#include
-
-#include
-#include
-
-#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)      \
-  try {                                           \
-    __VA_ARGS__                                   \
-  } catch (const std::exception& e) {             \
-    std::cerr << "Error: " << e.what() << '\n';   \
-    return AOTI_RUNTIME_FAILURE;                  \
-  } catch (...) {                                 \
-    std::cerr << "Unknown exception occurred.\n"; \
-    return AOTI_RUNTIME_FAILURE;                  \
-  }                                               \
-  return AOTI_RUNTIME_SUCCESS;
-
-#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name)  \
-  do {                                                            \
-    AOTI_RUNTIME_CHECK(                                           \
-        actual_size == expected_size,                             \
-        "expected " + std::string(name) + " vector size to be " + \
-            std::to_string(expected_size) + ", but got " +        \
-            std::to_string(actual_size));                         \
-  } while (0)
-
-// AOTInductor uses at::addmm_out, which doesn't supports
-// arguments that requires gradient. For this reason, we
-// enforce no_grad context for run APIs.
-//
-// A RAII, thread local (!) guard that enables or disables grad mode upon
-// construction, and sets it back to the original value upon destruction.
-struct AOTINoGradGuard {
-  AOTINoGradGuard() {
-    aoti_torch_grad_mode_set_enabled(false);
-  }
-  AOTINoGradGuard(const AOTINoGradGuard&) = delete;
-  AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete;
-  ~AOTINoGradGuard() {
-    aoti_torch_grad_mode_set_enabled(prev_mode);
-  }
-  AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete;
-  AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete;
-  bool prev_mode{aoti_torch_grad_mode_is_enabled()};
-};
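[Editor's note: the guard plus the CONVERT_EXCEPTION_TO_ERROR_CODE macro are what turn C++ exceptions into the flat AOTIRuntimeError codes used across this C ABI. A trimmed illustration of how the two compose, with a hypothetical do_work():

    // Sketch only, not part of this patch.
    AOTIRuntimeError example_entry_point() {
      CONVERT_EXCEPTION_TO_ERROR_CODE({
        AOTINoGradGuard guard; // autograd disabled for the duration of the call
        do_work();             // any throw here becomes AOTI_RUNTIME_FAILURE
      })
    }
]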
"cpu" : "cuda", - cubin_dir); -} - -AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir) { - if (num_models == 0) { - std::cerr << "Error: num_models must be positive, but got 0\n"; - return AOTI_RUNTIME_FAILURE; - } - CONVERT_EXCEPTION_TO_ERROR_CODE({ - std::optional cubin_dir_opt; - if (cubin_dir != nullptr) { - cubin_dir_opt.emplace(cubin_dir); - } - auto* container = new torch::aot_inductor::AOTInductorModelContainer( - num_models, std::string(device_str), cubin_dir_opt); - *container_handle = - reinterpret_cast(container); - }) -} - -AOTIRuntimeError AOTInductorModelContainerDelete( - AOTInductorModelContainerHandle container_handle) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto* container = - reinterpret_cast( - container_handle); - delete container; - }); -} - -AOTIRuntimeError AOTInductorModelContainerRun( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_single_threaded( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumConstants( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *num_constants = container->num_constants(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantName( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** name) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *name = container->constant_name(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** 
    original_fqn) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *original_fqn = container->constant_original_fqn(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded(
-    AOTInductorModelContainerHandle container_handle,
-    size_t idx,
-    bool* from_folded) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetConstantType(
-    AOTInductorModelContainerHandle container_handle,
-    size_t idx,
-    int32_t* type) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetConstantDtype(
-    AOTInductorModelContainerHandle container_handle,
-    size_t idx,
-    int32_t* dtype) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *dtype = container->constant_dtype(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize(
-    AOTInductorModelContainerHandle container_handle,
-    size_t idx,
-    size_t* data_size) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *data_size = container->constant_data_size(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle,
-    bool use_inactive) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto constants_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { const auto ret = container->extract_constants_map(use_inactive);
-        for (const auto& pair: ret) {
-          constants_map->emplace(pair.first, pair.second);
-        }
-      })
-}
-
-AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle,
-    bool use_inactive,
-    bool validate_full_update) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->update_constant_buffer(
-        *input_map, use_inactive, validate_full_update, /* user_managed = */ true);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle,
-    bool use_inactive,
-    bool validate_full_update) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->update_constant_buffer(
-        *input_map, use_inactive, validate_full_update);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle) {
-  return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
-                                                       constant_map_handle,
-                                                       /*use_inactive*/ true,
-                                                       /*validate_full_update*/ true);
-}
-
-AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
-    AOTInductorModelContainerHandle container_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->free_inactive_constant_buffer();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
-    AOTInductorModelContainerHandle container_handle,
-    bool use_inactive,
-    AOTInductorStreamHandle stream_handle,
-    AOTIProxyExecutorHandle proxy_executor_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto stream =
-      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    AOTINoGradGuard guard;
-    container->run_const_fold(use_inactive, stream, proxy_executor_handle);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
-    AOTInductorModelContainerHandle container_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->swap_constant_buffer();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
-    AOTInductorModelContainerHandle container_handle,
-    size_t* ret_num_inputs) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_num_inputs = container->num_inputs(); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetInputName(
-    AOTInductorModelContainerHandle container_handle,
-    size_t input_idx,
-    const char** ret_input_names) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_input_names = container->input_name(input_idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
-    AOTInductorModelContainerHandle container_handle,
-    size_t* ret_num_outputs) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_num_outputs = container->num_outputs(); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetOutputName(
-    AOTInductorModelContainerHandle container_handle,
-    size_t output_idx,
-    const char** ret_output_names) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_output_names = container->output_name(output_idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
-    AOTInductorModelContainerHandle container_handle,
-    const char** in_spec,
-    const char** out_spec) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    *in_spec = container->get_in_spec();
-    *out_spec = container->get_out_spec();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelCreate(
-    AOTInductorModelHandle* model_handle,
-    AOTInductorConstantMapHandle constant_map_handle){
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
-    auto constant_array = std::make_shared<std::vector<torch::aot_inductor::ConstantHandle>>();
-    auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-
-    auto model = new torch::aot_inductor::AOTInductorModel(
-        constant_map,
-        constant_array,
-        "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only used for CPU models
-        ""
-    );
-
-    if (input_map) {
-      for (auto const& kv : *input_map) {
-        constant_map->emplace(kv.first, kv.second);
-      }
-    } else {
-      model->load_constants();
-    }
-
-    *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
-  })}
-
-AOTIRuntimeError AOTInductorModelRun(
-    AOTInductorModelHandle model_handle,
-    AtenTensorHandle* input_handles,
-    AtenTensorHandle* output_handles) {
-  auto model =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    AOTINoGradGuard guard;
-    model->run_impl(
-        input_handles,
-        output_handles,
-        (torch::aot_inductor::DeviceStreamType) nullptr,
-        nullptr);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto model =
-        reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
-            model_handle);
-    delete model;
-  })}
-
-AOTIRuntimeError AOTInductorModelGetNumOutputs(
-    AOTInductorModelHandle model_handle,
-    size_t* ret_num_outputs) {
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-    *ret_num_outputs = model->num_outputs();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
-    AOTInductorModelHandle model_handle,
-    AOTInductorConstantMapHandle constant_map_handle) {
-  auto model =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
-    auto input_map =
-        reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
-            constant_map_handle);
-
-    for (auto const& kv : *input_map) {
-      constant_map->emplace(kv.first, kv.second);
-    }
-    model->update_constants_map(std::move(constant_map));
-  })
-}
-
-} // extern "C"
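A minimal, self-contained sketch (not the AOTI runtime's actual definition) of the CONVERT_EXCEPTION_TO_ERROR_CODE pattern the extern "C" entry points above rely on: every C++ exception is caught at the C ABI boundary and mapped to an error code, since exceptions must never propagate across extern "C". The enum values and entry-point name here are illustrative stand-ins.

#include <exception>
#include <iostream>

enum AOTIRuntimeError {
  AOTI_RUNTIME_SUCCESS = 0,  // stand-in value
  AOTI_RUNTIME_FAILURE = 1,  // stand-in value
};

// Wrap a statement block: run it, return SUCCESS, or translate any throw into FAILURE.
#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)        \
  try {                                             \
    __VA_ARGS__                                     \
    return AOTI_RUNTIME_SUCCESS;                    \
  } catch (const std::exception& e) {               \
    std::cerr << "Error: " << e.what() << '\n';     \
    return AOTI_RUNTIME_FAILURE;                    \
  } catch (...) {                                   \
    std::cerr << "Unknown exception occurred.\n";   \
    return AOTI_RUNTIME_FAILURE;                    \
  }

// Hypothetical entry point showing the usage shape of the real functions above.
extern "C" AOTIRuntimeError example_entry_point(int* out) {
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *out = 42; })
}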
-
-
-#define CUDA_DRIVER_CHECK(EXPR)                      \
-do {                                                 \
-    CUresult code = EXPR;                            \
-    const char *msg;                                 \
-    CUresult code_get_error = cuGetErrorString(code, &msg); \
-    if (code_get_error != CUDA_SUCCESS) {            \
-        throw std::runtime_error(                    \
-            std::string("CUDA driver error: ") +     \
-            std::string("invalid error code!"));     \
-    }                                                \
-    if (code != CUDA_SUCCESS) {                      \
-        throw std::runtime_error(                    \
-            std::string("CUDA driver error: ") +     \
-            std::string(msg));                       \
-    }                                                \
-} while (0);
-
-static inline CUfunction loadKernel(
-    std::string filePath,
-    const std::string &funcName,
-    uint32_t sharedMemBytes,
-    const std::optional<std::string> &cubinDir = std::nullopt) {
-  if (cubinDir) {
-    std::filesystem::path p1{*cubinDir};
-    std::filesystem::path p2{filePath};
-    filePath = (p1 / p2.filename()).string();
-  }
-
-  CUmodule mod;
-  CUfunction func;
-  CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
-  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
-  if (sharedMemBytes > 0) {
-    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        sharedMemBytes
-    ))
-  }
-  return func;
-}
-
-static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
-  CUmodule mod;
-  CUfunction func;
-  CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start));
-  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
-  if (sharedMemBytes > 0) {
-    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        sharedMemBytes
-    ))
-  }
-  return func;
-}
-
-static inline void launchKernel(
-    CUfunction func,
-    uint32_t gridX,
-    uint32_t gridY,
-    uint32_t gridZ,
-    uint32_t numWarps,
-    uint32_t sharedMemBytes,
-    void* args[],
-    cudaStream_t stream) {
-  CUDA_DRIVER_CHECK(cuLaunchKernel(
-      func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
-  ));
-}
-CACHE_TORCH_DTYPE(float32);
-CACHE_TORCH_DEVICE(cuda);
-CACHE_TORCH_LAYOUT(strided);
-namespace torch::aot_inductor {
-namespace {
-class AOTInductorModelKernels : public AOTInductorModelKernelsBase {
-  public:
-    CUfunction triton_poi_fused_convolution_0{nullptr};
-    CUfunction triton_poi_fused_convolution_1{nullptr};
-    CUfunction triton_poi_fused_convolution_2{nullptr};
-};
-}  // namespace
-
-
-
-AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
-                                   std::shared_ptr<std::vector<ConstantHandle>> constants_array,
-                                   const std::string& device_str,
-                                   std::optional<std::string> cubin_dir)
-    : AOTInductorModelBase(1,
-                           1,
-                           1,
-                           device_str,
-                           std::move(cubin_dir),
-                           true) {
-  inputs_info_[0].name = "arg2_1";
-  constants_info_[0].name = "conv_weight";
-  constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-  constants_info_[0].offset = 0;
-  constants_info_[0].data_size = 540;
-  constants_info_[0].from_folded = false;
-  constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-  constants_info_[0].shape = {5, 3, 3, 3};
-  constants_info_[0].stride = {27, 9, 3, 1};
-  constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided);
-  constants_info_[0].original_fqn = "conv.weight";
-  update_constants_map(std::move(constants_map));
-  update_constants_array(std::move(constants_array));
-  in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
-  out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])";
-  outputs_info_[0].name = "output0";
-  this->kernels_ = std::make_unique<AOTInductorModelKernels>();
-}
-
-std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
-    DeviceStreamType stream,
-    AOTIProxyExecutorHandle proxy_executor,
-    bool initialization
-) {
-
-  if (!initialization) {
-    std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
-              << "aot_inductor.use_runtime_constant_folding=False\n";
-  }
-  return {};
-}
-} // namespace torch::aot_inductor
-using namespace torch::aot_inductor;
-
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_0(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_0', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 12
-        xnumel = 64
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x2 = xindex
-        y3 = yindex
-        y0 = (yindex % 3)
-        y1 = yindex // 3
-        tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last')
-        tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (64 - 1)) / (64));
-    uint32_t grid_1 = ((ynumel + (16 - 1)) / (16));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_0 == nullptr) {
-        kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_);
-    }
-    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_2 = ynumel;
-    int var_3 = xnumel;
-    CUdeviceptr global_scratch_4 = 0;
-    void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4};
-    launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_);
-}
-
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_1(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_1', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 15
-        xnumel = 9
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x2 = xindex
-        y3 = yindex
-        y0 = (yindex % 3)
-        y1 = yindex // 3
-        tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last')
-        tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (16 - 1)) / (16));
-    uint32_t grid_1 = ((ynumel + (16 - 1)) / (16));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_1 == nullptr) {
-        kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_);
-    }
-    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_6 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_7 = ynumel;
-    int var_8 = xnumel;
-    CUdeviceptr global_scratch_9 = 0;
-    void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9};
-    launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_);
-}
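The launchers above all size their grids with the same ceil-division idiom, grid = (numel + BLOCK - 1) / BLOCK, so every element is covered even when numel is not a multiple of the block size. A small standalone check (not from the patch) of the values the two launchers above would produce:

#include <cstdint>

// Ceil-division used by the generated launchers: number of blocks needed to cover numel.
static constexpr uint32_t ceil_div(uint32_t numel, uint32_t block) {
  return (numel + block - 1) / block;
}

int main() {
  // call_triton_poi_fused_convolution_0: xnumel=64 with XBLOCK=64, ynumel=12 with YBLOCK=16.
  static_assert(ceil_div(64, 64) == 1, "grid_0 for kernel 0");
  static_assert(ceil_div(12, 16) == 1, "grid_1 for kernel 0");
  // call_triton_poi_fused_convolution_1: xnumel=9 with XBLOCK=16, ynumel=15 with YBLOCK=16.
  static_assert(ceil_div(9, 16) == 1, "grid_0 for kernel 1");
  static_assert(ceil_div(15, 16) == 1, "grid_1 for kernel 1");
  return 0;
}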
-
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_2(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_2', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 20
-        xnumel = 64
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x2 = xindex
-        y0 = (yindex % 5)
-        y1 = yindex // 5
-        y3 = yindex
-        tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last')
-        tmp1 = y0
-        tmp2 = tl.full([1, 1], 2, tl.int64)
-        tmp3 = tmp1 < tmp2
-        tmp4 = tl.full([1, 1], 1, tl.int64)
-        tmp5 = tmp1 < tmp4
-        tmp6 = -0.0312186349183321
-        tmp7 = -0.18273277580738068
-        tmp8 = tl.where(tmp5, tmp6, tmp7)
-        tmp9 = tl.full([1, 1], 3, tl.int64)
-        tmp10 = tmp1 < tmp9
-        tmp11 = tl.full([1, 1], 4, tl.int64)
-        tmp12 = tmp1 < tmp11
-        tmp13 = -0.12337345629930496
-        tmp14 = 0.12138354778289795
-        tmp15 = tl.where(tmp12, tmp13, tmp14)
-        tmp16 = 0.05455135554075241
-        tmp17 = tl.where(tmp10, tmp16, tmp15)
-        tmp18 = tl.where(tmp3, tmp8, tmp17)
-        tmp19 = tmp0 + tmp18
-        tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
-    uint32_t grid_1 = ((ynumel + (32 - 1)) / (32));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_2 == nullptr) {
-        kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_);
-    }
-    CUdeviceptr var_10 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_11 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_12 = ynumel;
-    int var_13 = xnumel;
-    CUdeviceptr global_scratch_14 = 0;
-    void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14};
-    launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_);
-}
-
-namespace torch::aot_inductor {
-
-void AOTInductorModel::_const_run_impl(
-    std::vector<AtenTensorHandle>& output_handles,
-    DeviceStreamType stream,
-    AOTIProxyExecutorHandle proxy_executor
-) {}
-
-AOTI_NOINLINE static void check_input_0(
-    AtenTensorHandle* input_handles
-) {
-    ConstantHandle arg2_1 = ConstantHandle(input_handles[0]);
-    int32_t arg2_1_dtype;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype));
-
-    int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32();
-    if (arg2_1_expected_dtype != arg2_1_dtype) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dtype, "
-           << "expected: " << arg2_1_expected_dtype << "(at::kFloat), "
-           << "but got: " << arg2_1_dtype << "\n";
-        throw std::runtime_error(ss.str());
-    }
-    auto arg2_1_size = arg2_1.sizes();
-
-    if (4 != arg2_1_size[0]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dim value at 0, "
-           << "expected: 4, " << "but got: " << arg2_1_size[0]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (3 != arg2_1_size[1]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dim value at 1, "
-           << "expected: 3, " << "but got: " << arg2_1_size[1]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (8 != arg2_1_size[2]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dim value at 2, "
-           << "expected: 8, " << "but got: " << arg2_1_size[2]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (8 != arg2_1_size[3]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dim value at 3, "
-           << "expected: 8, " << "but got: " << arg2_1_size[3]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-    auto arg2_1_stride = arg2_1.strides();
-
-    if (192 != arg2_1_stride[0]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched stride value at 0, "
-           << "expected: 192, " << "but got: " << arg2_1_stride[0]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (64 != arg2_1_stride[1]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched stride value at 1, "
-           << "expected: 64, " << "but got: " << arg2_1_stride[1]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (8 != arg2_1_stride[2]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched stride value at 2, "
-           << "expected: 8, " << "but got: " << arg2_1_stride[2]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (1 != arg2_1_stride[3]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched stride value at 3, "
-           << "expected: 1, " << "but got: " << arg2_1_stride[3]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-    int32_t arg2_1_device_type;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type));
-
-    int32_t arg2_1_expected_device_type = 1;
-    if (arg2_1_expected_device_type != arg2_1_device_type) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched device type, "
-           << "expected: " << arg2_1_expected_device_type << "(cuda), "
-           << "but got: " << arg2_1_device_type << "\n";
-        throw std::runtime_error(ss.str());
-    }
-}
-
-static bool _check_aoti_runtime_check_inputs_env() {
-    const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS");
-    const static bool result = env_var_value != nullptr && env_var_value[0] != '0';
-    return result;
-}
-
-AOTI_NOINLINE static void __check_inputs_outputs(
-    AtenTensorHandle* input_handles,
-    AtenTensorHandle* output_handles) {
-    if (!_check_aoti_runtime_check_inputs_env()){
-        return;
-    }
-    check_input_0(input_handles);
-}
-
-void AOTInductorModel::run_impl(
-    AtenTensorHandle*
-        input_handles, // array of input AtenTensorHandle; handles
-                       // are stolen; the array itself is borrowed
-    AtenTensorHandle*
-        output_handles, // array for writing output AtenTensorHandle; handles
-                        // will be stolen by the caller; the array itself is
-                        // borrowed
-    DeviceStreamType stream,
-    AOTIProxyExecutorHandle proxy_executor
-) {
-    __check_inputs_outputs(input_handles, output_handles);
-
-    auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1);
-    auto arg2_1 = std::move(inputs[0]);
-    [[maybe_unused]] auto& conv_weight = constants_->at(0);
-
-    if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) {
-        AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit.");
-        AtenTensorHandle arg2_1_aligned;
-        aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned);
-        arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned));
-    }
-    inputs.clear();
-    [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
-
-    AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
-    static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L};
-    static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L};
-    AtenTensorHandle buf0_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
-    RAIIAtenTensorHandle buf0(buf0_handle);
-    // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
-    call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    arg2_1.reset();
-    static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L};
-    static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L};
-    AtenTensorHandle buf1_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle));
-    RAIIAtenTensorHandle buf1(buf1_handle);
-    // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
-    call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
-    AtenTensorHandle buf2_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf2_handle));
-    RAIIAtenTensorHandle buf2(buf2_handle);
-    buf0.reset();
-    buf1.reset();
-    static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L};
-    static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L};
-    AtenTensorHandle buf3_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle));
-    RAIIAtenTensorHandle buf3(buf3_handle);
-    // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
-    call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    buf2.reset();
-    output_handles[0] = buf3.release();
-} // AOTInductorModel::run_impl
-} // namespace torch::aot_inductor
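run_impl above guards a compile-time alignment assumption at run time: input 0 was compiled as 16-byte aligned, so a misaligned tensor is cloned into an aligned buffer rather than risking a misaligned vectorized access. A minimal sketch of the same check, assuming nothing beyond the standard library (uintptr_t is the portable spelling of the pointer-to-integer cast used above):

#include <cstdint>

// True when ptr is aligned to `alignment` bytes; alignment must be a power of two.
static inline bool is_aligned(const void* ptr, std::uintptr_t alignment) {
  return (reinterpret_cast<std::uintptr_t>(ptr) & (alignment - 1)) == 0;
}

// Usage mirroring the wrapper: if (!is_aligned(data_ptr, 16)) { /* clone to an aligned tensor */ }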
-
-
-
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf/c2axxg3k6hizo5jukgeoinhgbqdavmur6jy4bqwkwu6iqb3x3hb2.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin b/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
deleted file mode 100644
index 000ca4c1209b77cdaec3c8757e532677b79ccc0f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8968
[base85-encoded cubin payload omitted]
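For reference, a hedged sketch of how a standalone program might consume the linked aoti.so through the model-level C entry points shown above (AOTInductorModelCreate / AOTInductorModelDelete). Handle types are opaque pointers here; real callers include the AOTI runtime headers, and a CUDA model like this one is normally driven through the container API instead, so treat this purely as an illustration of the symbol-loading mechanics. Error handling is minimal.

#include <dlfcn.h>
#include <cstdio>

using AOTIRuntimeError = int;        // stand-in for the real error enum
using AOTInductorModelHandle = void*;
using AOTInductorConstantMapHandle = void*;

int main() {
  void* lib = dlopen("./aoti.so", RTLD_NOW | RTLD_LOCAL);
  if (!lib) { std::fprintf(stderr, "dlopen: %s\n", dlerror()); return 1; }

  auto create = reinterpret_cast<AOTIRuntimeError (*)(
      AOTInductorModelHandle*, AOTInductorConstantMapHandle)>(
      dlsym(lib, "AOTInductorModelCreate"));
  auto destroy = reinterpret_cast<AOTIRuntimeError (*)(AOTInductorModelHandle)>(
      dlsym(lib, "AOTInductorModelDelete"));
  if (!create || !destroy) { std::fprintf(stderr, "missing symbol\n"); return 1; }

  AOTInductorModelHandle model = nullptr;
  // A null constant map makes the model load its own constants (see
  // AOTInductorModelCreate above); note that entry point hardcodes device "cpu".
  if (create(&model, nullptr) != 0) { std::fprintf(stderr, "create failed\n"); return 1; }
  destroy(model);
  dlclose(lib);
  return 0;
}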
diff --git a/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin b/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin
deleted file mode 100644
index 88b88a29bf7f3c8af0026294261be1288801c901..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11320
[base85-encoded cubin payload omitted]

diff --git a/ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin b/ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin
deleted file mode 100644
index cd3b21f44c86f0181124fbc89ca5c63a37d6c9ea..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13832
[base85-encoded cubin payload omitted]

diff --git a/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin b/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin
deleted file mode 100644
index e8cdf9d03a89109e7d9e93424697209d3422b085..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13240
[base85-encoded cubin payload omitted]
diff --git a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
deleted file mode 100644
index f283030cd98..00000000000
--- a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// Triton kernels are embedded as comments in /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf/c2axxg3k6hizo5jukgeoinhgbqdavmur6jy4bqwkwu6iqb3x3hb2.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin b/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin
deleted file mode 100644
index d2228db77f98247a3ac53ee64ff6864a708d7d4f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 9528
[base85-encoded cubin payload omitted]
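The kernel.cpp stub deleted above is intentionally empty of device code: the Triton kernels live in the .cubin files, which the wrapper loads by path at run time via cuModuleLoad. The driver API equally supports embedding the image in the host binary and loading it from memory, which is what the second loadKernel overload earlier in this diff does. A minimal sketch of that variant, assuming a CUDA context already exists and that `image` points at a valid cubin/fatbin blob (both names are placeholders):

#include <cuda.h>
#include <stdexcept>
#include <string>

static CUfunction loadKernelFromMemory(const void* image, const std::string& name) {
  CUmodule mod;
  CUfunction func;
  // cuModuleLoadData reads the module image from memory instead of a file path.
  if (cuModuleLoadData(&mod, image) != CUDA_SUCCESS)
    throw std::runtime_error("cuModuleLoadData failed");
  if (cuModuleGetFunction(&func, mod, name.c_str()) != CUDA_SUCCESS)
    throw std::runtime_error("cuModuleGetFunction failed: " + name);
  return func;
}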
diff --git a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
deleted file mode 100644
index bbe94294805..00000000000
--- a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// Triton kernels are embedded as comments in /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms/clxvzwn2a5v7ypw7eq6fysn2555bpqqp3ckvq4a6v5o6aba2rxov.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin b/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin
deleted file mode 100644
index 9b7c06c6f791df59d0650fd339ed10f850f64651..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 9528
[base85-encoded cubin payload omitted]
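Both metadata files deleted here record only the device key; whether the generated run_impl validates its inputs is decided at run time instead, through the AOTI_RUNTIME_CHECK_INPUTS environment variable read by _check_aoti_runtime_check_inputs_env earlier in this diff. The idiom there is worth noting: the getenv result and the derived flag are cached in function-local statics, so after the first call the per-invocation cost is a single branch. A standalone sketch of the same pattern (variable name kept from the source; everything else illustrative):

#include <cstdlib>

static bool runtime_input_checks_enabled() {
  // Evaluated once on first call; later calls just read the cached bool.
  static const char* env = std::getenv("AOTI_RUNTIME_CHECK_INPUTS");
  static const bool enabled = env != nullptr && env[0] != '0';
  return enabled;
}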
iIjenAb5f3DRQu5-Mk>u}PRen7Dv#ZADz9o*a{mB2)l@wI diff --git a/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin b/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin deleted file mode 100644 index 6e21efafc59f39c347cd5fc3fb16d696fc03742f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 21056 zcmeHPZEPIJd7isHl6NFUQnD?Pl58z)u~87^cza(YV>ec!*hw2Za^e&X;-u#zc_hyv zkLSG;a}+I`R*b+dir8&|A}NX>4T`oY`XdGUBQ1hPMV-Jk+BE2o!f5{ls*oD7B#@S3 zyNw(6dES}b+ubAW#-W9x@Y>wnnfG&^dFP#(-CI3&*FAT~W3gP1(nnnVjaoPNylmX3 z3-yoc>!J_&%ce+uTEtP8JW_6`fnl3kI7D}~ccd4{knXfi+TWY%HYOPwEFD;ba z*=D)6P+F>#Yo%<#*Seef8n9BCeH%2ol}5!~f$EI-J-@g-U0wtz)uxnZmTL_(u9+v& zX}7YtyGTJ{d3QC=R;EwSm*(8^Qbnb8VX<1nJ$;Xq%b|+eN~ujRHOh^K7TTe{jNs`M z@ayW$R;k$nh_s6^^MFj(t20tbdB3&|Z^(avCsae3^5WvWTdpri2i4u>=5lkjjur;= zWp*o4^Zdd!;K;L$OXk+@b$Uitoe@4C07`AeBn zd9hkXU?Wz7L%npgghUw%Aqh!ck8^hW5O^meRN9$hS2Kd2Crpkq8SUrF2)j6+dFkuA zBo+pnrS^7&+$g5@(Lim&+~0?K`)fA_DEjtc)LUv_NB!fdj|J-c49AngKh(kh6~piG zg**c-dVD5MyPkJ#YE|mH+7;&R_oTkNqyFEd-dhE=$+uh7-`lP*cfXGMeM1571~uWU z!hcf-{}zs;rk3|S;P)fE4)#C3zJ5$4w;o?VmXuqiSbw}ji>WZS5|6-`mQ5rL$wgIM zog|8}pAPim93yJ6@#a!}yt(YoEL3Z=r)O|inlCq&munAI%j3BjHNLRCR2gqqak`u> zk5_6djq&nwvpT+5ogS~RHW!v_xwMlRZ&aI=_twiZpDWK-8d~vq$x}V{KCwc&(({k)Adyq`}U81_q*NSJ7fmwLS+%_&IT#*pMB?Fp6&kYzl?F^HH+0H zE3M0{{PLqpr4SYRNW`qYv7cD|@uAq24eK;6RxeMn_zzUL{S+Jn|RGinC_4K0xqODMJ+- zhRToVx=CI7)c{98g4D}apXTw@7!jjj3dBpo0_Q~Yhk1!xZCN2)(RwVRwsFa_aPiQ9 z!F%bcW&L9;z8;HNw@53egwFZUJnB#p2q9&S#H>Bk zeJAi+Dq32GwTM@23h?e z?`)nP;)#+kfr8ufisxs<4UlJfMS-h9i6|2hI3Qt9xHmo_EFInOxZ>0fOFI(TB9j{T0 zI&=T>2H<)1)mKGv@VY6DV-m1_pDnz4#hPJ3G}9<#m_e>yy}EgIb8}Ot6Y_6vUfX=t z`h!Qwy}{s>Bm+AyU(@)zsM|0M|G{ka$|Wr=9nCI!Ild7?6f^6kSCA#E%Qbs4J%Md| z*`2R6?Z#4RDpOYJ^6V^Dy^T_%dZuC*xq?k&Ute)+HZ~aCK-+1of9GqqQ?%35leS(0 zml{}6Kk(Vxm01$|Ov0m=zz;aiXYF6M*QN@`ZT=M}j@y&GPU6qhO=M7)2R(E44Rvuj z{eUy^rurtSFAM!s+8*sr=J4m}`pG=(0FY{-(RWOx>t$@B?eytl z-cHw>uHAvo?pP{`ol@rb^|t6-cZ=EUZZUV=E#|Mg#X`VhF1t(dZ=$mJHtL=T=)OK< zY*M|OkDeaJI(XL37K&<(#d!>GrpDA*{f@lzexxp`n+BGtG4~FqSZLS5Pt?ucku9Q5 z9lfW72c@-{(lj25)-vy7eqO|Cm5C!gZFrsHar=SYO%+Jhoo0}bch`&qGi)I?DQw!( z@=E%&#w3p>jmp{$J)A}r?v)y$o54W&G^1v^^0L;UQ?BsU#F>UUy{Q_ZGc76LWFyY9wRlby^++_+vu6)??fA8wukLljjOGpKbVW9zIpyvR# zZ%RjK-O*H>^HZQMm1on7%f=74fOUixr6Y3h0La-{pMmi$Z28Pok*+SCUQB&Q3&*X`SgN}CF>5fK%Yz9@i@&h>H=sk&Wtc+1gf#vFX0i++5L zxf*EAL_o$|?QO6|I}xnWkln6Mz>JOjVxEt#h#s|oP{IV?Hv)-TAL#ZoUHFlHsDJ~ujw{IBm%wE&pI5|?rauB z(-3qjDu{3k7|*zsariq0(-WJ`Hec$^kE7h51X|ddBO6J#6PCD8% zhzn5_00l;)RcK$NipfK&BE55u6EaOtX38@Ql@cB)HPgYqlZezUbOgU_#>XtLF_YG# z#}IfN3?`y=C~KKcY&`FOK5QV>flU_O5^Xg?AzFpdi&mlcej#jV8Dm7^E-Rs(&HLC) z1h7F`4P&EYaF)HZUEnY{#j~Fe9g20r<3(XRbh5}HQ7k$GEbDdATMeBoECU`%idl(< z?4%EmlL_GCATvb(5;$iy!ky8>JpEPxf>6sRbG_6Wq@Qn(b$+v z{W!Ptk&_DoB&mg?Yfdkv3K~y3HJ6*nPMj=!;G~{VJO=chj?JAI^kC+&*j__X{K^vP z0E22g>^9ToI^LhQ(sRu-o6K-X*(tiL5J7U8bwyCAz$+^GK+$we^Br`-(a!J;IJbC6oF1$BSjV$ zCQv2>h6#-=z~_?1BME|9Ymw`rxutU2t@&gDX+n>0J=Vb#;gN=F8zvus$7~NAY!7s& zRE*9#cRge7grF=C3(8R@Z5ZdwT##w8?z@JjwrQf0GIfCkZwx#>F5@D6bQo^%Tw^_An@alTvn-E9ZOq#1!_CT!bDL^kLxw;iCLArD&)c_XuK_lqC zPt44DHt2p9ffwSFpwUYB6buc7st|{SPPj>;Nn8p5CUIiirA*zyh&KJ16Q}~r5)^#2 z7s9xXun*amKIap=FdK(+AciNU*QF)28Kg*mm~|cJ+CU6X3UZUeF@eSeNK-NtwOW!Q zp*tnuY9>92<-imU?l7skng@1!5zO!?3#vR+hqPPUn(Z?z+BCf$@^_)6bW7d09! 
zLArD*)qu{eOUV6gY=E(zWpWD)gDGu9E1^t1i+msA&}KqS5wKIV|mLvIGFVKuPZ7zfl25%su zbGobO3q56~igK9#l9+_TNYpzS1B$fUv&3fPBE#D;DUS%z7Uk;fykJN>W~Zyc38{`i|Bj;@La|T>b<0SVi0oGcq0niQI7{7$s!0FdQQ32o!Wl z3q`qpZsM5L1KJfF0%r57bx(j84Q@SEJDQK%9xlW?hJc=_9c`-3Wx#RxtTU3xb9(H_ z9z3*A38nj-9)NVB4NjyFWsmEtklq%j>$ymxb2!7a)d;6JUeC1?nbTf)wFiNLSH!(2 zF|mtBab{7I(~+j#X>)akxQ>23-N7fLC^$@Xkn0Ve!v#w?YW02^1UARBB=H>`^?JGz zPDHK9)pL;gZ`)IH^vilU#c_=;-xeP{3dk%5?NpS{-CGpvWxb zty{^6#(Po3B~)OW=VS$@N$H$P^mUNt)^7@2NALE92ZJ}y0n>iZ;c!2X=ksVh z7X*x8BWHr#lV;bR7^_d%cqnUrB+TtUA6e%DPi2@XKIrJ83AipX*3K2K^8; zsG&&RJ81|!TFlYcpauy&YM5fM@1$X>?L(e(>2%qZv2Ul?4YD0+ns+I6y?&5qAqD6R z7&Fl%ul4Z$NZWJdB|rc+Je)xU6TE!;8T(EVCGL|Sp|xFlaLj8WGbY|(*c3Di`O)Fq z>(Zk^t8@x)FmCw12m!^=f!ghq<@H$5FQl9Tb6_y^3PL7`6BCF^fw(;g%)*OI><#o9 zUQBcbpNAiPk+Br&HoSTXppMtg4qr1pz;Q5vS1j9*F_JX-JvsA2G^zmcXgG>|X}FCJ zw8(Hw6~W=nW8C|OybHfda|+VYME<>I63TPR_Q#uf1CgTc!qjJv+iRvhyy!%%AAvfB z$+LL%7mD9N8qp6i`HT>GED*YmeqQNi3O-cqY+MoDO$Um6r5(&cMT7%ZCE0wQ(|G4Y zI-R^6%2S-d5b{u#r{VX4k~{b?5(8iS^xg{!VV2pNZwfdc2IH#e$5oa))ks{K^h+jq zNjdMWpe)_%La#ei%9zVB=FICmFX7EPe!k+XynMk1+)BBXX1lZT41xVFKWV7cX1NkR z3m$w#R*_d-Df3(W^7OP@S;4O--qLZlL!W$t_7=6hB^Ta)q0UAwm1o$_)LKd~O^hei z0Q|7=m)NWC=~iN2;C>>Z)>(I;pCpyUH-75cs9)|;_=(@fUccO}GNE#pIu_zj&{{y= zQp2IL{8~0ZkHKP)Uz`Bl!uN;Pp)Hko_i_16JbxGezc~`ej;nS2Z2uYjeGPx#LcLGg)5P#3$FL5}jez*s80F@X*1%3g^A3^fZ8abg7y-CHN-4DfaUrf+j zIp_;-erB%!ZLY7IE5DcE!5@;d54_`Ma}%xi^s>J5YXq$)_*KJ)cF^P5d9&-ewb+(? z-ax~*4ych;BQb_+Z9tt{OKz#(#69u*I?Ant+Au8Y!l3GE4e8Pr8jAABDn3BqF^azn z1L`ZSWFmpvJNE-J(9g9}>V&~WeQ!|xPAionuF2QFG^m~%1b-^UuS@RiHvEqy)txBg z5c#f_`NqW9JGVBxrM?I%{mVfA+Wsx|q$y7is-rEXY!tWDmwME*<0#)`(H|*4|FqiI zimSsY-}D;ju;1T8R{`Mq^o9k06JxDpoczf+=r5d7k3Th(ls>vS-LJkLS0k-He8+(p z*^fqDBp&3(LI0#|zBMua{?!?j&w);VfycpCU!UKf&(q&SeY78^J=os|x=nSjqYS)? zt<)j)HB|7pFiwB_9NgB2+ywQFwGJMl2g6!^Z&wSk%)|G`M4?}%BZe;W!>NEg+G8Tp z`2YA*iG$Ljn{Vk?7vr$EU+~#ad(RHWA0O!-_UQ{L;Fa9(<8=YQV{6`s@v{DY!5{Ya zwQg$TkMdVqiAcP3MdGEai}8~19lWnw3jLjI99B1gr2Xk4{g9UbFH2oKl1R3O(Y$pGf%O>T zDQ4)9e~s~}?8$?)_kLtEi8uSjVIK*bV(GLmvLppXY?%*Pm=0ARtJb`1_>+Wgpu|f3!Y~ zH~U*s4)E_M0Gj^-W|4)%)ZgZRLCQY=xwV4`CGZp9Kl)g^KWOh(^rzdElJSM%$niAl z?nxod>%=boJ0kHX{!LrTc2frcAdVB@KLR`s7(5j5M}OStl}&qZK=&EEG2ZQO9XJpw zzaad+zoTv_3h2@IELs{tU*(T8Cx44axW(59eR>Uo=13 z?#Kun_T=Np4>!^NL6JWNeobxXhyB7WW%MWV-C^;U`R>mKP`>e?U&eee>c;zIp7QOjOOV8Wqn7_fFO!6y1`6BSb;84o#+Df5}NuY=N zF@B)_{0a3#j3@BhZja8N8XwZ(FzL)E)Yso1&HhNMyW@;Mi8(FbC5Wx-d~=)I*B8{ET%daBM%>&#&u}U+8aK>qGlRU*A?AjCuGaKSg%YCv>#8U)n=|VGr_P zr$1hMd$)8vX#7)d_m;$)*B>sXUaB|tlgFq|^v6y6AWnTgo)Bjl_TXP~lH$Ve`O_!< zYJCa#*DLndb$gPZB00q0E&i;-U*dI)?4(QnSILi%EM@U0=_V4g09ES}uSvJ7OP3K> z^at@0ruSe$Cr3i`qd&v+c-*Ipd~a`ip7=NF zrh0Y!qZ0I|9>efD!lg0(KQ@4JcaL9QU>V{b{ZCo$Px)ocx5)1S`$tet_UQhk{&AFh z<2qhgK813>$xkRh`!wx!K_60*XTLA$m&N{`UcW5%d;0w{?2oh(?e!n}3+u@Nt|zz7 zV?6}?_O-Sw&L>;v*G)N*2-5dHl9GJBb&UCi_ziaX{y%pF`6-cNs!K3mL*J;|mu$C( z?c39<^C_65eL?)6R!@FGB_v-mNxV2HUc-dSbPE z00W-#N!?#?_sBZN2=F_43;am}P;)$g1o(CVPwuopN4_N z^s#^YdV~6&d`b4Kj=G(_4-BZy;3s25&wSc2>7KxFyEhBLpVQy3J$_s zZ$Wz!IvfoiPpo#Olnt6+?#6~CSnf%w2Tgf;pu_&&NBaAT&(;w7Q{rLE`XS5gf2`l1 zY$!edqlN0?Z5{IaS{A|!`-%2`#FG8UzBTddK3S?K7jRiT;{J{Y44G+zH5W( z-lw2Uj&)1@cwE`7uKup^Ke zf~C$6s$riFq5lWvQt9Ncedq(FyFdA9{7W8kd*3JSI$65^VzqMG(seQ@L=wYun2k6R9=k`qw-v<`SW=|@r!2C$0|h*@*v|&nqr0CQz`XR z(4#s@$D;x{7WBnGM|qP_9IYes4~+cEoA_VYLH=$tiT-n>aZy`mc90+DCIH9Zze``e zb5#D_no#%X-u0q>LZObD3a=TQOM=H^x==q6ti*kCQdi`SLCSNV5S9N|?SF^;JEpX} ztuKCzc=DVMd3*!>s@Th{)NdY_l;qf^w|#a1>(TljFrCNvX>A++guKOB3L@L>pnRt+KrDpV*PWout_UgRTW5WABxlm)F(tg9x5KFR8^=gsPqBx&_4i0s6r@BZI=KQlnY(%H*zw!Kp-_52$QBmA5luCx;f-tL;*Gs>yl?A$Vp?QG 
zKl;l>NC>MoSMyxY$zrC*#d*2r(XP#v^9^B@>sGNmQxTR|Xn2v*|2yIgVS?NVX3=wWJmu2U=*%d>WJ%9pb_3AH)LE#yBE8E&CgaO)73 zVB2O(m8?^Ot3{93xk|a_Ic0Bx;%DUx*@anq#&zZkdQmEtF?S0kh4j}>2cTTHB{Sje znp3Murgh*B;@!*O!>V4x_8PE|^kQeUl~%QwV<7SSP-%PHL}fn!+C~K-9;Z~Ab)D)Q zdtY@cUd3Ci!oz^P#Kr?Jv+xr1|G#;x6Zr1=!n{?PnW+^#nab*rXgm@er?hVC_1$+I zyTg|BcEWZ_MF%N~T@IcAd!LP}(6RZPOcIVhrH{dX+nMlBO6QsWx6Z_-il9ESm*y~~ zaKsET(sR(>0Yp6t7#C3#*WgiLK0Vq)e>>oY*wzET7w}L3zC}@do$2E}^c6*~>9oXC zOs}nIvwfT4eQki6J;(6=9{ArGuAhQ!^jE`V|4Pg=6(@vooVim@>B?dALx-H0a{mNH;k6N5gmcQf+Q@e=tH6RVngUd zAQGW*Z@xP2RovWMv7BGXVP?-dwMwP@P|+EmFA9;HFHS7fAI?@z<{OK(iTq-j0is+CO4rp%&FSxGQa_ogt0wjTt>RDp^mNi8qM}mS0k;) zAWe5rcPDi>PON21Zt5OnsXV=c)~s(AX6KDz zT6@&(q^^!`d1d*c^!G9HmqLWl1!v1|F0ZcsWp!nF6`?aj;U>9HA-C!Uw^po_&5VUN zB+qeY3!Yhs-A21#ABnfZI)~&4qSR2$&W!t>SNH6 z{up#j1azcR8_pcHy9Wc(E>#9iitFe#{AAfH=FMa(BbI16i*&_ji#a$}y?*LOoIM6* z;nLukdwqJc4T4<&<*!er02BL;+xW^V=j<%LsLF}MR5D6=c?_(qqD|t|-M`tyf8fX( zf+sgGI0?bz1)gGL{(PlwEl9VMC=_Ek&nf1u23)Ppl`IvIBudnnOZd!MR-v$M*4@;a z?xIp|R&W}EJDkMcj3E`s#h{0XEIb4yGht8@l(lznbcG!lO|7Hy~`;) zB7g@~8mGDs@?(*+=vO^?k$T_*Wf{pi5&kA_YmQVQJ zr=|Do2c?(XYZZFwo=m=8FOicQ@Df}-USe;%yo6?kY}8B6mPy~s$$*zr-Cpt$$HAscX$*n0fb5o>2Fly1sTt_vC8&qDz}lblL?boEK%vzwPSp?5#2qHn}gu=RL~ zn6jw0r+yqax~05L$%OA~G8J@|Y6|D>jC_2hI0GoLX3|rssiOyPJ}T>&#&d2{ng#Hp zcGC4Mr;4w#hBf1zvNSP=Idu>vZ5;wx*r@(e(bH^fG6x+;D}YGLs!It6yD3KM(VC`u zHj}twj^&oM)+Fnb{2YoyN4hDbT`Y!UA+wM78Wpay2nWmV0h3EIE5MGPhHl{&JpZjE zdAR+SRVe4_CFL}}S!xBYrGx5{$jN5iLLD!2-R*P_Dc_a%5_U6wRO?;?=A9hbFt{`b zM?_>iDz;+WPV;SYPP!wbf%#NKGzs>pVM5_If`kxn?e*7vV!sf}+x&Gzm>qaS?C4nY zM}2@lB%&R7Rsy_n0(UgKS|YMLDjvi|NE3gNkzuxlFF!zN?97PBP*lV)9|>bV86j!h z=&pq(AE@ykYJ49f`A&`QHpqY7u2hq5xRy!q-rFQ8CwMD$>pF6@o=ZKRT?)1MdW#Id zDkgR=Dvf(FmSf`V(ojo$2Xndt`w7;Kh*(x6;#?f`%F8hT&&9-3jVRwm-LxIy1^#Sf zP)ze31@JG$#rGS7gLJzWefNpD_(L4@gM*i2eN(@peCBCcHWPgM16>)z?SA4IG13%gl-&6Q(OpF5m0pVYGVu18-a|hUg)--+hg(;%Vg30P%xH;@lpw*9(V#_Tsfu#3If;iM3()Ys5cVlV?ttz$ao@ z$9TVZjeOXGe1N=1#VyN*czsVqts(0zL;OBYc$5snuL3_#cv=Jhwjo}L3zMB|-YCR0 z=Y#nS%li_qu^;A+7}2-!&-2>HlYQJq&?A4N4FlWJlt%#a$zG~ie7^~Mqm8)2H(>+h zKVu;Mno=I(XFkt(U;bX?_aO0eyyBs~Zht?UEZ4$x4H*x%e-EwuDW84&T@j=Fkp7JW zuglsr>%ZC^GH3^R*I&9*T6IFR1ReTHiFp3vsc-$AjsAaBbU1zjEu%_HFmvhacB?dg88QNA0`6eAhiHZl4UF zMBqOk0V)@=E_B*5t(H~w*wa{|4>qa?Yw<@F@ZyK?(=TdWBvQP3JkBYFL__asT;nj} zjgp%Me78-6m{YGWbCS&OyJ?TC!~d${|C`S#C*AyHUpI^>f2Rz^L#?BA)Bb!N{@-$O z#|IccB-$dn`LC0xYMVmN_NxHxQwY6B{LBpypOo|JbO1)vA_U|dY?%@yQfN}(Qg{0O`mKD|F$j~?}Y=`pB}zYo>EXnH;BebQr)k9^eoq3QLg V_eYOGKJrh;QPb;D$5oG-@1J5L-jDzQ diff --git a/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin b/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin deleted file mode 100644 index bbc7d301593f72433c5b7626638d0e6f0f4a0813..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11656 zcmeHNUu;`f89(-Qnm59nx<+SDBYiKg0b-XzH{#N z^|iB=G9e*$q?~)c|9{SRzI%Od9=q$vr^BI8s#nMn7QYtjYE75g0SWx#R(ZXD=zcLL zW<(F>Q4tbCuP@gdb|XKJm3wWjNSLp=`f{b<2)(kVmnw@@p*M<7LvPrnMzxZwRZF?W z)q1gzTc}pns^!&23D~5^eJyQZje z&aV`OE`f5Xf;HI+!1*>rWi8jD=j!=-T?%c3&m*Uu0v@i`aB>Z&;kT=t;#k_;#O5sI z%jG3IUt8wn*X(Me+Gy6`Hqok;dLv(HBtIz2vPiCb{$DC4w~FpvDX!?%#l?EDA?sGv zqf1AX<#N~*7ryuCT}N}0KbOqq%cXq1d%ixZO?iagr#7#li+o6>YhsTy5$o8gr-8ZI zhmEN9pKwiF9Pv?nOSH}4$OESdSGq;5FutdSP^<4Ven$uXMaH|`yhR=llm3AgLahwo zzd zY-Ct*X+VWV4NxfbkLUpA5HKP%-B_tjH>&o+a;Z{SUBD{0l&@E-m4{0CX$SlZPR+Dy z4?nWHP|Q>+=5lGZR4ixf*+=tBYpKOXvz{tEWLD?v)_it3W3A2G4Xdu#nj&=d>)-zE zjz43^`VqPuk&reLdPD0E4~Mj7I2pRCt%Y9{p*LQ;^4Gt;@!A`&N=hhvF%$wN5z+>T zb1i%cuv)f|pfK(c@Lbix#NH&ytE^h<4gXOP25i0IO%V!T>Os&y>@;srYKD|rdPGyw0xMR8)#ahX+KRNM6T_n)iS0l 
z8cXjac$Fp{)2rGySn+!Tz$2pizRe+MUnGWUn(n13Nz)f-GHCidO(v$-U)M^sK1LH~n$r7v_{lJ~h{{G^ zE82BD#Uy6*EFO09_ENEtsITN^lX;=%3kBQ<>bZL9(PF|%6FZ)UigqP|TOU2fBy`*n zmMRIuO6c=5iKTLNK3~q&agTZ6!Gol{R4>h!iwV70Y1qxg`!~S-La*h~L=*a|l}_lj zhMmx*M$gO#XvndAE_vX?(q#BFrOa!yZeE8>$|v)m#hwds-FZ;q<`Gl>lzk|hZlPcvsKOyL3!IodlEV{kWg$0TTq;$PcTmZc3v!?I zdBtmF4kR9U|Gjm023651Ze6Rm0devOU6ou}sjlg(G6W-gAThPj$d?Lw6K>a+%eo5A zm;uEmdKv`3$SerRlI|P?zwK%=lgtky_~#dk9^UZq2NJ1+qH4PqOp_=k=radJzEbe; zDObYCz_d>w?Fzs#$oMG8Bvi4&00l`RZl=S$zyn-OFE^BXHG|zVq1iAIPk<;rgz|1g zcGHA@BKk9Awvvh*`IBl&KR~wv1WKxHV1_PVE>fIT6L>2LiB`lZ_FIwLa*^`Vij1^V zq{oupPT|awqyfi#RyePOH?gZeJZF{gDfsKdQ{*<`({R^^=eQZ)ZkNNBrPJg+VS(TDRVrL#H6^gEGfqMz7fUhj6`W$;vpl*i-X=D#}5#`*b z!4VKuGU|fb;lttfWJ-Xn1)O7fZd?-CbV=$+xg-j{X2=;0hEAJHqVQc3QsXC)3oZ#E z_(@b!E(tFBNmNiSX||n2AuE!J!{V2sVsc4=xC}ZME{*z*Y&^&`a*LRf4slX>$8MT1f}SwbtT6 z`q=&)JWSwzC7(j{BSxE9@?s&suw2aH>As<_rMdB+Kmf9!5ui=GW2Y0RaEGJ7%*5@9 zWa2?W7O08)gGTnRBWOA`!Jx6ltf>`{tGk84h6Z@Bk8aTB^#skeO(kS=_nGm6&iI0+ zb1GKy1-)E#&s&;hs0kP|vW*f`fQMCbpXP*uBf!81FNJi=j8oD3&OGZ)LrI$Ix<$xv zcP2uX4wsL>TS$c2c2yZ}0z@#l+I=F@GEvev{q6o`WJ7{E&%Z9+qvfV@Ml~MjM8O!G zsDgp>M-8;CXUW!_Miv5%hx6i!lEx1F+_ zhzyJeW!WFkE{;j3EN7FpA}Grad9t8(%5t4%U_B_y4tcWZsGYLB1GJYxS$4>ir2?it zs2)KxY?2;+GD*~*?*^7(h4i>YRpyi69VhB6JumqwyrCpv(Bm2%Xx9pGz8_$1R{!MY@Lij z!4C`WbYLaFKtA-J=tm%t={|nzoPfQ^h+LEDNZhy1M);d!WcMJE@D_ZJ5O3ZrTv!uh zLTrwBa9H4ls1FX&i)0@^D*>z>!#nT2TOx8}pSXxu<(v463=gs`ewhqLZ6YBe1ASsA z*27_}t%&+~AN-5kg88@J1^iM>3_7Bh^ws7!HG<<#Sfh+Y-y{3v&xjy;Z$nlNg!-(Mc zVUOy;`)q{$oN!`WRFh%Kx6H?24KV9tcsNG&BlS;%4A;gOT0 z?Eco~g&6Tep5alsU#O4Kaj!lko%1aeJ?_Lcs^3AX5BPiLM0iV_1A@x55T*Kv^Zp)3 zePDlYb;jdgy6?XGRDFqGJ&I_~D1PMgiMrUu@srTy3j~k#Ar)LS8+VC)US~yqUyF&e zP4$hBitch0@F+g~VNLLUqQkp61uqkf>OQXE{lpJz>wXyZaM2H={w^sP{UL7mMd^gg z{l5qK>K%U;aGdgi{YSigbAW4c1jh9|-beXU^^g7NkJ9-XqVw@mYT8)SViUf}%2 zoq-{8ozD-~%V0Fo;*UUo{~X|29Ono5nsWNaDPLjUk7?-N8QtUZ|AzKI9^LQ4um|~p zIl#3z8VlT3jBdQ_aBuXVL3zNWSBWnFWH}S#$6NG> z?z1AncRQ zIiD;oU3r&}+oVD*^JK z>VMtGL(9kTM=YNtz&F;X6yPQtAnjkf6(ODmj^s(de|YkKB-Z4se{29jxKH?zSiAB` z-{J07Km2R~e)s>118T$`WzfC4`6D!ic0F=b2VQ=I23-T{>VjmCe0Adrig~cgw_lD> N+sV24aj&l2zW{T-tULe! diff --git a/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin b/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin deleted file mode 100644 index c5783aaf76e89eda9066225eae34d4470b74a950..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10296 zcmeHN+ix7z89%$TzQ);g45UdDnhC@Z)O5TTU)JGf2!U1&A_Xc`C>>^R_C|Zl?ATq~ z1U6|!tEwVW+lM0c0riQ8wlC#@N>zp0f(j3ahyDRogerv6)OHC_LD~Mk@0>H9-L;Eq z6VwYw@wt5W?>pc5X69_3JbvQta43}O6S9THZ$wkgX?XJnxp?!S93S5Ku$U2P(To0S z5fZ|x&(%HGb26Ceaj`7dJlgfSO0FTS%7Rs_6sp4V@(s`O+@e>l*tKfWF3i{SIoqx{ zUU4CBXB;m(XIH9j*)HX0iyo%7=Q_nou`+8X$9*~5lTe>?+Td+d3OQA;%wV& zshV+0aJA_1I$N#OJ*VQ0Q~a!4J~KaS7hI>D*NalIg1MV7DWt!4+5nXWTQU>gt~>R* zWZDGoAl}^!KCI?7Y_9_axHLfhtZGD$f4q&^1!ZD+ziDeY(a-#Qc9DuVjRUYWy`#1S*Z zaMwY57ZCL*U|d8|+<-@c`Rqs+{XKviVpkXZA;3cc_zp$!4W^HE(N`6{rqdFam|k14 z&h{OK_pAfd?0JUwcESJ7aQzfqM}I9$_7AQD)NCH`k>U0DG?l7fF#T0s^ewbQ3QBWo zhKM{)V{GVbvw2wbx6U>X_w!T;f*ArzFw&LqFJ49Wbv>F36 z-9z1d)ZI$madfMzt1$n1TH4ex2@_{%IYM2Xx`(Ly0(JYTdyJ*>^cq^TzI~WoG=^yH zQMZ@61#~N`E0?9ePm;eBB7`nDTX}0`ZS5~>t1D{=of!%@$$bjBwUBq~#cIV&TT^(! 
za@^UxXV%O1blee^lgpuG*X??7DQ`|v`Bf#_v{1BXD`vv9GHFwm`qd zCjCuL9Wm21rtzm>Q*pq?fsdcL`CrMR^8QF->LcZ)NnR52r=>p9n@-^`A>rw9=z)*L zdNEVVn^wN!xr^pji|1is)f~Klnb!Q|xM|fq*X)YNR%P2P*^VE%@;VZqfR5xRpdNcD`24&#V zz^HpmYH}R}y8z1Fl1u_74xO;^l~u{w8GKPy;zy}ulydSISQ$l|z^Qv=yNUnk%^L`w z+`ixh1d|tdijn!{>Vh>d-Aa>R3I0F9wM^v5Xi(v9-^FX*N|}-;#6{eND(wBgxci9I83a%J_cR` zBk_d7S47P!c~+gbsfJPxG>8%tlj%5lH(#>miVE!}V5`DUC8nm+lhcW5a@Ez_oWwH% z1RzVnS3*d^Zc<7HVhX0_ay}dnlu?CELLuoZRedaFjhh5bijejBjE|97!|o(g%Fu5X zb9qgWQ>K6dezHb=f=FhLwgUr+UEf2Pe#DG7$PSgd(5@geqqT&Il06tkXPa_Sz}3dxnQI{bs44*#_54*yOh+C-9utdpI~+juketbq4y z`MB?WN_x+JP=d9EUb-jKuiHyx)8sW+wzkuL)!OqI^bu5Q>pDF@!`$Ms?fckRMWTODcHI^ zMNC=Lx~F~|x4NagO^LYgYa$tRmTC&;Zb3f4lAHmQScTM7a_ZQ`9mix@(Rj{nO0xi7 z)K9yf<|pt+bq*QjvqML1Y?kCZ~ ziKFZ9S@}wiUQ^ECtEHajS~{SviJVNv%`f0}uCtx#BIVoiZo*E+k89m+K-tNX4Fk&q zXGCPIU+jdc-S~^_l5^4=841j%BBDvK&kPX?zY)ZRc>AEg?h%KDSlQ*TBf@OM8)8q} znm_6T{2|fbhG!+f8z*r`v#%v0`}@UXxCm+DFETvDw(#W#2#vjI5gF_kQOt+Km`_GX z8aKLYp~;78{D&Gpz=$p6Mt2+Jzi!v6Nhe&(BzX625?xdX5xkYUYZEyNe8q=z%b^xu zZ;|2GMaAAlrSc%gN>rR%9&Cy4Voq0JKgGHc5i5#BT!?{Qc{vK;g{XL@(a(2Lx9>)H zfj`$65Hoy70sPA`@q@;|0Nw8OzxQ-Z{2>PVfq|>BzP(q`Ki)5H#~MxYFGpJ9+gOqQ zIOHuyTjD9TuF-n9CBCQDWPgM16>)z?SA4IH0YB0b-&godREz-sA>m(qx{vhla{JhU z)(oxBi=l?-o4HkpTIM|i_D62SdbUs~w8Fr@3V$0$>j3!0ZfIeCx*vqRSI_r}H)5iv zK|4$U;NkC~#!m4f6048+K|k6U+==uQgDmeparTukvJ!2dH6$)XfRFSMK0^2lu~1J# z-Bpr2maj65)`;(~mViG;0`Z6SXd}wA{CB5nI| z_Lcq!TeRJ(3PlF6WV|&TJ-l*^2bs^A?F#QPAk0D;NRxmx{bM}?N z5%Qno@mv(^p<#cG{G0G%IIGRuNW8dkKpgbK;h(*9;}o%o3+J&m41bOI_t)i_6DIKS zDArNlFJ2=bb|4=h?+J0|iXq-O5K(K$dfO1cj}aavgYav>j}e~Mz`tXN*J8qC=bE<( zF~j*_KF9LD%xmn2xhG2W>-ZOV?c>QlZo}x2zx@pZ+tHLq0P@LRs#$!$348k+F@9VMJjBm@f$_flgUIgz;^%n9LI>U6UN%{-h3OhH9&G;sTK7^u`}VscO8FuE z8v|aKwHelby*p^o4)U(Qe4n`9-NAKo2eH7ud%r{zYE462jEScfQHHVV-G>5AcwKJ@ z|5i-gv-p9ay4z}f%MdTd#2z0HrvJgUZ6EvUU3b`bKlm_yT;u7<`;Q;9ANb1s4^q%R zZJ$QqKOzEDE@WM3H)lpIYwEG5u|ywiR1Y@d?^nRfAHh$*sCAG?@#^t7rxX$my{B=F z!-zLZZWi#JHW6Y@y}ryzGQaPny?GP<*A@TYd`>y(GxuS%_urOn2KNTGwG-IYt5(OKiY&py#YU^R44x*H{n02 z_~XhT(ofIJPX6~d;csrBpZx6PzZU%X*L4PdtoSMB!KV2&jDF82esx?lScuQcss2rw z?5Fe7lwiN#sq_;**qZz@EX4jG5OdNMZ1n#I$fi#IpD6wyEzLXmZ<6^V?(87c{9goK zw~g|T{HD0-|Iemr^>|V#)T53cfw#k__ebl|quwt)2KDjxq1qQsuSdO4dJOWBk9t2e Yy&m=c=rPDg{^>YsdOhm6>QVFk6SbnqFmd`Kl z_@X!|Dk6&eoCpb#?JxBQy+M5uBmEq%NSL3s{!*uDi)?2t+wPoM71=?{9%Ki-_F%PB z>#nwIXU_Fo&01r%v$nc?ZqNqSbhz(^ufJ69wVL<9*=zM%y)}sBM7GyjzB`A&Sid{Y zW^3`>+1i<2eWfL`5?F3`FeY08xb8!A)@m-j*01;bQs_4LI&wNH;NZFgyEd=~Uc1UE zj>YFDiqohsFQ4tzyGxw>Zf|w4Iv950Hqll){XxAm$bDFrWs_|C{?pVLS$W4;eo(f`5zgdNp^+ zyThb^(nY9|34ChSjca{c5xbh!6oT-_ zMzlzTwiS5;#-waypW>ZG9AVJ~6bk)lW)k-hFd}q(u+lv~SnV~I+MVXP21d2B_5SK= z=jnF+c-|7Ck+;6ww-)WL)m{5?FTd94_DlJ~({^F0_;h1%wp@Is(JnVyE9ag)V+|LJ z)-zUrCEFc}(6_(;d;c);T~sUqZ{!sT89PHaJ}{mQ=R!szJR36B!f%Ms_5Zp4?&c5P z1qSG637rUMoDt)$MnlFVQHJ4{L!l4w$lwm?NN7`X55q5oLPm@Phv7{TdjA~&kR67v z3J3p^A&23Y!pOi991DwCN`o&aKzMu>#$3pFkZeChw@={q1LGUd(fGT9?ieEo<31X% z(Cr}Ip1|#f6!{Xt5%O%FMqj{f(>Sb$?-@BFW$E^7;kTeEBZp)sM_i}r_K$4qucfX3 zWZ=&wP$Kd#<973gQK4~{#>Oe!-!uM>ISB*hxE1L}%*H2iyLr=i=Qj!Xs}S9fkl|m& z=(=%?U>U~|DN$ToZ0^-*G9@sbtGIw$*!QTZ2r0rB=<=MYi5- zVn^%O`t7f@GFA~in8mrS)$3%`2AIiWZ#&z`m{umcSjn7SUR|s&*ZSBkpLi07%IsRZ z-(Fm9WwNc#pf}9ic_}|EvfVnigG~0ERm^0&gI*>pH9BTKL_>|`Yq?_|l_t}psgS=r z>-lZS6g)Ejzu1f72ptBW=?vP9^8}tnI&pr^Bo?}l6mp+Dp>T_cDY4Ld#5D6R z4)%d-K2kD)6Z5BPIJ|cnwZ+!ib|?2REt< z&oMuUX30-5HNh&JfQ6p6ozH7R@q}1B({k`g3oJtu0)&WI?Kwt(0f}A9nfI} zwBn;!OaZ-+_u(n@ULN>j&WD%U3GdfKarczdK1zW@&x2l~T;a6a%`OjAI9*P0FAvYF zpazLfmf>5FCJn;2BTqfnko6}VE8!C#RASBmJqLKV1Bgobgq7Ydv@Lc_EN%~>O~uSP|# 
zk#yH@waCdLy6Mwjlri%?j}CFl)90zvmAyuWs&45tjl-vS#?TKSv@e_xiUx9G<=k?1sja74k;_NDX%-8nRVtM&?umLcyS$o}3VKBj zh9-*A1U1;=95Tm4Ol6eKOW1>CFNH%PO5P-UUJ^&bqzVe?rAX@#Dp*tw52?h( zC7~kKcE#s@iR;V#PKhsqfwS;zyV=q)G^vcBkp^W{t1V^+yPE_Uz3Id`ghwI29hofd?CZQoHi}1fd}BeHTH6`>(0 z%du!#T2e=4*)=LwKvphhjx5%TQCTho)iNl{E$qlrb!k`9`!6-`4~eRAZDrlD>=8f910sJFfLm_#-0=8<+UiP{KAW(bg|^Aw3z zD6bS%Lz9qCFG*Qp21=LQ0?_YNdaNKeR5g07`k>WaU2ZpqS^PBCpdTOW%h^-;haYaO zwHt%la=ky`lZ=!yvot52-2{7r(sU9bXK;3rN7C#m)2D#+HR_F}Rt+z$(1)e- z7jd>Bhw_=DnOx>cI1A?-3({GHzYS+0HR3EP%9^(2Y~FFU4SsS#&& zjke`%!Ev_Yah7%lud~$R+whjeM%+c$Z$Gz1$6eF(_-h9H*2FB?j>8ZeaagNDeoOb< zxh2=(qSs-pA^r?w7H-F3h>bX`=K&nH?%83>b-3(x7z>oo;k=&E_qH^U*oebeoODZ+ z@7dw9<8VIbahP5Q{0{5JyGMs1HsY`@7BXD9XNN1U!v(Ly*mB^o|Exk=OOH-vbSA^w z2Jr+~Xou<1d4&fh%mnXAh{Z}E(Z=K+(n{c8=nLk_0;J_yV)!2@TEQcs*v1mo(h9MSTck@0;?Sp;^Q}FnT_af#V*O{70uZ6bkpCEb2r1tcrKs`A;6TshmMsZe1-~qSF8!I@GTT7Hg(=dNYr3Jijwo zEXQyA=q8T@Hv>`4TnU%E&Yt)$#+>A0zKd#r;eyqe}c*~ zaZ-#=5xggdx)NPJ*+x8wCEMhlfx?Kkm&v#Ryl`6H3Ki?RZk8B4N&DADu%HhR*Of` zW;yl{fqOi3a#*%V(*3=#$~ml#m^jBP$V@IL8QavEIw}hn+?y;jg@zncew;f!XmkMC`rR!1;@b`LO%foK~kve?YlN-*&b5H<5(; zN~ylM*lVrf&ql%P;s`5$>NUz36#S^*Q3qD)4e}vzK7l}lh#Zge*AE#Q@04T4r#$0w zM645iulxuiBJefB`-dFZ5WDa}+)fAPzr}mtkcj*ENk7ROpT)Nq_ic&D{c(|-65?h2 zi_A{5E&iwhY(_dGB9n0el9&x+Y(+>LU-S(H4SuYi|Er$2@Z{h?qi+_xUw0fJ0bD`+ z&r9?OUZH>DkLS;aw%BnB_JJ9Z9x4|W@a)Wp=g&`Xi5D=Y;JyLaj)+agA~upDYSZ@+ zxVDf0^!3Zc4_{1**OTCn z#ZYDO=!D|GFd-f#ULoQ1P<&bvqu&eDiRS z|3D1?PVVA$it^he{ri3T z7wpL?N1pY?_d4)K68iQ8^zFwgh)d~Pup7ji z3~?#3Z<^?=Zz(A*eP$2mm*K?_)(iEG+ad65yeR&DsJ^xPD#3ESaX!mK{VjXKSl1JF z<2dG5#CQ&@TURmPUOUM1pXMjef8;Z5@7f}Gg4P4TDXd59fYp44Jf;swdcJ+F7m1R; zk6y)kv~dONML6ZIFQhNEMe(P2eIb2wVXPF=KJvTsLYU{n*5;KY>_zt5v6L>av_BbH zckM->pZUXby?{QH3l6*fO-I&!a4Lci2_^kCxnA5)@lgC6Z`_R+`GWo!#iRA7?F7>k zDj#WklHo}Qo?>{)fp;)GO)%`G?Ky@yeyYcX80QNnczwtEwd~}j&Exi@ulotbKlc2$0zrf808Oi_!H-TBYzl9@cIZ51y4=-={sb9 z5*_xe3v);?Uy6VCE-yT9$GCr$J=9OGuL)@%@D$Uh9C(u99Rx%FygkkEoI~Hu@NNmi zzU)8t7yUGECk(RB`w8NC{(NLhTn2*nW9-){duE3AN8aDSkM()xDZt&Q(3Qx~Uw#GB z&jUgHfWO)o6Z};yP4c%NBRDoo4acw%0vu1$gkv}v20XQ2pyL^i68~hH07)l!dM_z= z;obWL&TlT9*dwq*61));2b?o0`U(D|?HJD|xnCa_m@hGBzJUJ1a2K4zMe<8CfbS3U znBc=ipOc?tGM(VPVLCRt{OPdBcwu~rd)N!_3=7(DN&W#|A7X6q$EX+fqlfINna@A| z_~Y`t@EFbm3I4d3=jUrzDBlx&UPvL)s1Mc`o^L|RqkgkfrK32NItkU+7{u zm6(?hufN#OI6vxqG<*3oVw(2PNE!$7?;nExIPX^u{R?8|eur*^u%GdMxMVlDqgJ^0Zb3SXlF{I`-;!5#pgOo7Z|g^%_fu$)Hc9Ag4BgB?3HGkoVwi z^sg2naEwAg#Dp!^X@>v~vt`1XSmqxTwJ zBigIH*RM;Ud(w-xhOd*DFhi+7uC;iMTl2`BYD#AyR^IyIXc%i;5_;N zsQ{l)@;OickLw@GDY>{wzW|T9_K|X2`l<6r=NvFOCkC RZ?7DqZX>7jqn}#t{{RQ)El~gf diff --git a/cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin b/cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin deleted file mode 100644 index c7bbcfbc8082bb2f81a0bf73c6c8c5e63be6fada..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10816 zcmeHN-)~#h9Y6MUg3|`)2T=(tTW>})%9h>uM-scGD}h4Uv?4mnUO?z_?dv24$9Au8 zT&JPbDHB4|RHp4^NSlOsLTt|yn_%h#RCEt$|G*?vp=s(m^d__HKR~kC^N&&*ImPP3U0Y%R>}o)VWpb4O|xXV zg<9UsT5fL1ER~&QvzT8jxR{!*V--q;(xPeEwwdu%ZCgoo$#U}cPN{M7)x1-KzBu=A zu~^PpMZ{ciIh-q(s;*UXXDE?|ozJc;nhTD#oL4}xP{Q2F7bVeytqw$~W(r}Vn^mh? z6+&C!EhN5Mz{6GCy6M*8A=$+eDKCvmA;*y5xMYel5s!et?bh?#o6TCnewV*ubfWS zo~oV66>NL1=-RHme7auF)t9Y&dd8@%vd|lU`sv&KZ|-4i0HKQ(327st&5u6NR>Q-g zkF;~)Kd?}9L#qSQ2B6MT%iON{zA&Y|Db25DYHx6$0#(C#9LhsOh5B=BbHO4M<^b2Pi1x+l5v zNt$wzQ#9L49T#?KMj}_WBQ$-GI$@jaH8+}n6CqQIdd-?wo12?|-GpP!O--Y`yi9qa zMKWr6r&=hN^o)_lBb((c=3TwIY#xkT%)nz94*RNEEu6{gb98Xa!(BIU+%J~&gl=Rr zx;V@qPb8kuzooAooSo9?GnbyyGc;!Kk+^gmxEavnXYc-3y69qfJdysl`ZA<13H=9! 
zJ+hlg;gb;f%na-x#zM7_E#`G2UviyQeH+K_0cKPzJk06F%G``@R9r{zO360Vn_RgW zpSmGt5;w+7^2V4+-54`7H^$7Y&rB-0osyL`HRn@!9ZNUO?xL>~r%GmK0 zY!!S1(+4Iihcid8^*Jhwzy>Cr!!z+V3a$ZVA5P8y$MzpL@wKs(Gqd=LSc)H_6RT*8 zEofvVZ(?pre|!fv-4l2B(2YZE2XrT(n_|J`OF=A`YsQKQK7kW!GUr+a+o&Vp>Qd2= zX`!$t9ikuXk%d?4iFRQpecEIW^WuWXRs}u{Y3fptE+IhEP6kVgrYoP+8lMXSZ@z6Cx z$y+HJO9e@Gl5KioHj_+eGAT-=qe>}>ixBeQN+FYe0&YY>NFoci?J2}f47{L%MCOWR zk4ly4B*Bw}FV&T-N8p7if=SLwH%}MrykfAWCx`@++@RX3q|z3Z4KB!f1%kv(>G3*N zL+uV2mzep&tkc_X8jv_T&JvenB|GFsl;GCXcplI%pzot z+1e~m*^Fm4*)~f#?97EoFLD;h1y8Vq|l#MrKz1(_|hxBAW!Jyy#~{^gIqKhmO>XoQs<8juOHD2T?+*D@xemn%Yq+-GNp7%u#aBGhUQwUzC|( zlnK7GZLuf1!%V2s{(P%RVqJma^aun>#<#Ve;y{Xfp(f(KSQ9u)I+rxTE7}&vQIBBN z_W7K_LaZxTFLCWA)}eCBl}oj5p>?Qm26aOvFT6}%uW=p?TLDzAl9I+;E?d+$+k zrYSn#z*KBLI;);?T*Ip1ZD`$CaL*WunWs9L#(CPCA;k+RNd-nHt`tjgm-&H#Yr{|A zCmUZ6>^w@6e0WF!2ZxSj;?$tcF4T;Lm15CT$+Zz1j#)flbkX@H7xWYk->3LH7jRBGSC5zZ6#maX&=_ zC6Wvw#`ivs$Igqxe^pHB{2JkTNZ>>#DJSoGk86oT?T65O$)=~bv-n0?&GUnJK)(F5 zvRNlz!?SL%o$X@fzm0S=1_gHt?sj0=%8?HPYXb;`MW&&my~`EQa}T81uOZY2%IFrP1V5IsQYApJ1e1 zP@{KOln>RuEGI#{(n!F$36uYI zvED6uh5PO_=wmH*LDI7^HVXQu@TYb0!cAm<#KCJ_wAQx)UwRSz(H8rG?}Fn5|z95Aqt?YYeh05%3cgjriXudHHUdVCb8y#~5+Mg#OTP z&VYU%2+9}NKg{(H3wrpadL*zueS!7q>!bCFcsw1%e%%r{#RpwV^PmPtmmvdg1z22Lh9L-eQd%Fhky0be@dH5!^|4TUG1NI%c7fSK-SA|KrUeFR6u zi7+qpJ8W3|76bU5Y0#NG{sjFFN7JZc_^MxenBC!sb9OgyRNoU$Op_~)dVUe`5c7Wh z0sW!x5F@pQ#;y&q=T}*j`1|+b0wB6N&xn3?h&{Oa38Qnp)p|~27l+umM~BdV0XEH} z-@pG}^Vp-0;YUWlV82L7iqK-*sI?@qVoCHRcI-V~Mi{f+i@ zFZo*~QXU?jNKl@BISrWLyt=#=m!tsk(DNEUUNMeyF2>TJ2XzwZ1^KYHB>B`3e!YeKn7lT_{IFNvcgK2{AXVBNVmAIIG47WHwMP7` z_c6O-UYYmdXnI)6za{Y`Pp$;z_h4ZVLYIBER|x82@M=WKt95)5!i1Zbd}`38yjo|n zPZy1nQ}|zOA)o0XPbC$U|Kk?&C!~B_Iz;yQbFPGy{Rrak<$t4xec}t+$D=5(U#f3^ z`fPu@8AUxluwHKVfO_XmHY``VtUmd#Z7rt|7#4WVf23#YHC#ZBlLFU)cSDczD})| r8vXY8gHkY|QdX^#8vSwZQ?4q=ltK#bI$d(PuzXro?sx5*}%jVWWQ$fa!$kBw+Z0wVKF{@&BJVh6hHE!n!*$ATt!mb5Wpi?-VHZua zYPscE+ss>Tp=4HT&a_#vr^+s-rt4VcYPmXPmMo{}TGf)7%6JObNmT>7?c(|>bL@ug z%tC9DdplLB<*f>`PQ|3D}oC5B|IN@dEi z>Lnh1-Kn`Xcdm{IefpB?Pyx}YSMvTz)32}?RkuGlZBHAu$;pQ8ioTaiOVP0!u-a!$ z1wVM--S?S--%OfTrEH;p(GvdUVs1CFyL5C4?>vFWJ+0Qv|JnlbmR08l@E<=8@*Mq+>0%qov)u`2~hssu>e267VwP`!iM$y4hv=Vl8wvn)E zZaGmY=M(igw^XZUj8rnwD7*IUb*pgLnz9>0al%xp6V6Q4E#rJ-h4P_8Gl%W0Ggq6d z9hrNq<~%l&ZDwZ=S10Qkw>&m?sBok>b#$(1OB{SXU2A4I-2UIFTi&I$nGbC_wH1KPEu;&JGE?a=9eJi9~B(U`-(#ATDfje?#$zWbl)qEqORRQ6x$%aOh` z^zRh*$Zjr!|CGSzMqvjrmK)`K#nuhG>N<1!W$*DL%&1#DsI@BNCCq9yDBVt#Z+5BKStBV#pMw zu=@==M9;0{9`?+2OF(+X1YkE_Hwzh9=m;D28%DoS(EgZ_bi@o|YNeR5cdwgAqwvV8 z7QJjEpRf%3>9JAxT5~)K@&JmU6A2%&>PE#i8oYva93UWrDU%(~q$$jqicu;{vXg?- zf}Tx{<z~t;IEuxibnZgoTLzpDi6Q{~pyIW!Ovf|_$L(YqHr!USFZ>5fv z#20T-Yzn>gq}_fkkwUB|Qsk7YYDenjajhlgWlANzSW{_#u(YS}-klUjTbg$OHP&P% zo6hbZ+qYkAD~is0o7&7rXN@C{Ygl!>sB9XO?omTA^E9Wi6|Tl0Hnwp%L7Scu<1X{0 zV`=#a8L{nB!@+OLkvw)Yw4Y68?#U(Tg0^CL)|i~BRFujL*G6pE>CnkNbiTYL9Y;6c z(xf{Xz6T52bXL$&p-4PiY3^@^QaL$o8BWzB^JEEmo{F8jyE!;I!`1LyQ1{7)juaj` z#Ia>dg>}Krv(F>t!8Y*G+_OjvV^Yz0qR59jLhe$pmD*^v&W zJqKOopvVMwVl=s*`=Km*Jk-%BAXndJ68=>!mArb27b_$Dgi-FlpDs!&EF!VOdD#%7 z!Ulv>$sA7j(QJn2Aw|bE1*x9bd`7rT9h&?Mqk;iex)jkl?N{XUhRf7_mCVrdFkino zwK@-0|53U_=f^C!h*f$VF)_z>y?l@EYCmV&)gnEV9tVrju=xQQmG4EZeBQBV@j$<( z>v)eo{$11>wXMm8DpS2ez2Ei82Uw&7=0-r*4&vSDmKF=$7-8q=MI&CT zh6V??FZ7;={hFK=6*Bi`-JFq=R>^piaJhntZwwOWMhyPutW zIvC-#-(n1~g%Id5PLC0NAs!Al#hPHG5Bd1TQ3Q4h3Me12H{2ZL{DVS&aJV_d;UO-+ z75g1%Es06%JFqLSDVX;m&%6I_h=u=}^pz=cWMclcTuE zKF<%Q3V4CyAL!$+{kME(fQ`ffZye+vQ$Bta0~{IhVD#UZ8^@ZPzk}4Vg{^Fx8w~#B z)myUM#=>d9n&!dCf21MS4%5Mri~)}EdU1Fm$hJA55YI5z7Z3EKKcN2!cIPtczcsYY z35RL_;+p~TFX^!TfyQ2e{-%cX%%f%J5nmz%I<^XuzofCZPAeGv9|9)+xa5aM_@`s* 
z0{E0{FmA=r?=se80i=M=qHMgU)b_!{px4Ae!&la^|PE`z^@a3jPGUq zD=@4F-O)ro$o|=_1c#`fxqXs%f-$N;hhIyun;lJ~it;L=exQd%$4~I}5OqTM{)etp z)W6^l2Kxa;359r(AC&XV1UrZPy%i7jJB;@qn@7HppQsaV34M@9eBj5fO#D25#2*F% zDL{UqZ$pm6L8tt`0r*}b6O8=A{(amqAU3bD3vu?;9P5Js>)V2jp6F`7Blkrr{BZ-+$2_w37UoMBdaz?3Sl*zPn6t^x?`}w5E*FJn^XdbUB#?bt zKl2`kn2bpKME76wHw#Fn(Ib&9|3$y=A)mVCe<o8x z3Xk%te$T3S)p$@ERHMotp)W^H)koP;qpFu0{r0GTy}Bp?HL3#2(Jx0RRS(6kMpYj* T`sFA-l}E*|MwM4JD!G3EVVCTx diff --git a/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin b/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin deleted file mode 100644 index 4f37daa42cedb56958717d999109c3a635cd248f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9528 zcmeGiTWlOx^~}uL*>!BMotCtQJ|=AzToJ}Q>(?eJt=sg4NYv045Ut8EyF2S0*qxos zj-7R!w3~nmLa3F150Ln%`b8w*1N8?9p?y_(ZgibRw!oUsdp5@nFtGv)~N|xo> zp}Xi<1v@NOEUzBatg18bhPbrCz;->?o43qtBxh?98WlTmN*{=fz-c(aA`0WS%~$IM zyNa&5A+xnu_Zp$?g;NwiqvRA8=B;vI*Bl8{T@TlRQx%&LY>Hq3XY^e##BYSlYQx1=q1|vNO z?FwRc#jpvn{v_TC^yhcPcKGr^#A`6xga0z(l^A}zp!h1IPxsK*1-+z`5|1%@X~i1b zw;8@~4Ix%9GJJmz{_hMg?}BUFUs1{afi;9!Eg*i!$Xa|_!h~Ni`s;e=+cr$-7$NL`8Z2y@)zL0d%~>%6y3^9?dX*6Rm){})-L8M*U{3TF+{pTON(*Mhk0i@7%J`##Hs;pjqdqrDPN0iH& zs-6X9MQb9kdSUgg)weZG%_`bcs*c#Fak+}?*Fm{>;hncN1+kYetX#N+L@h54W$Fe$!xaTPr$7go^~T~W^} z=m$l}SagDhTle(5F^kVbb};XRdZT9T&Dvntr4n}7hShM7IrYvq@_fGH8>2GFskDjMFkADI;n?>9d?q^Tz|97&e8$V&rzOTGI$;+Yq zy<8vZ%}?T=$?^Fq)Pp{{4YyErbi?t&U`gL>uNw!$xAA#NHx_25bi)q=y(b=)O^GX-I8wR@~}+cJeDiI zW$3Alf9LF7c=i~ofG(vn!JWCuH5A%~sM4Le9O7X2K?`4syrNaW7a%XYk6K2x#E*ed z5VR(qx+k_;_)qR%NAS$n1)C_Cyuh0dnP00f8VlTQ>`$3uXuBn&iJmqpRYSxB8&)R3 z-P!d_Li|>J&8_PzYTc^fNfaFS5W8Xq%Ygi|y^?g##HN`EaL>%G>mlxJ)sQJNq#i^X zu^ndu7FIJ$v35e=sD?&^CBkRBCSG&X*?b=N3uQ;_1txZPmW&+UHq8B)ra!di=%rYVjKRpY2ziYzJRSlmeLlu#$@ zpEyf&3UJDigHaMt;vPNQMBH4)H+4KZ;+2F-ZJ8A6CotTMU;nh8)hTf6sOG+)iCwB$ z2udciy_CcF2-RpA+JbH}-NnGML&vXI-QtpgZ#Tsd@A&v-;9zdwK4;M_hE~;XgzP+X zF4O4p0<*B57YEG_0VJTK%pnVvnImGoiOQGVi>!9LG&hk zL9QNOFlL+gWis+*Cguy)U#Bm0@~|XW$@ls~CvX$a5L=HoGRIx@^`B*CD)MG;%o}z_ zJ5s|>Vaeof_2h<5;%59owjO`*6m32`(~&=BF76OE0iK^SZ~eT;$m;UsY;N|z^gRcV zqH!b$4co^LMNOj|9y25{Z(NjP>K78rV!_TOHWIq6h`R^12^9N4K1&0cO75ajUZ_?j zXO1)RZbI#jI*pK;W9^Q|nT5GrvyH%$+w6#N8rI{ePw~!z$2Znl;tgdF>oYU@38D?0 zFuEO@`_^vnj#r|);|Y9NZaAzxq{PPryHE(6MSP0r9w&N8**Dc*!fwV73hp(aW*5nZ z)N%?Pfy6`-hH$=~)}wq)x)VEM>)8af2zSFEZQ-}%EWo8*5!?sk04t*roB+LJUxOPv zVD?%V<5wWrvCov?zIGVj8CtIW7Jv=6{I!J)FsR8uSiK{>yQ*Sajp1Ov3kmb>MR1UoY<4K#tV2+445ya++wdJ+(x~ zl6{-Fdnbkw_fIubFh{gBUQNUIo2e9i?MeRa*)*IVcu_(TMZU!DukV z3~uLf|9l!g-UOAIM~Ojru!-p`0QiMV8rQG3;afs}VHo*&zT4x^Z6o_h-&w2&rVrtf zAWiaH?X&G|@LC!&&Hi)(5&bIad+lL3d3vy))xwxD2rmp^JfS0=YB0(X6eNsarkP;; z3F1GOR(^F1zh#{g_Fz0SO?8Omt89M)U#kVvnnBS(+|ABt87s&r{g109l z__j2tSZ^4(MbFiKA2d$^z z-W3hb-H+n^9LvA{BewtfHJI&l80{zhtbj&P-|k`1 zgFY62Mt|m3gaP9X&bn7>1JARimpkuId+nI%wZuYU|16XyGnJdf0| z-b}-TOYZ^on|Aw~8oZQ-8>4+>`VhjFb>NHl-D5rQ*yH%)0$Uz_tW zt{+9~e{82KJ(9JK}tRE4G#o?WNv0K10N9-OHBR;~{{#*=4%TVyYDDcEjw}Ni|AxsR~ z)T4j=2F|GSDXJ04FY`#hml1AG(8=i`!7uYn`sscn`6T{_8}NT^9eygQZvNkIz~2=7 zSz!?AXU}s2miph_fd8lK=qEqB_1}#A_^;;-yd?N3=J6rHM_TDA;O9=%@YoXx+piF-2RXD?Y9uKsI&r|D>ByFm?0a!RrU^jSI!K2)z|PnIEY~&N5$ej_YI3OR{hzgP3KW`J^4Ft?Z9FUEINKf{9Y$s7;wM|i_P-uCm0HI|sC-%a6cR9N@ z$8n;VfTC1Yp|pLV>I-cj3hGOx@=!!oMdBh<^?}Fs&2kY$A+1aT1gUA=@Be4!?Afzx zNK@3R5+ixe{I~fp^UpuC=lJ-+LthVvLiwwOY+>&%xOyY_EzI(El)mLLK>j@`ICgTPq6JkFY3S)3~!cFGI3 zFeT8aH!&t%0l3_UXfBm{^irqX=}4h1@MXkwP{6~ryKbrLcKv#lQS3{fnOK}^xzU(& z%I$d$e%onvTixY0%qCi^+3A*>-OOi2StiL9_y14Hq_gP$1$)749X{N#yE1Q8K59C$ ztW?6HX!!nn4&GCe{H08(+^Cm3!{hZuZptn6DYP*@sMgWX5znMyply1Gk@Y(BZVYlMeF5PtW-EE1u0MPG!`QZ_oK 
zc>jWDShUfHLVr(gLmvW@2<_=EwD)vdPIbQCtSweCD$SKUtyc5VdU=oYm=IOxv2LMT zd35eEyV|KN7VGtVcm8laZ+CJFPOCcCXieqpDXZ|9b>v8K&T;IDJ7>0+Md;OcfAq#x zzrl*d;ks1(T323avEZe6Z; z7UE@+A2(Q_d9MP@)iluUP@C^u`G zKbK*C5Us*KQ8{dD6}hZO*llF{WpbL7hmMxx&6Ov-5}e-WrxaL%ZOf5ixM#J^Mpr3o zXIWVe@|cln_Ty%UeQjr18__a~=97~eF*{k}^EQVyQUQX${s83=ei#WlEbJptOr0zu z0R#OJl2%hhi=cGo9WO)_QZ3;NUEYf+OfN!9FGP9isGQRBhbYf(O3_?Dg`FgN9{PQA zaZm|QOXtIr?+)Q-`sZQK36G7^C%~RF!9t9EGqIganCVx|o)aOLgKlp*^^!@;;A=6H zoq`d1xi#Iha0|M$6n<|(mj*-27IbMhv}{3_=0nSHy4)0bXxW0!tC)wDE$9dgRsdQQ zT`ZJqW}~H2+9_~?P)akMnX$zL4j_+(RS;t~lgU%!iG{_6`9!^DYnB>U0iFzNz_O}B zvHBX-DU`PC)V?929H9b~1VojLde9c3t)QzZ8P(3x>SD{vU7|^(Q?+O!Si|Gc;+YpF}PzQVwbEm!f>oq<~*pS}RSXWsqeC zkw$D`c*ezCRmU!O?RKkCuP&STeO{$EAmxU6D7Sa7y;QGuOO0};%O{AG%9@bsRV-8y zHIuGkC{Y6#t*ys{@Ui|`SZLs+k@pAlP_|DkSy3%l=j{^i+g)?1zy*5+4sepmWQCVb z!8nGqi5$xrcNrPuVM5lZwECS!^sm5aNDVlRCFYrW4(Yjj7%XUj2kWSv&RvPqoZFN_ z4wsQ>&*`GiX(S_@?%%aCQ0OUN9V&eO4a>*^s8n_R1j=!T2w5tkehOPcl-x9>v2XXY zc|^%i2h_05L@7*D68mK2mIkFrCAW{n^^SzBso|BxW6;`EC1Gu~ESpoyVnYoSAz3R6 z@ID~Q=4escaD$?peg(uQAj;NwqS%mwqVNSlMF0b$yj@ZBq{VdC7?hPO1&*LB+vCY1 z?+0b&Vj?U-SvJU%1$9tXE;r&5lx2fFS(L~@S(y+BOi-2$@??48d$+ryJZNBUF@p^A z-4l!KE8TWf0af?MBKtwqDJwu>D|3*@LJMtBEdmpKKZW{|vdBIc*~lW@?Rq!WBD+Ku z_V0=!=bRFRCe%&!GENn)_(PLUaSF9*Ww3x9^IuQVE4QX+>q#txg^6fi|Bc>R(ve(MpmCF>SrO_M{_MTmxhS8O>V|JFCRU?kceTG^b zbXpX>?(zqKB_qnc)gM<&h8!aOakW^Z6tvVatX9xsJ17}~zHtPVBpt%Na$|7_EoPwI zjnhXhlbRHCER;1Pe*^W?XqPhaE?h;2G9ccPAy>49GQdB{fC@R30g;moSp1<32%coH z3Ynn{2&81dRv5~Fh)Ra(88TAf7|t8WS9J_iAow($(gzEjxiiKwI$OY77{)v8=gnq3 z=Hf8snLg&M$7~H_#$@VH=c7O8aZe59&h~NVJ?`nD+}?$O=f*`s`8YF_JKr~V(c^}R zL&BFW^s!r#-Lbpg&*SXg-cNVCS)*Ue$LOn;%@?t_`aV;xR2+K=uLy$e*Z?bk!U zLTqgJV18lggF|AZkDn)jeB?g7bG~U)L~kAuKftT%HT*@#60D0~=7KRYZiwi%5pfO1 zV_}S~D9y%=-W}86pX&J!^}K{9T;oRXYW;7uZ`lI>wJq=`RzjOR0V%ybDaM!8+uz#& zHxObHHv?Y+%T3JtbnUvUU?50UlqDNE&@FT~CUixY1}7!vj0v7U(Bj zd^zR!i9o)Y5GM(Ix7@2!;*_!g{4vCvoAP(kO=3dI70mb6y<*ZGyV`>xA9KlDG~)k5 zN<29}GR8C9VkX3kN%3Snn)KlHuy}1*eOoyNncHE%J34v^q8tylAMfW0=6KTjL7FE3 zKl$sXN5LWE0e{l&4*JmI#gpPs8O8Aj|G1mpB=Iz@2g}13z&iezFP+ga^griBW5j>p zH1gw*snGVL$sGi5u05Ng^|g+s)5@F?${)m==}C$Y=C_U}wYeh%PrzQ*FX_<@|t!?uB<8I3D-x%T-q4f=aBK4nG32%y%Kv3FN zlE|-AN=2Og$NG?tFRO2Iv~ABM0VDdW7!$mU==1>$yadk^3~N^vyo>mWfE8ta zf{|~lewd;QkBE+Xl6K<>14G^~uip!JTVhtj@9=(zOaHLZIp5+5ipQwhFKPFh#3Mfb zw3|$<_F#(7HYtz5V0{5!BVi;e=9_f4#mS;D&$ox@Nl9Pg{fG5RywIBu>zz!T?t>GF zH4S4v%iR%w1TZ(im~T|h*JXVt$Hg{^aC97ne*Gz*K3+e_kH$|+X{wi057hdi9`O9B zxPfqUd!6)M7pL%ZgMX85n)93eL4H{7_3^bHJ!-k*@mbn%)R;fP`7QlVjh(FHb07Ml zuB8ga@@HZK@!CmrhSy`_p6>$Swi*#C1$6wI?!T(7&+*@DNyOYZz?Af<| ztGH*mf4;kyF9|w7H#eV(h|?)?gGUG7{{Yt3eewJv0{@K&P^qB$8f>!)kHk(hnrGAIk@?G2hD-tRHVm`v>$tujCD%JPoyey^d!TZkC9?_FE;d zQDi^q*K&IO_d!3+Ig3Ak)AE{+&`tet)Bh7h_EY|^YPxzJ$CKv#Y5>1Fp%A20HHsgd_5St}0FtNq{mqm2BQYjh z{cRlx!hOY$#8}H`e4Be%{_xoke*J&7foa4IN}+!0_z{|1yAnArE&7LXK%tJee){#1 Yt=g}ESPxY3_RA4!D>)q>{nT>*0?5{&i~s-t diff --git a/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin b/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin deleted file mode 100644 index cf975523f82abbc0139fa3d9c7423217617869cb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13832 zcmeHOYiwKBeLv)-MCxIQlAlWAn)amDfdh&!kyPx)s_l3PU?ahTVOz6|p+xH8Q6zIo zIajeg%UKX~K-;X_fHv5dBA?m;!yX+F^uw}rZL^@juy)&rZl3}pABw~s6x&;ZHLdph z{m;3Vmk%YDmkk(}3;5o1{_ofMpL2ETs}G+$8w>=}VWqF2dR1+hF&~>5JxL_=^Ygnt zujW)<_2Ir-1(b?6mK#mGS)9kneXg(SxjYMvN}xpEEnx^X$O>cxly(+f+eBicDZtA27$16XOyM# z{Mu4s(Jro*Ra{RhD^-lCt7Bel15_^-TKGbv*l1{`ZQw=3w41?$)tgSC=`{U%jZxxD zo0&+?La|a=vWxX)34YzKHEYfFI?N`nR&6wk)n?+OqO6PLuJ8Ysipi~_$5+d%@!H~I 
zqufM2Ay}p!T|BC+P(V`L`QzswK3~xEg+!rPSt&Mp$Lo{Y)Jy0?YV!`d$j4N=Ddfmf zqg|Q$CLo7>(2yDb7|+<($9x#y5N)&bDgtIHm%D|n3cRlcFrz;fcz+lCO@X`JyoKHy zr2N4az>JcBk4Ia6Z5Wcrgnqn>ehb&6fpW|Xc3 z2DrLH=b%jUhN6bH0zZPPj#(h^vynmE1AwT&baS;n-K^OQ%PZB=+5$#}rDCI2t6o?s zPS=Y{E!2w_nw4gKsb)LQZ&%ivYt^+Sr?xb+R9n4Jzp%XMRF~@M#`64;xy8!U zsd#-|1^)3r|M2_$Z|qZQ0M_U$6tKnuH?6^7G+;dwOa!a}j@E-eRDm0}ZoIpB%X$`o zcW>VKcNtjy!B_yJSb%eV2Yo*|Nl0h{_h*7%?+aK5DXYfY7;bOhc<09L+mMYpwcNB8 zDeNB3{iB2jdrff&nkCX4z-{y2tN?XgB{YB==NK*o-vsSJJp`?2ULe;&kOzNgA-a#i z>7Zit+v5#@FCNO;9k6zh^E*oOM+3L4m#=dCuLAkYtBj6i6SAAP25*euO>2U~-{NhA zxBa|b;El$^OWf|GkxRVYt6>;Ta`Yr`;>~wO&#R(S7+2`?-wX7zD1Q|En>tt1WP6zIh^hiCk$l&B zRlvgpydt=N62LVXe~Mrxb#qgcY)W0+(8(?lE?~=(w+X^;iGY8Rxc-TZe4tTPx33E7ZIa!(DP?W_CdRiVz&C@xA4CCh^~5ih?FJ5Wv3xv-GhfkODmP<| z)xvC|sN%&^3EN?#&{%o49Lw^E8ONcpY*%Au_l(7{F)meO$!sh>pN}n7YV*ZPp@FUT zi6@U!_)24CzEY0G%hjg69{ccp;HZk%i`YwI@wIFw7Oyw$SX@i=)O>`5g76m-4}4sb zl0Hf4)SYQh-33qD$MfIwJ)`dDN%HAxbEOnZrt<0%r%MPY4&SkXsrrdj?xEubHV>Z$ zrtA~5i53Xz0V*S%Iidmx)fM@A+<(GC(`e5Y6cp0jNr0AMyagW0z6~XShhW%8CE-q zU;yEdg$^p=&;tWo@lG+U!UDK2u4#!Yu_?sC7oxOwRR<5DXoWQ`eiDyx`@rHM!EFIe zOF#)uLbH#YZrcKHmo82d-k!3w_$=^twc0|#rrITm%W2^0T1#_ATsDsTliZM^ttUN; zv(V%w*+puz5rSSwZ94+d3#lD|@LovqA`RiaklGOl?}5|{gVE`QL>Lid@b^MW9vKZH z)#_R$zP#d=SVlNm>G_%b>})!dm9hm*r54w*urpc!C{c!uwRsP&Yh7cJbP?@illW3W zTGoOuGZNg4#;3_xEcx|$rJ?Dd6SSF(Va1M7{$d zQ6VCk=)h=iiJ|&-!)Nr;fI81`iODl7rLt>&Nv}{b(-`CSapthZZqCOngd+=Wth$yn zpgSEO%Aw(Ul5;Tt$J-3@xy9}>7Sap~5cu@h}yb=+{xsz9t@A0At zx_O0U28MU?nh@zn8rW`LiO38r@8H$T$kU6`>gGjqr)=i3lUJPac-d0eVco5r5!O2g zS_|^yLotFbiq-5xQKpN6L_doDyQo|{itf26G(A5Tt#?r9dzL-EJ@z z*Ik4+3AQeST~Ubd%#ID7X&IdH8;lOpZZH<_-53nEE`!~50E4qTHaOcdIOjK*Cr*F; z@<4tU29vGJVDvP%Byu}8IOiFhO85*;p#!!X>=xe6YB&Y9E`!}QSOHq@1R@QHSb$PQ9MeM4bNbX01)SNy=TpWw(hf}lZ+$pxS#q{Y@xwE;mG_B4v ze!E>P&Baq^K9|p*NuQvj zUf058<*cZAAkL3rUJq}2MqX^?}!#%-@g{71TFQs9A_u)z_ zhK^2Nalzy5a4;v)H*dl`oR~i^%iHnig^RPXWfH2=%bxvYWPM&u0M+}z( zVeBED<_`iBsV#v-xuSY7*27fU|bfRLe1EomYf8EVd`L!(C0KGjc&T1$;8Ab zrtI9T9A9R(K^{wHH6ELAFXm(El;HUu)Dxe{Ydj*;6Q9j!JnZg?&nGmVA?jiLu~}`O zyruTSCzHlNESLGB9dE2x7h(gnw)H2DyjKfcyRBM~24xeQ>L}^F)Pl}lp04?X>0$su zdn)phYR#C3H?5b~yv`^%W_Wb)6;@sx$#2;NduPa@o@+*sO!cNAIYu;_ZYH@)!3|tzxqC(RV`Re#7 zs0)qhF*S$AedAn6esdDqKP*Ot)B&Y#A6D+%QhV@2lre8E->H1_fQq!yixBWzkKvo= zy;~}DUqr8lVM+5gbLqp#8R@n0yCmhYB(`n;RXxLFv+EagO(^7AZLSFxnMCcc; zylD7m$VW7&=i^RfNb=|52g3As%f;Jardn-%0D8!S{Qf!0AHN(D`F8&Rzk&+>dx8&Z zysh!h;E+_QCqEpfzx_r&B$+8-49N8!)N5d9Opwr6k-)A~3P z4gKSY4_1T6e{Et{M85h@#D~?(K{aL%QoY*x@r#oGtTQJ0BaHcmJwtOC*2lH|m=Eju z#SrmXXK3&~;N`;nB`4BRF9RL*GBCh;iFoxgFwkBvk+yo7w1?a36Z%Kyz+WG48vVP& zM@2bqZ;ghs&ff5e)?6W8W36~iNBf)+jom_0b-av!b&)sajPGvIkE&}c!>xGz53GkN zXU|^0zL&5bIB%^t{8A34$|dWquaEHydGYJ(YmZ;3Eq>R2V-%ed7xR6@!g{-R$X{b8ze1x%(e-`xWUa zzy8#JSd7_(q@>yU~# z!F|`hc;d4vup#R|5(!!-zx0j221|hJ?JIVukNQr&hJH(XoW1*Fn0x)(2}}RnFa4A2 zfA8Kmp>XRp^ee~g=<+t&Lpj5TT#&efbydj^+|sMe$v73 zE?j+BZ<9{HdHulQ?{Q~CPzZXXYzlRG}?9ueQpK70v*UcRE z9rO-zNCM^)&MGbu@VIFq(Q)f7|>8p<^N7uLXhc~8CFS}g( zVnS2w*%=Pm|6rsh?d&-U|~dLTUWrjB)@4{hJ>F zKh854ian?|^pn>9iu!+HT}i!YdDH5=dwx+`-Y>6zpm?6!+WK8fy*8>2cz6i?e=vua zpIo0r;D0Rw>^1L|O+wlJ? 
zoyDbJHuRYHPX7BfiaS)>_0$nPoitE4N8a9K;J>Dy^*J+lAI}^9KQaLF%Q@L4FwfVu z{ayMG{E{Y!8M;pWZXSOEJZ+jYbnYW>_}#gy-{o`je;oQhjNhdrKTE2U-`R%$F{9tl z&ywim|Eq2ImwWKj&QAUmU=V>r7da9{iizUT>Xh)3(>277*m?){=cRcwH_Uya9r-wjUT~x n%f~u!<4f88%Z<1D^y`zBuuWV%87AI-J^^jR=f=l| z&Q{_(^=>$D?A5XVqj|0{TMcJ3ay!0K+N}U`#0`@2!;6xx?8mtN_uT_(@n7lq9jm*!8T$!t1C&kPNXm|z!*Q@YN`K|bi&vb~ z-zht8yX9iN|Htm0M|w0L*X?EFwlipU;^zAqkTV&yAmu-zH-A*hSW&LHKrZ=S&x zWnj0;w+jM)A`jCy1g>_&ee=%+etaL0((izunA^{PsHcWq;XgOwKSDbzQI@$O?>D(G z&0HT2&rxx7eR!@YL?Xr)C!(Z)8msUs^psmr75E7tokCnS?QgTkan+C!X-i2bTuQo8 zz1a%9tvV#9;l|x=@U@n^)VoSl?_I6gcDULP8f$&ecemH-Z}hiUlBjEYp_{a-og|3b zD}Hl(d%NFk58}<0uUp}Ov|s-D`|mvQ7gSU# zc0VvQ==XN@+d6V{MYde6gV_SeL?c8q=Jh$vn8xKJqlE}Y%=&p;_6!f#-4E{HeSc3l z+~0fe{vO6NHT`=$2!fH-^P{-c4a{|G4JVNsHT=YkJI;l&OP1?-c+O)dZf*PKYBlw5 z#jRS~H!VL%qJjCaXX6}Mp^Ib0w6<2OrWGcUX*Js2n%g!l>=uo{tUNS^BO*BE#m5(` zl8Lo!KWYYchV^l0vrNzPsrE*Yv^=w7uhR~z4U`DywmG#BURwF=MG4iA;?zQP$*%50 z7$1;#X&n%qeaXRFLr`~WexntXU*MH$duipZnsi-RG{5q&cvU^>Z9{3;B!uR zwm6wiw`XmobEvE>nhW*BZFyE7iO0>hC1+Pzhb3Zrr3_1!a{(5Vi5VG!T^0lw?Th3F zo(g1F7-#EBc^#3u(b!@Oi?CN<%2<|#1@@H+q7jb2&}t`EEV2&Mh_GUz%4o>rphFrM zLB|vPvf)`px}Imna$Q7l537UaTsvqIVLdq5fVxBk4CYCpw zO9d0p4^y3lFvYwV?kSaweRwh)8+jlcuA)el)p0iVem2>P3W|r=lNA+lJUFO|iZaAW zt_)(X%W3pQ^T?ub84$oW4-Zhy1~_~o93k?P@#9i?VgF(F1CO7cYy3^&*Awj0Q}Wfy zt<@sGhb<%Du1&a#PmD>&Jn?bqlL>U(I_EI8GlhxZ2wW=C6#5e^&!@7$Ph{k^0u9;z z%nY0GzH*Z2!!u)DC!CDC^J873;sLuL!QXfp+)s_D@aZDGjN>lEuP`?wV)N<8KjV~1 zh2uryH<>vd@@j#jiN^05Ec`?Ef0z9O^kOow@!8JvQSG!8^7!Q=aK%&ojU8N&3waoq+xTuA|$mvweD6IOx!S{asq< z>(lJ7vwr7Gg_%B0)3=|du(oT^yCqs>UA+*24?CsyJ%i(2f2*L28J+&jOgcaLE-B}; z&@YPkMUMa5l6L)vGw?^T*uQDet+%Mi{K9E0^qtSsl~Yhe8yfW2XX(~}iZk#}?Irz+ zsxPsuSMz81 l-{tW-s`_NdVp-Ko_1W=6jgP2HK2DY==6nQC)kpP;_fJyYA>IH0 diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 90571b0751e..93bb438b8ee 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -57,6 +57,25 @@ if [[ -n "$MODEL_ARG" ]]; then echo "Model argument: $MODEL_ARG" fi +# Cleanup function to remove temporary files and directories +cleanup_temp_files() { + echo "Cleaning up temporary files and directories..." + + # Remove temporary files with specific extensions + rm -f *.cubin + rm -f *.pte + rm -f *.so + rm -f *kernel_metadata.json + rm -f *kernel.cpp + rm -f *wrapper_metadata.json + rm -f *wrapper.cpp + + echo "Cleanup completed." +} + +# Run cleanup at the start +cleanup_temp_files + # Function definitions for each step install_executorch() { echo "Installing executorch..." diff --git a/export_aoti.py b/export_aoti.py index d798654ffe0..229d6e567e3 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -44,7 +44,7 @@ def __init__(self): self.linear = nn.Linear(3, 5) def forward(self, x: torch.Tensor): - return self.linear(x).cpu() + return self.linear(x) class SingleConv2d(nn.Module): @@ -63,7 +63,7 @@ def __init__(self): super(Add, self).__init__() def forward(self, x: torch.Tensor, y: torch.Tensor): - return (x + y).cpu() + return x + y # Model registry mapping model names to their configurations @@ -132,7 +132,7 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"): # 2. 
diff --git a/export_aoti.py b/export_aoti.py
index d798654ffe0..229d6e567e3 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -44,7 +44,7 @@ def __init__(self):
         self.linear = nn.Linear(3, 5)
 
     def forward(self, x: torch.Tensor):
-        return self.linear(x).cpu()
+        return self.linear(x)
 
 
 class SingleConv2d(nn.Module):
@@ -63,7 +63,7 @@ def __init__(self):
         super(Add, self).__init__()
 
     def forward(self, x: torch.Tensor, y: torch.Tensor):
-        return (x + y).cpu()
+        return x + y
 
 
 # Model registry mapping model names to their configurations
@@ -132,7 +132,7 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"):
     # 2. to_edge: Make optimizations for Edge devices
     print("Step 2: Converting to Edge program...")
     edge_program = to_edge(aten_dialect)
-    print(edge_program.exported_program().graph)
+    edge_program.exported_program().graph.print_tabular()
 
     print("Step 3: Converting to backend...")
     edge_program = edge_program.to_backend(AotiPartitioner([]))
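Pieced together from the hunks above, the export pipeline in export_aoti.py has roughly this shape. The to_executorch/save steps and the AotiPartitioner import path are assumptions for illustration; only the steps shown in the diff context are confirmed:

import torch
from executorch.exir import to_edge
# Assumed import path for the partitioner this patch series introduces:
from executorch.backends.aoti import AotiPartitioner

def export_model(model, example_inputs, output_filename="aoti_model.pte"):
    # 1. torch.export: capture the ATen-dialect graph
    aten_dialect = torch.export.export(model, example_inputs)

    # 2. to_edge: make optimizations for Edge devices
    edge_program = to_edge(aten_dialect)
    edge_program.exported_program().graph.print_tabular()

    # 3. Delegate supported subgraphs to the AOTI backend
    edge_program = edge_program.to_backend(AotiPartitioner([]))

    # 4. Serialize to a .pte file for the ExecuTorch runtime (assumed step)
    executorch_program = edge_program.to_executorch()
    with open(output_filename, "wb") as f:
        f.write(executorch_program.buffer)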
From 9b9c28bc737bb5a6e10563c7a74fc7f1ed5132b0 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 20 Aug 2025 10:11:50 -0700
Subject: [PATCH 15/50] remove temp dir in test script

---
 ...l4fepetv42wg64xygsadkkb43zczod6.kernel.cpp |   6 +
 ...wg64xygsadkkb43zczod6.kernel_metadata.json |   1 +
 ...b7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin | Bin 0 -> 8968 bytes
 ...3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin | Bin 0 -> 13832 bytes
 ...rqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin | Bin 0 -> 11656 bytes
 ...qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp | 965 ++++++++++++++++++
 ...jkb2gr6xgxxo6t35umkq.wrapper_metadata.json |   1 +
 export_and_run_aoti.sh                        |  40 +-
 8 files changed, 999 insertions(+), 14 deletions(-)
 create mode 100644 c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp
 create mode 100644 c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
 create mode 100644 cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
 create mode 100644 ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin
 create mode 100644 cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin
 create mode 100644 cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp
 create mode 100644 cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json

diff --git a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp
new file mode 100644
index 00000000000..02ec4e5c2af
--- /dev/null
+++ b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp
@@ -0,0 +1,6 @@
+// Triton kernels are embedded as comments in /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq/c5rhpvrttznyqa5pe725yxk3av45bswzgxcmk7tdg4j7yptcotin.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
new file mode 100644
index 00000000000..bd5d2c60334
--- /dev/null
+++ b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin b/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..000ca4c1209b77cdaec3c8757e532677b79ccc0f
GIT binary patch
literal 8968
[... base85-encoded binary payload omitted ...]
z`Ai0x6vsxC{25&w!y1kCr#tFLf-iz{cTOA%HH|){yW)jCVry1RhoVdV(zn3L8!w8f zaF0gz)9(?V>4(C7I_d98v%g93M{PB?<|F=0Y6OQboct=_OL6Henm2?q5@Rb8dR?2(87mIVOHSAc?(MKM6B=Y}~r)Ow?$1b2gF6L5{Kb(A#CcJ>p{F!ma z2PCeK{b#acB;VbBF-P?YmpTg=q1PEr{?>N`zIN5uX@;AzBl|Rlu)d%mN z<)!_s%rDNvNLM_o?A0b{Kb(J;3P;K_+>!iLD70SvFkjQjen#yF`hx3kit7dWPiqwa z2>Y8A&-r4UD};=^^&qWB_t7mv>#Y~XoxN)v`S@E-Jk!aMImxe2h-arI-ug5yxvejV z6VEdj-KHik9fW)~D-0!%^AEiFd~kfo7wVPs!SVbdC!XyHu4ifQpg7Tq+slb>cHS3s z7k0bf)5L|GxV6WJ-2dP@ivF|lGX(w{2;h|VpFc5&%W9rekJE@n^6^IRB)#WXjPZS? zclKlS4?rg&!d52LW7bd(NrvtnzPm8e9ZYQs{A5f-&#OO0)M*iGN!f$D=)a=${}|Jy zX;T0AF8WpVCl87*-V#gk(k}YzYNH|^)Jf!hx{MDSt=TG81f(G@hxR@S;eJuxU_y-WA7n6z+AV6L z&H=?w^}nge(LnE3Wb$^wM) zi|1Dg%XV?Ctm1l7t<*54uC95p15n#2wDE;TvC+^>JHU&GX)l8Zt2doO(`owk8l%LQ z4l|LQrDC$OI+SZgLeEy}t`?)v_JshHd)O`Qhzyk?{CJNP z4RCdZ&On*w4Mh!Y6@CO&9kW2-qp@M!1AwT&Y;&zX+g!JoRx7pA`6Y}BE5*k8dhJ}L zID2kIsiku(>GW!*oKDqO)3*KP3psmvv1xBCK3_by?p$2@@?3Q}y?&uuDnDP}sHE++ z`o&tjzNrE~*!ueS2jASM)DW!ES14dj1a4Tv!Dzs`5KIKDA&xeKKU9J1H?M!Nebc%K zzy~+3|GNyV!C)+aQ7pi@zJq>{oFpVPf%}EvHwFUMLCRX^Z34IVuD^f%)-A}!oLX*J z%M^AG=l)T`gT1ae1kDO*4&b)^Z&rZ1t`Zu+jdKi_f^UQNpdNx&G%u0sAjpG1v=H4V z;B-(i`knCxz!wi??G9MG$oU zycv%6@b-id{XiH>GCl+F^=<1RVo&q-eLU z0=)kkZ%27s#OX?lSTY-nFXm$_)%C?W^D7@0BELO|0c)8ZJH)Ef?4;)qTdJ%g`EPg(liN)(pI~LaxJvE;qp&TwHi0hB_VyrCVmbGKg&E2h*I1+ zp^CMVOP@*zJ&@_-3AMOfc2%U39^!Z`16k{~$CDD2Xi6nc_%Uf>3~RmvlMxK?$t*1{ zH|z0g(QZ3shtwJ&ok+jKsX1uWF@nqf7^Sje3-F9lW7YP2W?1bc zf&qj-7CNYeLk|pW$2-Na3Jc)AxTYnp#HJ7jUx?D$RUJHpq8-+>_(?p%?E{O41a|~9 zEdeDs3C%uox@`-*Q@S`!cxTGe;~GM)%CcJg`Lp?K#4MJoL}_dy4E!oNf*&RHi<74 zq-8DmG9$swXndNC#gbp2R~niQTCo*+S|z?2&`_!$A+aSQ(}j?@5<%yD9CK13MC7{= z5)~qni7t%xmKdtCq z=;alXIT+r}YeJ+SX<&PKB_eaMyo*;aBTp|%tCttaow1qAZeDT5<7G=>hjq7hMp*A0 zXf4Q(55)+&C|0u%MVT%N68$Lp@1k;@D7xpO(DeLVwBALbi2Nu9%%EiL_{?A#xF{5X zpNp%f&JFW+;~dB)E9jAf6&3NV}lRkS{)13xm zaot6DlVIyH*cFBN&fT%Wb8Uk&euL3LIt|9+y&HqU)?=`{4q$Nhjt$PX4bJ%u=84l^ zzdVrNg~4R&F&I6~Es5M68=UhDP9=N>r_cdA4R#Cf&T2RXwjP7sWFf-&J2p7qHaP7! z82e4nOAOyoX6y{^tZ+HsmY+ArkUCA%NcP%Bxmu4e>vy?1`nsHcEcr<8@yw$fpL#5n znoG~0mR?Uy?payC{nh2oMijBDb|bkT2~unYmB^OEZ#cRPDd=(ySWFx>16 zhI?Hbla;fg=7Bgrh8w@(;lA(V;ihRY-1dDK$&HrAUEhb%c8hJ?^nDn*You{S`QG}n z5@aSE!u)U%!(20(lx2?eS&4vV>pCrGB?MXFB&yE=NpKWl+-3FB>Lt}n1>Vd=Y^cap4j<$b2f1?5_=xD=MoU+d#lb#?8(W~F8+w& zav)5;KcVT^n|S5Kvz+86hxGemoD;(-&J>KxqEo0DJJXVrATUf_EE4*hW~9+g_cNK8 z_{@x*o0sFu+z!ZN$*jg>6Yj@+ES(ZO--G(%GkJ|iWcuQ>8I6bCeewB(#xq2Hj6XK7 z?UT3Ee)we47>MPvShVAf&Dv6Ih}O3M#F6)Efor#23wfypVpCluotIkB*~`;4pD?_r`;JlKYuKhKsI+^6yZr7NC{gqW%x4u%Q5z* z`4|#t4%$YeK16#5Qy#AS$cbHn07^(Eb`p_ww=OxK_whl?u**&F501$P+J6i!*Gl~R z;}!m_qAYI{L*@qp#l=Otyn#Quc3)R|Naa@+y@cJ2pVhqAfVJWhZ5X;VbVY?`Bl6Yp z3s4uD&|_*2P5Q>Uko@K(w0~5L3aJB1-8!t?xuy2thbR-?T)tEJ<^dJypcf(Fx1Pc` z&wE=cbYDcphLoz~Unn{%y5wUb2(77@3Jpip2*%MM#@P_1$;EG%9Q@oo|I<7_#uF)$ zi{A?UUsVt80Dp7`_zRZ;EfGMaN5<6DW=K38`&Oq0~F$ zYTDi^JO*Z zL_>!-|4`dE?clq(M?W68#~B&%zPG}jUv$FJOgf!5{)9#y6{S7(=e8~N?l|NPj!1-l z?#hdXKSDmDK|P;zA|sMN4?h&9zbzMUhnZ^C`WW<(2l;~wls|bnB=YUSA$|oF{PzSO z)_7avo#7FwQcr$3On(QBe8`Jv`iq8sbcFJhp`URgVcH)Q`X}Mf*a-a-ytZd}0n_>< z5)J*6h!0kS$A4{VS46)0PsK;ot3fqk4^zEr{rF|cf7Y3h{1L`{!=8}^4C|BHe#}Sp zd?Q4B))^VT4|usSf5nM()yq&hTtg|>w5+3f%Eoy!!PAvs$8<(1_l_vkQcv!fzJ4aI^uWj*T>N*aWUUVEUdSCNBs2` zbwQP=Y{#TC&u=)`A3lNsW6tHz7W4@d!n5m%x9dzd;R_lIOaDX zjQ1O}zz@s%llp}}*M=qsNw4iAe`LSMPyGDEuvf3N-`?Zb7o2f|Qyx9~&$avawDYx! 
z`PA;9Cx0=3{YMY##ofBA;R^+CJpFbz72(~*U_&KpNY$ZQg5vfUmj=u-{V+euYY1bVGr(QyjxMwzeV{`i~6KL0YB~F zcNeZcthZ@rQ1T`Ek*`U|vbev9KG37z4t|W(%N+RZWlQ9NBtmO9}VhL0_QJ> z4%_2F@7GK3j1L^P$AYFq$o@wDQ8r-m|0E#t#$#qK@+O1cubhzo!}dhb`_&fWL;kUZ zpMFnJ`ZYkXXYjDSJGh%;Uf&xR`4bBm8+&E{ehG8#-^*iy-@^qf_GtRu&vnix>t+u7 z4tfVUBmwg&XB5A;a`)Fm$p647^D7tdFK$LC8Na8(zb}rdNqa!*&E4;&oDf&CQmikX zkAWzr2G9Lg`cXZK{Q~}A&eNBwM|FRlb_NC(Fb})?sWTed_SMIXqw8Dx!<)0HmtC%X za(?(5Y>Vmz2w*%=Pt(pWsh?d&-b+&|L1L|P$^#wC`A1@mIKQI9D%Q@L4FwZx% z{XO~*{DLNk8M_}#gy-{o`je**eHiQlCoKTE2c-`RoxDWl)d z&ywio|LYz2SNrhO&Tjq`U=V>r7Q>DMPMVTZVQGEBVvd;;2m&yA1!bou@ZKN71C literal 0 HcmV?d00001 diff --git a/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin b/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..bbc7d301593f72433c5b7626638d0e6f0f4a0813 GIT binary patch literal 11656 zcmeHNUu;`f89(-Qnm59nx<+SDBYiKg0b-XzH{#N z^|iB=G9e*$q?~)c|9{SRzI%Od9=q$vr^BI8s#nMn7QYtjYE75g0SWx#R(ZXD=zcLL zW<(F>Q4tbCuP@gdb|XKJm3wWjNSLp=`f{b<2)(kVmnw@@p*M<7LvPrnMzxZwRZF?W z)q1gzTc}pns^!&23D~5^eJyQZje z&aV`OE`f5Xf;HI+!1*>rWi8jD=j!=-T?%c3&m*Uu0v@i`aB>Z&;kT=t;#k_;#O5sI z%jG3IUt8wn*X(Me+Gy6`Hqok;dLv(HBtIz2vPiCb{$DC4w~FpvDX!?%#l?EDA?sGv zqf1AX<#N~*7ryuCT}N}0KbOqq%cXq1d%ixZO?iagr#7#li+o6>YhsTy5$o8gr-8ZI zhmEN9pKwiF9Pv?nOSH}4$OESdSGq;5FutdSP^<4Ven$uXMaH|`yhR=llm3AgLahwo zzd zY-Ct*X+VWV4NxfbkLUpA5HKP%-B_tjH>&o+a;Z{SUBD{0l&@E-m4{0CX$SlZPR+Dy z4?nWHP|Q>+=5lGZR4ixf*+=tBYpKOXvz{tEWLD?v)_it3W3A2G4Xdu#nj&=d>)-zE zjz43^`VqPuk&reLdPD0E4~Mj7I2pRCt%Y9{p*LQ;^4Gt;@!A`&N=hhvF%$wN5z+>T zb1i%cuv)f|pfK(c@Lbix#NH&ytE^h<4gXOP25i0IO%V!T>Os&y>@;srYKD|rdPGyw0xMR8)#ahX+KRNM6T_n)iS0l z8cXjac$Fp{)2rGySn+!Tz$2pizRe+MUnGWUn(n13Nz)f-GHCidO(v$-U)M^sK1LH~n$r7v_{lJ~h{{G^ zE82BD#Uy6*EFO09_ENEtsITN^lX;=%3kBQ<>bZL9(PF|%6FZ)UigqP|TOU2fBy`*n zmMRIuO6c=5iKTLNK3~q&agTZ6!Gol{R4>h!iwV70Y1qxg`!~S-La*h~L=*a|l}_lj zhMmx*M$gO#XvndAE_vX?(q#BFrOa!yZeE8>$|v)m#hwds-FZ;q<`Gl>lzk|hZlPcvsKOyL3!IodlEV{kWg$0TTq;$PcTmZc3v!?I zdBtmF4kR9U|Gjm023651Ze6Rm0devOU6ou}sjlg(G6W-gAThPj$d?Lw6K>a+%eo5A zm;uEmdKv`3$SerRlI|P?zwK%=lgtky_~#dk9^UZq2NJ1+qH4PqOp_=k=radJzEbe; zDObYCz_d>w?Fzs#$oMG8Bvi4&00l`RZl=S$zyn-OFE^BXHG|zVq1iAIPk<;rgz|1g zcGHA@BKk9Awvvh*`IBl&KR~wv1WKxHV1_PVE>fIT6L>2LiB`lZ_FIwLa*^`Vij1^V zq{oupPT|awqyfi#RyePOH?gZeJZF{gDfsKdQ{*<`({R^^=eQZ)ZkNNBrPJg+VS(TDRVrL#H6^gEGfqMz7fUhj6`W$;vpl*i-X=D#}5#`*b z!4VKuGU|fb;lttfWJ-Xn1)O7fZd?-CbV=$+xg-j{X2=;0hEAJHqVQc3QsXC)3oZ#E z_(@b!E(tFBNmNiSX||n2AuE!J!{V2sVsc4=xC}ZME{*z*Y&^&`a*LRf4slX>$8MT1f}SwbtT6 z`q=&)JWSwzC7(j{BSxE9@?s&suw2aH>As<_rMdB+Kmf9!5ui=GW2Y0RaEGJ7%*5@9 zWa2?W7O08)gGTnRBWOA`!Jx6ltf>`{tGk84h6Z@Bk8aTB^#skeO(kS=_nGm6&iI0+ zb1GKy1-)E#&s&;hs0kP|vW*f`fQMCbpXP*uBf!81FNJi=j8oD3&OGZ)LrI$Ix<$xv zcP2uX4wsL>TS$c2c2yZ}0z@#l+I=F@GEvev{q6o`WJ7{E&%Z9+qvfV@Ml~MjM8O!G zsDgp>M-8;CXUW!_Miv5%hx6i!lEx1F+_ zhzyJeW!WFkE{;j3EN7FpA}Grad9t8(%5t4%U_B_y4tcWZsGYLB1GJYxS$4>ir2?it zs2)KxY?2;+GD*~*?*^7(h4i>YRpyi69VhB6JumqwyrCpv(Bm2%Xx9pGz8_$1R{!MY@Lij z!4C`WbYLaFKtA-J=tm%t={|nzoPfQ^h+LEDNZhy1M);d!WcMJE@D_ZJ5O3ZrTv!uh zLTrwBa9H4ls1FX&i)0@^D*>z>!#nT2TOx8}pSXxu<(v463=gs`ewhqLZ6YBe1ASsA z*27_}t%&+~AN-5kg88@J1^iM>3_7Bh^ws7!HG<<#Sfh+Y-y{3v&xjy;Z$nlNg!-(Mc zVUOy;`)q{$oN!`WRFh%Kx6H?24KV9tcsNG&BlS;%4A;gOT0 z?Eco~g&6Tep5alsU#O4Kaj!lko%1aeJ?_Lcs^3AX5BPiLM0iV_1A@x55T*Kv^Zp)3 zePDlYb;jdgy6?XGRDFqGJ&I_~D1PMgiMrUu@srTy3j~k#Ar)LS8+VC)US~yqUyF&e zP4$hBitch0@F+g~VNLLUqQkp61uqkf>OQXE{lpJz>wXyZaM2H={w^sP{UL7mMd^gg z{l5qK>K%U;aGdgi{YSigbAW4c1jh9|-beXU^^g7NkJ9-XqVw@mYT8)SViUf}%2 zoq-{8ozD-~%V0Fo;*UUo{~X|29Ono5nsWNaDPLjUk7?-N8QtUZ|AzKI9^LQ4um|~p 
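The wrapper added next defines the C ABI that the linked aoti.so exports. A hypothetical ctypes sketch of driving it from Python, with the symbol name and signature taken from the generated code below (the library name comes from the link cmd earlier in this patch; error code 0 is assumed to be AOTI_RUNTIME_SUCCESS):

import ctypes

lib = ctypes.CDLL("./aoti.so")  # library name from the link cmd above
lib.AOTInductorModelContainerCreate.restype = ctypes.c_int32
lib.AOTInductorModelContainerCreate.argtypes = [
    ctypes.POINTER(ctypes.c_void_p),  # AOTInductorModelContainerHandle*
    ctypes.c_size_t,                  # num_models
    ctypes.c_bool,                    # is_cpu
    ctypes.c_char_p,                  # cubin_dir (may be NULL)
]

container = ctypes.c_void_p()
err = lib.AOTInductorModelContainerCreate(ctypes.byref(container), 1, False, None)
assert err == 0, "container creation failed"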
zIl#3z8VlT3jBdQ_aBuXVL3zNWSBWnFWH}S#$6NG> z?z1AncRQ zIiD;oU3r&}+oVD*^JK z>VMtGL(9kTM=YNtz&F;X6yPQtAnjkf6(ODmj^s(de|YkKB-Z4se{29jxKH?zSiAB` z-{J07Km2R~e)s>118T$`WzfC4`6D!ic0F=b2VQ=I23-T{>VjmCe0Adrig~cgw_lD> N+sV24aj&l2zW{T-tULe! literal 0 HcmV?d00001 diff --git a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp new file mode 100644 index 00000000000..e91fc32554a --- /dev/null +++ b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp @@ -0,0 +1,965 @@ + +#include +// Definition of AOTI runtime interface functions + +#include +#include + +#include +#include + +#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ + try { \ + __VA_ARGS__ \ + } catch (const std::exception& e) { \ + std::cerr << "Error: " << e.what() << '\n'; \ + return AOTI_RUNTIME_FAILURE; \ + } catch (...) { \ + std::cerr << "Unknown exception occurred.\n"; \ + return AOTI_RUNTIME_FAILURE; \ + } \ + return AOTI_RUNTIME_SUCCESS; + +#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ + do { \ + AOTI_RUNTIME_CHECK( \ + actual_size == expected_size, \ + "expected " + std::string(name) + " vector size to be " + \ + std::to_string(expected_size) + ", but got " + \ + std::to_string(actual_size)); \ + } while (0) + +// AOTInductor uses at::addmm_out, which doesn't supports +// arguments that requires gradient. For this reason, we +// enforce no_grad context for run APIs. +// +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct AOTINoGradGuard { + AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(false); + } + AOTINoGradGuard(const AOTINoGradGuard&) = delete; + AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; + ~AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(prev_mode); + } + AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; + AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; + bool prev_mode{aoti_torch_grad_mode_is_enabled()}; +}; + +extern "C" { + +AOTIRuntimeError AOTInductorModelContainerCreate( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + bool is_cpu, + const char* cubin_dir) { + return AOTInductorModelContainerCreateWithDevice( + container_handle, + num_models, + is_cpu ? 
"cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0\n"; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_single_threaded( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** 
+
+AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    bool* from_folded) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetConstantType(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    int32_t* type) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetConstantDtype(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    int32_t* dtype) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *dtype = container->constant_dtype(idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    size_t* data_size) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *data_size = container->constant_data_size(idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap(
+    AOTInductorModelContainerHandle container_handle,
+    AOTInductorConstantMapHandle constant_map_handle,
+    bool use_inactive) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  auto constants_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { const auto ret = container->extract_constants_map(use_inactive);
+        for (const auto& pair: ret) {
+          constants_map->emplace(pair.first, pair.second);
+        }
+      })
+}
+
+AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer(
+    AOTInductorModelContainerHandle container_handle,
+    AOTInductorConstantMapHandle constant_map_handle,
+    bool use_inactive,
+    bool validate_full_update) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    container->update_constant_buffer(
+        *input_map, use_inactive, validate_full_update, /* user_managed = */ true);
+  })
+}
+
+AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer(
+    AOTInductorModelContainerHandle container_handle,
+    AOTInductorConstantMapHandle constant_map_handle,
+    bool use_inactive,
+    bool validate_full_update) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    container->update_constant_buffer(
+        *input_map, use_inactive, validate_full_update);
+  })
+}
+
+AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
+    AOTInductorModelContainerHandle container_handle,
+    AOTInductorConstantMapHandle constant_map_handle) {
+  return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
+                                                       constant_map_handle,
+                                                       /*use_inactive*/ true,
+                                                       /*validate_full_update*/ true);
+}
+
+AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
+    AOTInductorModelContainerHandle container_handle) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    container->free_inactive_constant_buffer();
+  })
+}
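+
+// The constant-buffer entry points above support a double-buffered weight
+// update. A typical hot-reload sequence (illustration only; SwapConstantBuffer
+// is declared just below):
+//
+//   AOTInductorModelContainerUpdateInactiveConstantBuffer(h, new_weights);
+//   AOTInductorModelContainerSwapConstantBuffer(h);           // flip buffers
+//   AOTInductorModelContainerFreeInactiveConstantBuffer(h);   // drop old set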
+
+AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
+    AOTInductorModelContainerHandle container_handle,
+    bool use_inactive,
+    AOTInductorStreamHandle stream_handle,
+    AOTIProxyExecutorHandle proxy_executor_handle) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  auto stream =
+      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    AOTINoGradGuard guard;
+    container->run_const_fold(use_inactive, stream, proxy_executor_handle);
+  })
+}
+
+AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
+    AOTInductorModelContainerHandle container_handle) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    container->swap_constant_buffer();
+  })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* ret_num_inputs) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *ret_num_inputs = container->num_inputs(); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetInputName(
+    AOTInductorModelContainerHandle container_handle,
+    size_t input_idx,
+    const char** ret_input_names) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *ret_input_names = container->input_name(input_idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* ret_num_outputs) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *ret_num_outputs = container->num_outputs(); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetOutputName(
+    AOTInductorModelContainerHandle container_handle,
+    size_t output_idx,
+    const char** ret_output_names) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *ret_output_names = container->output_name(output_idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
+    AOTInductorModelContainerHandle container_handle,
+    const char** in_spec,
+    const char** out_spec) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    *in_spec = container->get_in_spec();
+    *out_spec = container->get_out_spec();
+  })
+}
+
+AOTIRuntimeError AOTInductorModelCreate(
+    AOTInductorModelHandle* model_handle,
+    AOTInductorConstantMapHandle constant_map_handle){
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
+    auto constant_array = std::make_shared<std::vector<torch::aot_inductor::ConstantHandle>>();
+    auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
+
+    auto model = new torch::aot_inductor::AOTInductorModel(
+        constant_map,
+        constant_array,
+        "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only used for CPU models
+        ""
+    );
+
+    if (input_map) {
+      for (auto const& kv : *input_map) {
+        constant_map->emplace(kv.first, kv.second);
+      }
+    } else {
+      model->load_constants();
+    }
+
+    *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
+  })}
+
+AOTIRuntimeError AOTInductorModelRun(
+    AOTInductorModelHandle model_handle,
+    AtenTensorHandle* input_handles,
+    AtenTensorHandle* output_handles) {
+  auto model =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    AOTINoGradGuard guard;
+    model->run_impl(
+        input_handles,
+        output_handles,
+        (torch::aot_inductor::DeviceStreamType) nullptr,
+        nullptr);
+  })
+}
+
+AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
+        model_handle);
+    delete model;
+  })}
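+
+// Container-less path (sketch): AOTInductorModelCreate is CPU-only, as its
+// hardcoded "cpu" device string above indicates. Passing a null constant map
+// makes the model load its own constants:
+//
+//   AOTInductorModelHandle m;
+//   AOTInductorModelCreate(&m, /*constant_map_handle=*/nullptr);
+//   AOTInductorModelRun(m, inputs, outputs);
+//   AOTInductorModelDelete(m);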
+
+AOTIRuntimeError AOTInductorModelGetNumOutputs(
+    AOTInductorModelHandle model_handle,
+    size_t* ret_num_outputs) {
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
+    *ret_num_outputs = model->num_outputs();
+  })
+}
+
+AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
+    AOTInductorModelHandle model_handle,
+    AOTInductorConstantMapHandle constant_map_handle) {
+  auto model =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
+    auto input_map =
+        reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
+            constant_map_handle);
+
+    for (auto const& kv : *input_map) {
+      constant_map->emplace(kv.first, kv.second);
+    }
+    model->update_constants_map(std::move(constant_map));
+  })
+}
+
+} // extern "C"
+
+
+#define CUDA_DRIVER_CHECK(EXPR)                                  \
+do {                                                             \
+    CUresult code = EXPR;                                        \
+    const char *msg;                                             \
+    CUresult code_get_error = cuGetErrorString(code, &msg);      \
+    if (code_get_error != CUDA_SUCCESS) {                        \
+        throw std::runtime_error(                                \
+            std::string("CUDA driver error: ") +                 \
+            std::string("invalid error code!"));                 \
+    }                                                            \
+    if (code != CUDA_SUCCESS) {                                  \
+        throw std::runtime_error(                                \
+            std::string("CUDA driver error: ") +                 \
+            std::string(msg));                                   \
+    }                                                            \
+} while (0);
+
+static inline CUfunction loadKernel(
+    std::string filePath,
+    const std::string &funcName,
+    uint32_t sharedMemBytes,
+    const std::optional<std::string> &cubinDir = std::nullopt) {
+  if (cubinDir) {
+    std::filesystem::path p1{*cubinDir};
+    std::filesystem::path p2{filePath};
+    filePath = (p1 / p2.filename()).string();
+  }
+
+  CUmodule mod;
+  CUfunction func;
+  CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
+  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
+  if (sharedMemBytes > 0) {
+    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
+        func,
+        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+        sharedMemBytes
+    ))
+  }
+  return func;
+}
+
+static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
+  CUmodule mod;
+  CUfunction func;
+  CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start));
+  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
+  if (sharedMemBytes > 0) {
+    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
+        func,
+        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+        sharedMemBytes
+    ))
+  }
+  return func;
+}
+
+static inline void launchKernel(
+    CUfunction func,
+    uint32_t gridX,
+    uint32_t gridY,
+    uint32_t gridZ,
+    uint32_t numWarps,
+    uint32_t sharedMemBytes,
+    void* args[],
+    cudaStream_t stream) {
+  CUDA_DRIVER_CHECK(cuLaunchKernel(
+      func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
+  ));
+}
+CACHE_TORCH_DTYPE(float32);
+CACHE_TORCH_DEVICE(cuda);
+CACHE_TORCH_LAYOUT(strided);
+namespace torch::aot_inductor {
+namespace {
+class AOTInductorModelKernels : public AOTInductorModelKernelsBase {
+  public:
+    CUfunction triton_poi_fused_convolution_0{nullptr};
+    CUfunction triton_poi_fused_convolution_1{nullptr};
+    CUfunction triton_poi_fused_convolution_2{nullptr};
+};
+}  // namespace
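+
+// loadKernel/launchKernel above wrap the raw CUDA driver API: a .cubin file
+// (or embedded image) becomes a CUmodule, the kernel is resolved by name, its
+// dynamic shared-memory limit is raised when requested, and the launch uses a
+// 1-D block of 32*numWarps threads. For instance, numWarps=4 yields a
+// 128-thread block (illustration only):
+//
+//   CUfunction f = loadKernel("a.cubin", "my_kernel", /*sharedMemBytes=*/4352);
+//   void* args[] = {&dev_in, &dev_out};
+//   launchKernel(f, /*gridX=*/1, /*gridY=*/1, /*gridZ=*/1, /*numWarps=*/4,
+//                4352, args, stream);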
+
+
+AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
+                                   std::shared_ptr<std::vector<ConstantHandle>> constants_array,
+                                   const std::string& device_str,
+                                   std::optional<std::string> cubin_dir)
+    : AOTInductorModelBase(1,
+                           1,
+                           1,
+                           device_str,
+                           std::move(cubin_dir),
+                           true) {
+  inputs_info_[0].name = "arg2_1";
+  constants_info_[0].name = "conv_weight";
+  constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+  constants_info_[0].offset = 0;
+  constants_info_[0].data_size = 540;
+  constants_info_[0].from_folded = false;
+  constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+  constants_info_[0].shape = {5, 3, 3, 3};
+  constants_info_[0].stride = {27, 9, 3, 1};
+  constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided);
+  constants_info_[0].original_fqn = "conv.weight";
+  update_constants_map(std::move(constants_map));
+  update_constants_array(std::move(constants_array));
+  in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
+  out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])";
+  outputs_info_[0].name = "output0";
+  this->kernels_ = std::make_unique<AOTInductorModelKernels>();
+}
+
+std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
+    DeviceStreamType stream,
+    AOTIProxyExecutorHandle proxy_executor,
+    bool initialization
+) {
+
+  if (!initialization) {
+    std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
+              << "aot_inductor.use_runtime_constant_folding=False\n";
+  }
+  return {};
+}
+} // namespace torch::aot_inductor
+using namespace torch::aot_inductor;
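+
+// Sanity check on constants_info_[0] above: the conv weight has shape
+// {5, 3, 3, 3} = 135 fp32 elements, i.e. 135 * 4 = 540 bytes, which matches
+// data_size; {27, 9, 3, 1} are exactly the contiguous strides of that shape.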
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_0(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_0', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 12
+        xnumel = 64
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x2 = xindex
+        y3 = yindex
+        y0 = (yindex % 3)
+        y1 = yindex // 3
+        tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last')
+        tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (64 - 1)) / (64));
+    uint32_t grid_1 = ((ynumel + (16 - 1)) / (16));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_0 == nullptr) {
+        kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_);
+    }
+    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_2 = ynumel;
+    int var_3 = xnumel;
+    CUdeviceptr global_scratch_4 = 0;
+    void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4};
+    launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_);
+}
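+
+// The grid computation above is a ceiling division per axis: with XBLOCK=64
+// and xnumel=64, grid_0 = (64 + 63) / 64 = 1; with YBLOCK=16 and ynumel=12,
+// grid_1 = (12 + 15) / 16 = 1, so this kernel launches a single 2-D tile of
+// 4 warps (128 threads).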
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_1(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_1', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 15
+        xnumel = 9
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x2 = xindex
+        y3 = yindex
+        y0 = (yindex % 3)
+        y1 = yindex // 3
+        tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last')
+        tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (16 - 1)) / (16));
+    uint32_t grid_1 = ((ynumel + (16 - 1)) / (16));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_1 == nullptr) {
+        kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_);
+    }
+    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_6 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_7 = ynumel;
+    int var_8 = xnumel;
+    CUdeviceptr global_scratch_9 = 0;
+    void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9};
+    launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_);
+}
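+
+// Like kernel 0 for the activations, the kernel above appears to repack the
+// 5x3x3x3 weight into channels-last order for the convolution: it reads
+// in_ptr0[x2 + 9*y3] and writes out_ptr0[y0 + 3*x2 + 27*y1], moving the
+// channel index y0 innermost (matching the {27, 1, 9, 3} strides of buf1
+// in run_impl below).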
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_2(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_2', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 20
+        xnumel = 64
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x2 = xindex
+        y0 = (yindex % 5)
+        y1 = yindex // 5
+        y3 = yindex
+        tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last')
+        tmp1 = y0
+        tmp2 = tl.full([1, 1], 2, tl.int64)
+        tmp3 = tmp1 < tmp2
+        tmp4 = tl.full([1, 1], 1, tl.int64)
+        tmp5 = tmp1 < tmp4
+        tmp6 = 0.1508762389421463
+        tmp7 = -0.15852206945419312
+        tmp8 = tl.where(tmp5, tmp6, tmp7)
+        tmp9 = tl.full([1, 1], 3, tl.int64)
+        tmp10 = tmp1 < tmp9
+        tmp11 = tl.full([1, 1], 4, tl.int64)
+        tmp12 = tmp1 < tmp11
+        tmp13 = -0.047068577259778976
+        tmp14 = 0.010523972101509571
+        tmp15 = tl.where(tmp12, tmp13, tmp14)
+        tmp16 = 0.07869197428226471
+        tmp17 = tl.where(tmp10, tmp16, tmp15)
+        tmp18 = tl.where(tmp3, tmp8, tmp17)
+        tmp19 = tmp0 + tmp18
+        tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
+    uint32_t grid_1 = ((ynumel + (32 - 1)) / (32));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_2 == nullptr) {
+        kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_);
+    }
+    CUdeviceptr var_10 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_11 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_12 = ynumel;
+    int var_13 = xnumel;
+    CUdeviceptr global_scratch_14 = 0;
+    void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14};
+    launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_);
+}
+
+namespace torch::aot_inductor {
+
+void AOTInductorModel::_const_run_impl(
+    std::vector<AtenTensorHandle>& output_handles,
+    DeviceStreamType stream,
+    AOTIProxyExecutorHandle proxy_executor
+) {}
+
+AOTI_NOINLINE static void check_input_0(
+    AtenTensorHandle* input_handles
+) {
+    ConstantHandle arg2_1 = ConstantHandle(input_handles[0]);
+    int32_t arg2_1_dtype;
+    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype));
+
+    int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32();
+    if (arg2_1_expected_dtype != arg2_1_dtype) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dtype, "
+           << "expected: " << arg2_1_expected_dtype << "(at::kFloat), "
+           << "but got: " << arg2_1_dtype << "\n";
+        throw std::runtime_error(ss.str());
+    }
+    auto arg2_1_size = arg2_1.sizes();
+
+    if (4 != arg2_1_size[0]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dim value at 0, "
+           << "expected: 4, " << "but got: " << arg2_1_size[0]
+           << "\n";
+        throw std::runtime_error(ss.str());
+    }
+
+    if (3 != arg2_1_size[1]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dim value at 1, "
+           << "expected: 3, " << "but got: " << arg2_1_size[1]
+           << "\n";
+        throw std::runtime_error(ss.str());
+    }
+
+    if (8 != arg2_1_size[2]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dim value at 2, "
+           << "expected: 8, " << "but got: " << arg2_1_size[2]
+           << "\n";
+        throw std::runtime_error(ss.str());
+    }
+
+    if (8 != arg2_1_size[3]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dim value at 3, "
+           << "expected: 8, " << "but got: " << arg2_1_size[3]
+           << "\n";
+        throw std::runtime_error(ss.str());
+    }
+    auto arg2_1_stride = arg2_1.strides();
+
+    if (192 != arg2_1_stride[0]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched stride value at 
0, " + << "expected: 192, " << "but got: " << arg2_1_stride[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (64 != arg2_1_stride[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 1, " + << "expected: 64, " << "but got: " << arg2_1_stride[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_stride[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 2, " + << "expected: 8, " << "but got: " << arg2_1_stride[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (1 != arg2_1_stride[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 3, " + << "expected: 1, " << "but got: " << arg2_1_stride[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + int32_t arg2_1_device_type; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); + + int32_t arg2_1_expected_device_type = 1; + if (arg2_1_expected_device_type != arg2_1_device_type) { + std::stringstream ss; + ss << "input_handles[0]: unmatched device type, " + << "expected: " << arg2_1_expected_device_type << "1(cuda), " + << "but got: " << arg2_1_device_type << "\n"; + throw std::runtime_error(ss.str()); + } +} + +static bool _check_aoti_runtime_check_inputs_env() { + const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); + const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; + return result; +} + +AOTI_NOINLINE static void __check_inputs_outputs( + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + if (!_check_aoti_runtime_check_inputs_env()){ + return; + } + check_input_0(input_handles); +} + +void AOTInductorModel::run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) { + __check_inputs_outputs(input_handles, output_handles); + + auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); + auto arg2_1 = std::move(inputs[0]); + [[maybe_unused]] auto& conv_weight = constants_->at(0); + + if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { + AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit.");
+    AtenTensorHandle arg2_1_aligned;
+    aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned);
+    arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned));
+  }
+  inputs.clear();
+  [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
+
+  AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
+  static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L};
+  static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L};
+  AtenTensorHandle buf0_handle;
+  AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
+  RAIIAtenTensorHandle buf0(buf0_handle);
+  // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+  call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_);
+  arg2_1.reset();
+  static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L};
+  static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L};
+  AtenTensorHandle buf1_handle;
+  AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle));
+  RAIIAtenTensorHandle buf1(buf1_handle);
+  // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+  call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_);
+  // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+  AtenTensorHandle buf2_handle;
+  AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf2_handle));
+  RAIIAtenTensorHandle buf2(buf2_handle);
+  buf0.reset();
+  buf1.reset();
+  static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L};
+  static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L};
+  AtenTensorHandle buf3_handle;
+  AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle));
+  RAIIAtenTensorHandle buf3(buf3_handle);
+  // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+  call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_);
+  buf2.reset();
+  output_handles[0] = buf3.release();
+} // AOTInductorModel::run_impl
+} // namespace torch::aot_inductor
+
+
+
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o +// Link cmd +// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq/c5rhpvrttznyqa5pe725yxk3av45bswzgxcmk7tdg4j7yptcotin.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json new file mode 100644 index 00000000000..bd5d2c60334 --- /dev/null +++ b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json @@ -0,0 +1 @@ +{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 93bb438b8ee..7aa4950c790 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -61,6 +61,17 @@ fi cleanup_temp_files() { echo "Cleaning up temporary files and directories..." 
+ # Remove temporary directories + for file in *wrapper.cpp; do + if [[ -f "$file" ]]; then + basename="${file%wrapper.cpp}" + if [[ -d "$basename" ]]; then + echo "Removing directory: $basename" + rm -rf "$basename" + fi + fi + done + # Remove temporary files with specific extensions rm -f *.cubin rm -f *.pte @@ -117,27 +128,28 @@ run_inference() { case "$MODE" in "reinstall_all") echo "Mode: reinstall_all - Full reinstall and run" - install_executorch # Line 1 - export_aoti_model # Line 2 - clean_install_executorch # Line 3 - build_runtime # Lines 6-16 - run_inference # Lines 17-18 + install_executorch + export_aoti_model + clean_install_executorch + build_runtime + run_inference ;; "reinstall_aot") - echo "Mode: reinstall_aot - Reinstall AOT components only" - install_executorch # Line 1 - export_aoti_model # Line 2 - run_inference # Lines 17-18 + echo "Mode: reinstall_aot - Reinstall AOT components and run e2e" + install_executorch + export_aoti_model + run_inference ;; "reinstall_runtime") - echo "Mode: reinstall_runtime - Rebuild runtime and run" - build_runtime # Lines 6-16 - run_inference # Lines 17-18 + echo "Mode: reinstall_runtime - Rebuild runtime and run e2e" + export_aoti_model + build_runtime + run_inference ;; "inference") echo "Mode: inference - Export model and run inference only" - export_aoti_model # Line 2 - run_inference # Lines 17-18 + export_aoti_model + run_inference ;; *) echo "Error: Unknown mode '$MODE'" From a7ae3b7350f187b9dde7bb2e0239da42dc06bbc7 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 20 Aug 2025 10:13:57 -0700 Subject: [PATCH 16/50] ignore aoti temp files --- .gitignore | 7 + ...l4fepetv42wg64xygsadkkb43zczod6.kernel.cpp | 6 - ...wg64xygsadkkb43zczod6.kernel_metadata.json | 1 - ...b7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin | Bin 8968 -> 0 bytes ...3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin | Bin 13832 -> 0 bytes ...rqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin | Bin 11656 -> 0 bytes ...qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp | 965 ------------------ ...jkb2gr6xgxxo6t35umkq.wrapper_metadata.json | 1 - 8 files changed, 7 insertions(+), 973 deletions(-) delete mode 100644 c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp delete mode 100644 c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json delete mode 100644 cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin delete mode 100644 ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin delete mode 100644 cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin delete mode 100644 cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp delete mode 100644 cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json diff --git a/.gitignore b/.gitignore index b166f8c9512..78268c70d8c 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,13 @@ tokenizer.json !test_bpe_tokenizer.bin !test_tiktoken_tokenizer.model +# AOTI temporary files +*.cubin +*kernel_metadata.json +*kernel.cpp +*wrapper_metadata.json +*wrapper.cpp + # Editor temporaries *.idea *.sw[a-z] diff --git a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp deleted file mode 100644 index 02ec4e5c2af..00000000000 --- a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp +++ /dev/null @@ -1,6 +0,0 @@ -// Triton kernels are embedded as comments in /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp - -// Compile cmd -// g++ 
/home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq/c5rhpvrttznyqa5pe725yxk3av45bswzgxcmk7tdg4j7yptcotin.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin b/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
deleted file mode 100644
index 000ca4c1209b77cdaec3c8757e532677b79ccc0f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8968

diff --git a/ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin b/ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001

literal 13832
z82e4nOAOyoX6y{^tZ+HsmY+ArkUCA%NcP%Bxmu4e>vy?1`nsHcEcr<8@yw$fpL#5n znoG~0mR?Uy?payC{nh2oMijBDb|bkT2~unYmB^OEZ#cRPDd=(ySWFx>16 zhI?Hbla;fg=7Bgrh8w@(;lA(V;ihRY-1dDK$&HrAUEhb%c8hJ?^nDn*You{S`QG}n z5@aSE!u)U%!(20(lx2?eS&4vV>pCrGB?MXFB&yE=NpKWl+-3FB>Lt}n1>Vd=Y^cap4j<$b2f1?5_=xD=MoU+d#lb#?8(W~F8+w& zav)5;KcVT^n|S5Kvz+86hxGemoD;(-&J>KxqEo0DJJXVrATUf_EE4*hW~9+g_cNK8 z_{@x*o0sFu+z!ZN$*jg>6Yj@+ES(ZO--G(%GkJ|iWcuQ>8I6bCeewB(#xq2Hj6XK7 z?UT3Ee)we47>MPvShVAf&Dv6Ih}O3M#F6)Efor#23wfypVpCluotIkB*~`;4pD?_r`;JlKYuKhKsI+^6yZr7NC{gqW%x4u%Q5z* z`4|#t4%$YeK16#5Qy#AS$cbHn07^(Eb`p_ww=OxK_whl?u**&F501$P+J6i!*Gl~R z;}!m_qAYI{L*@qp#l=Otyn#Quc3)R|Naa@+y@cJ2pVhqAfVJWhZ5X;VbVY?`Bl6Yp z3s4uD&|_*2P5Q>Uko@K(w0~5L3aJB1-8!t?xuy2thbR-?T)tEJ<^dJypcf(Fx1Pc` z&wE=cbYDcphLoz~Unn{%y5wUb2(77@3Jpip2*%MM#@P_1$;EG%9Q@oo|I<7_#uF)$ zi{A?UUsVt80Dp7`_zRZ;EfGMaN5<6DW=K38`&Oq0~F$ zYTDi^JO*Z zL_>!-|4`dE?clq(M?W68#~B&%zPG}jUv$FJOgf!5{)9#y6{S7(=e8~N?l|NPj!1-l z?#hdXKSDmDK|P;zA|sMN4?h&9zbzMUhnZ^C`WW<(2l;~wls|bnB=YUSA$|oF{PzSO z)_7avo#7FwQcr$3On(QBe8`Jv`iq8sbcFJhp`URgVcH)Q`X}Mf*a-a-ytZd}0n_>< z5)J*6h!0kS$A4{VS46)0PsK;ot3fqk4^zEr{rF|cf7Y3h{1L`{!=8}^4C|BHe#}Sp zd?Q4B))^VT4|usSf5nM()yq&hTtg|>w5+3f%Eoy!!PAvs$8<(1_l_vkQcv!fzJ4aI^uWj*T>N*aWUUVEUdSCNBs2` zbwQP=Y{#TC&u=)`A3lNsW6tHz7W4@d!n5m%x9dzd;R_lIOaDX zjQ1O}zz@s%llp}}*M=qsNw4iAe`LSMPyGDEuvf3N-`?Zb7o2f|Qyx9~&$avawDYx! z`PA;9Cx0=3{YMY##ofBA;R^+CJpFbz72(~*U_&KpNY$ZQg5vfUmj=u-{V+euYY1bVGr(QyjxMwzeV{`i~6KL0YB~F zcNeZcthZ@rQ1T`Ek*`U|vbev9KG37z4t|W(%N+RZWlQ9NBtmO9}VhL0_QJ> z4%_2F@7GK3j1L^P$AYFq$o@wDQ8r-m|0E#t#$#qK@+O1cubhzo!}dhb`_&fWL;kUZ zpMFnJ`ZYkXXYjDSJGh%;Uf&xR`4bBm8+&E{ehG8#-^*iy-@^qf_GtRu&vnix>t+u7 z4tfVUBmwg&XB5A;a`)Fm$p647^D7tdFK$LC8Na8(zb}rdNqa!*&E4;&oDf&CQmikX zkAWzr2G9Lg`cXZK{Q~}A&eNBwM|FRlb_NC(Fb})?sWTed_SMIXqw8Dx!<)0HmtC%X za(?(5Y>Vmz2w*%=Pt(pWsh?d&-b+&|L1L|P$^#wC`A1@mIKQI9D%Q@L4FwZx% z{XO~*{DLNk8M_}#gy-{o`je**eHiQlCoKTE2c-`RoxDWl)d z&ywio|LYz2SNrhO&Tjq`U=V>r7Q>DMPMVTZVQGEBVvd;;2m&yA1!bou@ZKN71C diff --git a/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin b/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin deleted file mode 100644 index bbc7d301593f72433c5b7626638d0e6f0f4a0813..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11656 zcmeHNUu;`f89(-Qnm59nx<+SDBYiKg0b-XzH{#N z^|iB=G9e*$q?~)c|9{SRzI%Od9=q$vr^BI8s#nMn7QYtjYE75g0SWx#R(ZXD=zcLL zW<(F>Q4tbCuP@gdb|XKJm3wWjNSLp=`f{b<2)(kVmnw@@p*M<7LvPrnMzxZwRZF?W z)q1gzTc}pns^!&23D~5^eJyQZje z&aV`OE`f5Xf;HI+!1*>rWi8jD=j!=-T?%c3&m*Uu0v@i`aB>Z&;kT=t;#k_;#O5sI z%jG3IUt8wn*X(Me+Gy6`Hqok;dLv(HBtIz2vPiCb{$DC4w~FpvDX!?%#l?EDA?sGv zqf1AX<#N~*7ryuCT}N}0KbOqq%cXq1d%ixZO?iagr#7#li+o6>YhsTy5$o8gr-8ZI zhmEN9pKwiF9Pv?nOSH}4$OESdSGq;5FutdSP^<4Ven$uXMaH|`yhR=llm3AgLahwo zzd zY-Ct*X+VWV4NxfbkLUpA5HKP%-B_tjH>&o+a;Z{SUBD{0l&@E-m4{0CX$SlZPR+Dy z4?nWHP|Q>+=5lGZR4ixf*+=tBYpKOXvz{tEWLD?v)_it3W3A2G4Xdu#nj&=d>)-zE zjz43^`VqPuk&reLdPD0E4~Mj7I2pRCt%Y9{p*LQ;^4Gt;@!A`&N=hhvF%$wN5z+>T zb1i%cuv)f|pfK(c@Lbix#NH&ytE^h<4gXOP25i0IO%V!T>Os&y>@;srYKD|rdPGyw0xMR8)#ahX+KRNM6T_n)iS0l z8cXjac$Fp{)2rGySn+!Tz$2pizRe+MUnGWUn(n13Nz)f-GHCidO(v$-U)M^sK1LH~n$r7v_{lJ~h{{G^ zE82BD#Uy6*EFO09_ENEtsITN^lX;=%3kBQ<>bZL9(PF|%6FZ)UigqP|TOU2fBy`*n zmMRIuO6c=5iKTLNK3~q&agTZ6!Gol{R4>h!iwV70Y1qxg`!~S-La*h~L=*a|l}_lj zhMmx*M$gO#XvndAE_vX?(q#BFrOa!yZeE8>$|v)m#hwds-FZ;q<`Gl>lzk|hZlPcvsKOyL3!IodlEV{kWg$0TTq;$PcTmZc3v!?I zdBtmF4kR9U|Gjm023651Ze6Rm0devOU6ou}sjlg(G6W-gAThPj$d?Lw6K>a+%eo5A zm;uEmdKv`3$SerRlI|P?zwK%=lgtky_~#dk9^UZq2NJ1+qH4PqOp_=k=radJzEbe; zDObYCz_d>w?Fzs#$oMG8Bvi4&00l`RZl=S$zyn-OFE^BXHG|zVq1iAIPk<;rgz|1g 
zcGHA@BKk9Awvvh*`IBl&KR~wv1WKxHV1_PVE>fIT6L>2LiB`lZ_FIwLa*^`Vij1^V zq{oupPT|awqyfi#RyePOH?gZeJZF{gDfsKdQ{*<`({R^^=eQZ)ZkNNBrPJg+VS(TDRVrL#H6^gEGfqMz7fUhj6`W$;vpl*i-X=D#}5#`*b z!4VKuGU|fb;lttfWJ-Xn1)O7fZd?-CbV=$+xg-j{X2=;0hEAJHqVQc3QsXC)3oZ#E z_(@b!E(tFBNmNiSX||n2AuE!J!{V2sVsc4=xC}ZME{*z*Y&^&`a*LRf4slX>$8MT1f}SwbtT6 z`q=&)JWSwzC7(j{BSxE9@?s&suw2aH>As<_rMdB+Kmf9!5ui=GW2Y0RaEGJ7%*5@9 zWa2?W7O08)gGTnRBWOA`!Jx6ltf>`{tGk84h6Z@Bk8aTB^#skeO(kS=_nGm6&iI0+ zb1GKy1-)E#&s&;hs0kP|vW*f`fQMCbpXP*uBf!81FNJi=j8oD3&OGZ)LrI$Ix<$xv zcP2uX4wsL>TS$c2c2yZ}0z@#l+I=F@GEvev{q6o`WJ7{E&%Z9+qvfV@Ml~MjM8O!G zsDgp>M-8;CXUW!_Miv5%hx6i!lEx1F+_ zhzyJeW!WFkE{;j3EN7FpA}Grad9t8(%5t4%U_B_y4tcWZsGYLB1GJYxS$4>ir2?it zs2)KxY?2;+GD*~*?*^7(h4i>YRpyi69VhB6JumqwyrCpv(Bm2%Xx9pGz8_$1R{!MY@Lij z!4C`WbYLaFKtA-J=tm%t={|nzoPfQ^h+LEDNZhy1M);d!WcMJE@D_ZJ5O3ZrTv!uh zLTrwBa9H4ls1FX&i)0@^D*>z>!#nT2TOx8}pSXxu<(v463=gs`ewhqLZ6YBe1ASsA z*27_}t%&+~AN-5kg88@J1^iM>3_7Bh^ws7!HG<<#Sfh+Y-y{3v&xjy;Z$nlNg!-(Mc zVUOy;`)q{$oN!`WRFh%Kx6H?24KV9tcsNG&BlS;%4A;gOT0 z?Eco~g&6Tep5alsU#O4Kaj!lko%1aeJ?_Lcs^3AX5BPiLM0iV_1A@x55T*Kv^Zp)3 zePDlYb;jdgy6?XGRDFqGJ&I_~D1PMgiMrUu@srTy3j~k#Ar)LS8+VC)US~yqUyF&e zP4$hBitch0@F+g~VNLLUqQkp61uqkf>OQXE{lpJz>wXyZaM2H={w^sP{UL7mMd^gg z{l5qK>K%U;aGdgi{YSigbAW4c1jh9|-beXU^^g7NkJ9-XqVw@mYT8)SViUf}%2 zoq-{8ozD-~%V0Fo;*UUo{~X|29Ono5nsWNaDPLjUk7?-N8QtUZ|AzKI9^LQ4um|~p zIl#3z8VlT3jBdQ_aBuXVL3zNWSBWnFWH}S#$6NG> z?z1AncRQ zIiD;oU3r&}+oVD*^JK z>VMtGL(9kTM=YNtz&F;X6yPQtAnjkf6(ODmj^s(de|YkKB-Z4se{29jxKH?zSiAB` z-{J07Km2R~e)s>118T$`WzfC4`6D!ic0F=b2VQ=I23-T{>VjmCe0Adrig~cgw_lD> N+sV24aj&l2zW{T-tULe! diff --git a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp deleted file mode 100644 index e91fc32554a..00000000000 --- a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp +++ /dev/null @@ -1,965 +0,0 @@ - -#include -// Definition of AOTI runtime interface functions - -#include -#include - -#include -#include - -#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ - try { \ - __VA_ARGS__ \ - } catch (const std::exception& e) { \ - std::cerr << "Error: " << e.what() << '\n'; \ - return AOTI_RUNTIME_FAILURE; \ - } catch (...) { \ - std::cerr << "Unknown exception occurred.\n"; \ - return AOTI_RUNTIME_FAILURE; \ - } \ - return AOTI_RUNTIME_SUCCESS; - -#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ - do { \ - AOTI_RUNTIME_CHECK( \ - actual_size == expected_size, \ - "expected " + std::string(name) + " vector size to be " + \ - std::to_string(expected_size) + ", but got " + \ - std::to_string(actual_size)); \ - } while (0) - -// AOTInductor uses at::addmm_out, which doesn't supports -// arguments that requires gradient. For this reason, we -// enforce no_grad context for run APIs. -// -// A RAII, thread local (!) guard that enables or disables grad mode upon -// construction, and sets it back to the original value upon destruction. 
-struct AOTINoGradGuard { - AOTINoGradGuard() { - aoti_torch_grad_mode_set_enabled(false); - } - AOTINoGradGuard(const AOTINoGradGuard&) = delete; - AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; - ~AOTINoGradGuard() { - aoti_torch_grad_mode_set_enabled(prev_mode); - } - AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; - AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; - bool prev_mode{aoti_torch_grad_mode_is_enabled()}; -}; - -extern "C" { - -AOTIRuntimeError AOTInductorModelContainerCreate( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - bool is_cpu, - const char* cubin_dir) { - return AOTInductorModelContainerCreateWithDevice( - container_handle, - num_models, - is_cpu ? "cpu" : "cuda", - cubin_dir); -} - -AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir) { - if (num_models == 0) { - std::cerr << "Error: num_models must be positive, but got 0\n"; - return AOTI_RUNTIME_FAILURE; - } - CONVERT_EXCEPTION_TO_ERROR_CODE({ - std::optional cubin_dir_opt; - if (cubin_dir != nullptr) { - cubin_dir_opt.emplace(cubin_dir); - } - auto* container = new torch::aot_inductor::AOTInductorModelContainer( - num_models, std::string(device_str), cubin_dir_opt); - *container_handle = - reinterpret_cast(container); - }) -} - -AOTIRuntimeError AOTInductorModelContainerDelete( - AOTInductorModelContainerHandle container_handle) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto* container = - reinterpret_cast( - container_handle); - delete container; - }); -} - -AOTIRuntimeError AOTInductorModelContainerRun( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_single_threaded( - input_handles, output_handles, stream, proxy_executor_handle); - 
}) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumConstants( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *num_constants = container->num_constants(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantName( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** name) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *name = container->constant_name(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** original_fqn) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *original_fqn = container->constant_original_fqn(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( - AOTInductorModelContainerHandle container_handle, - size_t idx, - bool* from_folded) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantType( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* type) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* dtype) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *dtype = container->constant_dtype(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( - AOTInductorModelContainerHandle container_handle, - size_t idx, - size_t* data_size) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *data_size = container->constant_data_size(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive) { - auto* container = - reinterpret_cast( - container_handle); - auto constants_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { const auto ret = container->extract_constants_map(use_inactive); - for (const auto& pair: ret) { - constants_map->emplace(pair.first, pair.second); - } - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update, /* user_managed = */ true); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - 
-    container->update_constant_buffer(
-        *input_map, use_inactive, validate_full_update);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle) {
-  return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
-          constant_map_handle,
-          /*use_inactive*/ true,
-          /*validate_full_update*/ true);
-}
-
-AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
-    AOTInductorModelContainerHandle container_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->free_inactive_constant_buffer();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
-    AOTInductorModelContainerHandle container_handle,
-    bool use_inactive,
-    AOTInductorStreamHandle stream_handle,
-    AOTIProxyExecutorHandle proxy_executor_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto stream =
-      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    AOTINoGradGuard guard;
-    container->run_const_fold(use_inactive, stream, proxy_executor_handle);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
-    AOTInductorModelContainerHandle container_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->swap_constant_buffer();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
-    AOTInductorModelContainerHandle container_handle,
-    size_t* ret_num_inputs) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_num_inputs = container->num_inputs(); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetInputName(
-    AOTInductorModelContainerHandle container_handle,
-    size_t input_idx,
-    const char** ret_input_names) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_input_names = container->input_name(input_idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
-    AOTInductorModelContainerHandle container_handle,
-    size_t* ret_num_outputs) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_num_outputs = container->num_outputs(); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetOutputName(
-    AOTInductorModelContainerHandle container_handle,
-    size_t output_idx,
-    const char** ret_output_names) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_output_names = container->output_name(output_idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
-    AOTInductorModelContainerHandle container_handle,
-    const char** in_spec,
-    const char** out_spec) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    *in_spec = container->get_in_spec();
-    *out_spec = container->get_out_spec();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelCreate(
-    AOTInductorModelHandle* model_handle,
-    AOTInductorConstantMapHandle constant_map_handle){
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto constant_map = std::make_shared<ConstantMap>();
-    auto constant_array = std::make_shared<std::vector<ConstantHandle>>();
-    auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-
-    auto model = new torch::aot_inductor::AOTInductorModel(
-        constant_map,
-        constant_array,
-        "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models
-        ""
-    );
-
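// If the caller handed in a constant map, AOTInductorModelCreate aliases
// those tensors; with a null map the model falls back to load_constants(),
// i.e. the weights packaged with the model itself (see the branch below).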
-  if (input_map) {
-    for (auto const& kv : *input_map) {
-      constant_map->emplace(kv.first, kv.second);
-    }
-  } else {
-    model->load_constants();
-  }
-
-  *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
-  })}
-
-AOTIRuntimeError AOTInductorModelRun(
-    AOTInductorModelHandle model_handle,
-    AtenTensorHandle* input_handles,
-    AtenTensorHandle* output_handles) {
-  auto model =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    AOTINoGradGuard guard;
-    model->run_impl(
-        input_handles,
-        output_handles,
-        (torch::aot_inductor::DeviceStreamType) nullptr,
-        nullptr);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
-        model_handle);
-    delete model;
-  })}
-
-AOTIRuntimeError AOTInductorModelGetNumOutputs(
-    AOTInductorModelHandle model_handle,
-    size_t* ret_num_outputs) {
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-      auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-      *ret_num_outputs = model->num_outputs();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
-    AOTInductorModelHandle model_handle,
-    AOTInductorConstantMapHandle constant_map_handle) {
-  auto model =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto constant_map = std::make_shared<ConstantMap>();
-    auto input_map =
-        reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
-            constant_map_handle);
-
-    for (auto const& kv : *input_map) {
-      constant_map->emplace(kv.first, kv.second);
-    }
-    model->update_constants_map(std::move(constant_map));
-  })
-}
-
-} // extern "C"
-
-
-#define CUDA_DRIVER_CHECK(EXPR) \
-do { \
-  CUresult code = EXPR; \
-  const char *msg; \
-  CUresult code_get_error = cuGetErrorString(code, &msg); \
-  if (code_get_error != CUDA_SUCCESS) { \
-    throw std::runtime_error( \
-        std::string("CUDA driver error: ") + \
-        std::string("invalid error code!")); \
-  } \
-  if (code != CUDA_SUCCESS) { \
-    throw std::runtime_error( \
-        std::string("CUDA driver error: ") + \
-        std::string(msg)); \
-  } \
-} while (0);
-
-static inline CUfunction loadKernel(
-    std::string filePath,
-    const std::string &funcName,
-    uint32_t sharedMemBytes,
-    const std::optional<std::string> &cubinDir = std::nullopt) {
-  if (cubinDir) {
-    std::filesystem::path p1{*cubinDir};
-    std::filesystem::path p2{filePath};
-    filePath = (p1 / p2.filename()).string();
-  }
-
-  CUmodule mod;
-  CUfunction func;
-  CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
-  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
-  if (sharedMemBytes > 0) {
-    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        sharedMemBytes
-    ))
-  }
-  return func;
-}
-
-static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
-  CUmodule mod;
-  CUfunction func;
-  CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start));
-  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
-  if (sharedMemBytes > 0) {
-    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        sharedMemBytes
-    ))
-  }
-  return func;
-}
-
-static inline void launchKernel(
-    CUfunction func,
-    uint32_t gridX,
-    uint32_t gridY,
-    uint32_t gridZ,
-    uint32_t numWarps,
-    uint32_t sharedMemBytes,
-    void* args[],
-    cudaStream_t stream) {
-  CUDA_DRIVER_CHECK(cuLaunchKernel(
-      func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
-  ));
-}
-CACHE_TORCH_DTYPE(float32);
-CACHE_TORCH_DEVICE(cuda);
-CACHE_TORCH_LAYOUT(strided);
-namespace torch::aot_inductor {
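The generated launchers below follow a fixed pattern: load each compiled Triton cubin once through loadKernel, cache the CUfunction, compute a ceil-divided grid from the problem size and tile size, and hand 32*numWarps threads per block to launchKernel. A minimal sketch of that grid arithmetic, assuming the tile sizes quoted in the metadata of triton_poi_fused_convolution_0 below (YBLOCK=16, XBLOCK=64, 4 warps); ceil_div is a hypothetical helper:

#include <cstdint>
#include <cstdio>

static inline uint32_t ceil_div(uint32_t n, uint32_t tile) {
  return (n + tile - 1) / tile;  // same rounding as ((xnumel + (64 - 1)) / (64))
}

int main() {
  // The first conv kernel below runs over ynumel = 12, xnumel = 64:
  std::printf("grid = (%u, %u, 1), threads/block = %u\n",
              ceil_div(64, 64), ceil_div(12, 16), 4u * 32u);  // (1, 1, 1), 128
  return 0;
}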
-namespace { -class AOTInductorModelKernels : public AOTInductorModelKernelsBase { - public: - CUfunction triton_poi_fused_convolution_0{nullptr}; - CUfunction triton_poi_fused_convolution_1{nullptr}; - CUfunction triton_poi_fused_convolution_2{nullptr}; -}; -} // namespace - - - -AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) - : AOTInductorModelBase(1, - 1, - 1, - device_str, - std::move(cubin_dir), - true) { - inputs_info_[0].name = "arg2_1"; - constants_info_[0].name = "conv_weight"; - constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[0].offset = 0; - constants_info_[0].data_size = 540; - constants_info_[0].from_folded = false; - constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[0].shape = {5, 3, 3, 3}; - constants_info_[0].stride = {27, 9, 3, 1}; - constants_info_[0].layout = static_cast(cached_torch_layout_strided); - constants_info_[0].original_fqn = "conv.weight"; - update_constants_map(std::move(constants_map)); - update_constants_array(std::move(constants_array)); - in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])"; - out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])"; - outputs_info_[0].name = "output0"; - this->kernels_ = std::make_unique(); -} - -std::unordered_map AOTInductorModel::const_run_impl( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization -) { - - if (!initialization) { - std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " - << "aot_inductor.use_runtime_constant_folding=False\n"; - } - return {}; -} -} // namespace torch::aot_inductor -using namespace torch::aot_inductor; - -template -static inline void call_triton_poi_fused_convolution_0( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_0', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 
'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 12 - xnumel = 64 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = (yindex % 3) - y1 = yindex // 3 - tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last') - tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (64 - 1)) / (64)); - uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_0 == nullptr) { - kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_); - } - CUdeviceptr var_0 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_1 = reinterpret_cast(out_ptr0.data_ptr()); - int var_2 = ynumel; - int var_3 = xnumel; - CUdeviceptr global_scratch_4 = 0; - void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4}; - launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused_convolution_1( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_1', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 
'4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 15 - xnumel = 9 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = (yindex % 3) - y1 = yindex // 3 - tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last') - tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (16 - 1)) / (16)); - uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_1 == nullptr) { - kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_); - } - CUdeviceptr var_5 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_6 = reinterpret_cast(out_ptr0.data_ptr()); - int var_7 = ynumel; - int var_8 = xnumel; - CUdeviceptr global_scratch_9 = 0; - void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9}; - launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused_convolution_2( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_2', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 
'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 20 - xnumel = 64 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y0 = (yindex % 5) - y1 = yindex // 5 - y3 = yindex - tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last') - tmp1 = y0 - tmp2 = tl.full([1, 1], 2, tl.int64) - tmp3 = tmp1 < tmp2 - tmp4 = tl.full([1, 1], 1, tl.int64) - tmp5 = tmp1 < tmp4 - tmp6 = 0.1508762389421463 - tmp7 = -0.15852206945419312 - tmp8 = tl.where(tmp5, tmp6, tmp7) - tmp9 = tl.full([1, 1], 3, tl.int64) - tmp10 = tmp1 < tmp9 - tmp11 = tl.full([1, 1], 4, tl.int64) - tmp12 = tmp1 < tmp11 - tmp13 = -0.047068577259778976 - tmp14 = 0.010523972101509571 - tmp15 = tl.where(tmp12, tmp13, tmp14) - tmp16 = 0.07869197428226471 - tmp17 = tl.where(tmp10, tmp16, tmp15) - tmp18 = tl.where(tmp3, tmp8, tmp17) - tmp19 = tmp0 + tmp18 - tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); - uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_2 == nullptr) { - kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_); - } - CUdeviceptr var_10 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_11 = reinterpret_cast(out_ptr0.data_ptr()); - int var_12 = ynumel; - int var_13 = xnumel; - CUdeviceptr global_scratch_14 = 0; - void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14}; - launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_); -} - -namespace torch::aot_inductor { - -void AOTInductorModel::_const_run_impl( - std::vector& output_handles, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor -) {} - -AOTI_NOINLINE static void check_input_0( - AtenTensorHandle* input_handles -) { - ConstantHandle arg2_1 = ConstantHandle(input_handles[0]); - int32_t arg2_1_dtype; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype)); - - int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32(); - if (arg2_1_expected_dtype != arg2_1_dtype) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dtype, " - << "expected: " << arg2_1_expected_dtype << "(at::kFloat), " - << "but got: " << arg2_1_dtype << "\n"; - throw std::runtime_error(ss.str()); - } - auto arg2_1_size = arg2_1.sizes(); - - if (4 != arg2_1_size[0]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 0, " - << "expected: 4, " << "but got: " << arg2_1_size[0] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (3 != arg2_1_size[1]) { - 
std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 1, " - << "expected: 3, " << "but got: " << arg2_1_size[1] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_size[2]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 2, " - << "expected: 8, " << "but got: " << arg2_1_size[2] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_size[3]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 3, " - << "expected: 8, " << "but got: " << arg2_1_size[3] - << "\n"; - throw std::runtime_error(ss.str()); - } - auto arg2_1_stride = arg2_1.strides(); - - if (192 != arg2_1_stride[0]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 0, " - << "expected: 192, " << "but got: " << arg2_1_stride[0] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (64 != arg2_1_stride[1]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 1, " - << "expected: 64, " << "but got: " << arg2_1_stride[1] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_stride[2]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 2, " - << "expected: 8, " << "but got: " << arg2_1_stride[2] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (1 != arg2_1_stride[3]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 3, " - << "expected: 1, " << "but got: " << arg2_1_stride[3] - << "\n"; - throw std::runtime_error(ss.str()); - } - int32_t arg2_1_device_type; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); - - int32_t arg2_1_expected_device_type = 1; - if (arg2_1_expected_device_type != arg2_1_device_type) { - std::stringstream ss; - ss << "input_handles[0]: unmatched device type, " - << "expected: " << arg2_1_expected_device_type << "1(cuda), " - << "but got: " << arg2_1_device_type << "\n"; - throw std::runtime_error(ss.str()); - } -} - -static bool _check_aoti_runtime_check_inputs_env() { - const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); - const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; - return result; -} - -AOTI_NOINLINE static void __check_inputs_outputs( - AtenTensorHandle* input_handles, - AtenTensorHandle* output_handles) { - if (!_check_aoti_runtime_check_inputs_env()){ - return; - } - check_input_0(input_handles); -} - -void AOTInductorModel::run_impl( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor -) { - __check_inputs_outputs(input_handles, output_handles); - - auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); - auto arg2_1 = std::move(inputs[0]); - [[maybe_unused]] auto& conv_weight = constants_->at(0); - - if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { - AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit."); - AtenTensorHandle arg2_1_aligned; - aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned); - arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned)); - } - inputs.clear(); - [[maybe_unused]] auto& kernels = static_cast(*this->kernels_.get()); - - AOTICudaStreamGuard stream_guard(stream, this->device_idx_); - static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L}; - static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L}; - AtenTensorHandle buf0_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle)); - RAIIAtenTensorHandle buf0(buf0_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); - arg2_1.reset(); - static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L}; - static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L}; - AtenTensorHandle buf1_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle)); - RAIIAtenTensorHandle buf1(buf1_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - AtenTensorHandle buf2_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle)); - RAIIAtenTensorHandle buf2(buf2_handle); - buf0.reset(); - buf1.reset(); - static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L}; - static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L}; - AtenTensorHandle buf3_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle)); - RAIIAtenTensorHandle buf3(buf3_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf2.reset(); - output_handles[0] = buf3.release(); -} // AOTInductorModel::run_impl -} // namespace torch::aot_inductor - - - - -// Compile cmd -// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o -// Link cmd -// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq/c5rhpvrttznyqa5pe725yxk3av45bswzgxcmk7tdg4j7yptcotin.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json deleted file mode 100644 index bd5d2c60334..00000000000 --- a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json +++ /dev/null @@ -1 +0,0 @@ -{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file From 3ec8a38160213e37f0652296a938f3ecdad02773 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 20 Aug 2025 12:16:59 -0700 Subject: [PATCH 17/50] add get_device_type and get_device_index shim layer --- backends/aoti/aoti_backend.py | 2 ++ backends/aoti/runtime/shims/memory.cpp | 12 +++++++---- .../aoti/runtime/shims/tensor_attribute.cpp | 20 +++++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index 5aa547d789c..a60806815ef 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -45,6 +45,8 @@ def preprocess( options: dict[str, typing.Any] = { "aot_inductor.package_constants_in_so": True, "aot_inductor.output_path": output_path, + "aot_inductor.debug_compile": True, + "aot_inductor.repro_level": 3 } so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index ab5d35efd9f..83030647691 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -179,14 +179,18 @@ AOTITorchError aoti_torch_empty_strided( // Store the tensor tensors.insert(tensor); + *ret_new_tensor = tensor.get(); + is_tensor_own_memory[tensor.get()] = true; - std::cout << 
"sizes.data(): " << sizes.data() + std::cout << "Finished. Created tensor " << tensor.get() << " with sizes " + << std::endl + << "sizes.data(): " << sizes.data() << ", tensor->sizes().data(): " << tensor->sizes().data() << std::endl; std::cout << "Size[0] of tensor " << tensor.get() << " is " - << tensor->sizes()[0] << std::endl; - *ret_new_tensor = tensor.get(); - is_tensor_own_memory[tensor.get()] = true; + << tensor->sizes()[0] << std::endl + << std::endl; + return Error::Ok; } diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index b5333f50ea9..a75af9ae128 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -103,6 +103,26 @@ AOTITorchError aoti_torch_get_storage_size( throw std::runtime_error("Cannot get storage size on ETensor"); } +AOTITorchError aoti_torch_get_device_type( + AOTITensorHandle tensor, + int32_t* ret_device_type) { + // Let's assume all tensors AOTI using are on CUDA device + *ret_device_type = aoti_torch_device_type_cuda(); // CUDA device type + std::cout << "getting device_type from tensor " << tensor << " = " + << *ret_device_type << std::endl; + return Error::Ok; +} + +AOTITorchError aoti_torch_get_device_index( + AOTITensorHandle tensor, + int32_t* ret_device_index) { + // Let's assume all tensors AOTI using are on CUDA:0 + *ret_device_index = 0; + std::cout << "getting device_index from tensor " << tensor << " = " + << *ret_device_index << std::endl; + return Error::Ok; +} + int32_t aoti_torch_device_type_cpu() { // Let's say cpu is 0 for ET as well return 0; From 28d1294ed88ca85884a5b46c694c2f693a0e4817 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 10:57:34 -0700 Subject: [PATCH 18/50] e2e runnable on conv2d --- backends/aoti/aoti_backend.py | 7 +- backends/aoti/runtime/shims/cuda_ops.cpp | 282 ++++++++++++++++++ backends/aoti/runtime/shims/cuda_ops.h | 54 ++++ backends/aoti/runtime/shims/memory.cpp | 3 +- .../aoti/runtime/shims/tensor_attribute.cpp | 2 +- export_aoti.py | 5 + 6 files changed, 350 insertions(+), 3 deletions(-) create mode 100644 backends/aoti/runtime/shims/cuda_ops.cpp create mode 100644 backends/aoti/runtime/shims/cuda_ops.h diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index a60806815ef..a0793bdb80f 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -46,8 +46,13 @@ def preprocess( "aot_inductor.package_constants_in_so": True, "aot_inductor.output_path": output_path, "aot_inductor.debug_compile": True, - "aot_inductor.repro_level": 3 + "aot_inductor.repro_level": 3, + "aot_inductor.debug_intermediate_value_printer": "3", + "max_autotune": True, + "max_autotune_gemm_backends": "TRITON", + "max_autotune_conv_backends": "TRITON", } + so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] assert so_path == output_path, f"Expected {output_path} but got {so_path}" diff --git a/backends/aoti/runtime/shims/cuda_ops.cpp b/backends/aoti/runtime/shims/cuda_ops.cpp new file mode 100644 index 00000000000..d89f1d36e7e --- /dev/null +++ b/backends/aoti/runtime/shims/cuda_ops.cpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "cuda_ops.h" +#include "memory.h" +#include "tensor_attribute.h" +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +// Global cuDNN handle +static cudnnHandle_t cudnn_handle = nullptr; + +// Initialize cuDNN handle +static void init_cudnn() { + if (cudnn_handle == nullptr) { + cudnnCreate(&cudnn_handle); + } +} + +extern "C" { + +AOTITorchError aoti_torch_cuda_addmm_out( + AtenTensorHandle out, + AtenTensorHandle self, + AtenTensorHandle mat1, + AtenTensorHandle mat2, + double beta, + double alpha) { + + std::cout << "aoti_torch_cuda_addmm_out called with beta=" << beta << ", alpha=" << alpha << std::endl; + + // Get tensor dimensions + auto mat1_sizes = mat1->sizes(); + auto mat2_sizes = mat2->sizes(); + auto self_sizes = self->sizes(); + auto out_sizes = out->sizes(); + + // mat1: [M, K], mat2: [K, N], result: [M, N] + int64_t M = mat1_sizes[0]; + int64_t K = mat1_sizes[1]; + int64_t N = mat2_sizes[1]; + + std::cout << "ADDMM: mat1[" << M << "," << K << "] @ mat2[" << K << "," << N << "] -> out[" << M << "," << N << "]" << std::endl; + + // Use cuBLAS for matrix multiplication + cublasHandle_t cublas_handle; + cublasStatus_t cublas_status = cublasCreate(&cublas_handle); + if (cublas_status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "Failed to create cuBLAS handle" << std::endl; + return Error::Internal; + } + + // Set cuBLAS to use tensor op math for better performance + cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); + + const float f_alpha = static_cast(alpha); + const float f_beta = static_cast(beta); + + // Perform: out = beta * self + alpha * (mat1 @ mat2) + // First: out = beta * self (copy self to out and scale) + if (beta != 0.0) { + Error copy_err = aoti_torch_copy_(out, self, 0); + if (copy_err != Error::Ok) { + cublasDestroy(cublas_handle); + return copy_err; + } + + // Scale by beta if not 1.0 + if (beta != 1.0) { + cublas_status = cublasSscal(cublas_handle, M * N, &f_beta, + static_cast(out->mutable_data_ptr()), 1); + if (cublas_status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "cuBLAS scale failed" << std::endl; + cublasDestroy(cublas_handle); + return Error::Internal; + } + } + } else { + // Zero out the output tensor + cudaMemset(out->mutable_data_ptr(), 0, M * N * sizeof(float)); + } + + // Then: out += alpha * (mat1 @ mat2) + // cuBLAS uses column-major, so we compute: C = alpha * A^T * B^T + beta * C + // Which gives us: out = alpha * mat1 @ mat2 + beta * out + const float gemm_beta = 1.0f; // Since we already handled the beta scaling above + + cublas_status = cublasSgemm( + cublas_handle, + CUBLAS_OP_N, CUBLAS_OP_N, // No transpose for column-major interpretation + N, M, K, // Dimensions swapped for column-major + &f_alpha, // alpha + static_cast(mat2->data_ptr()), N, // B matrix (mat2) + static_cast(mat1->data_ptr()), K, // A matrix (mat1) + &gemm_beta, // beta (1.0 since we pre-scaled) + static_cast(out->mutable_data_ptr()), N // C matrix (out) + ); + + if (cublas_status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "cuBLAS GEMM failed: " << cublas_status << std::endl; + cublasDestroy(cublas_handle); + return Error::Internal; + } + + cublasDestroy(cublas_handle); + + std::cout << "aoti_torch_cuda_addmm_out completed successfully" << std::endl; + return Error::Ok; +} + +AOTITorchError aoti_torch_cuda_convolution( + AtenTensorHandle input, + AtenTensorHandle weight, + AtenTensorHandle* bias, + const int64_t* stride, + 
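    // Note: the parameter list below mirrors the aten::convolution schema:
    // stride/padding/dilation arrive as (pointer, length) pairs, transposed
    // selects transposed convolution, output_padding applies only to the
    // transposed case, and groups is the group count.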
int64_t stride_len_, + const int64_t* padding, + int64_t padding_len_, + const int64_t* dilation, + int64_t dilation_len_, + int32_t transposed, + const int64_t* output_padding, + int64_t output_padding_len_, + int64_t groups, + AtenTensorHandle* ret0) { + + std::cout << "aoti_torch_cuda_convolution called" << std::endl; + + init_cudnn(); + + // Get input dimensions + auto input_sizes = input->sizes(); + auto weight_sizes = weight->sizes(); + + int batch_size = input_sizes[0]; + int input_channels = input_sizes[1]; + int input_height = input_sizes[2]; + int input_width = input_sizes[3]; + + int output_channels = weight_sizes[0]; + int kernel_height = weight_sizes[2]; + int kernel_width = weight_sizes[3]; + + // Calculate output dimensions + int output_height = (input_height + 2 * padding[0] - dilation[0] * (kernel_height - 1) - 1) / stride[0] + 1; + int output_width = (input_width + 2 * padding[1] - dilation[1] * (kernel_width - 1) - 1) / stride[1] + 1; + + std::cout << "Conv2d: input[" << batch_size << "," << input_channels << "," << input_height << "," << input_width << "]" + << " -> output[" << batch_size << "," << output_channels << "," << output_height << "," << output_width << "]" << std::endl; + + // Create output tensor + std::vector output_sizes = {batch_size, output_channels, output_height, output_width}; + + AOTITensorHandle output_handle; + Error create_err = aoti_torch_empty_strided( + output_sizes.size(), + output_sizes.data(), + nullptr, // use default strides + 6, // float32 dtype + 1, // cuda device + 0, // device index + &output_handle); + + if (create_err != Error::Ok) { + std::cerr << "Failed to create output tensor for convolution" << std::endl; + return create_err; + } + + // Setup cuDNN descriptors + cudnnTensorDescriptor_t input_desc, output_desc, bias_desc; + cudnnFilterDescriptor_t weight_desc; + cudnnConvolutionDescriptor_t conv_desc; + + cudnnCreateTensorDescriptor(&input_desc); + cudnnCreateTensorDescriptor(&output_desc); + cudnnCreateTensorDescriptor(&bias_desc); + cudnnCreateFilterDescriptor(&weight_desc); + cudnnCreateConvolutionDescriptor(&conv_desc); + + // Set tensor descriptors + cudnnSetTensorNdDescriptor(input_desc, CUDNN_DATA_FLOAT, 4, + (int*)input_sizes.data(), + (int*)input->strides().data()); + + cudnnSetTensorNdDescriptor(output_desc, CUDNN_DATA_FLOAT, 4, + (int*)output_sizes.data(), + (int*)output_handle->strides().data()); + + cudnnSetFilterNdDescriptor(weight_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, + (int*)weight_sizes.data()); + + // Set convolution descriptor + cudnnSetConvolutionNdDescriptor(conv_desc, 2, + (int*)padding, (int*)stride, (int*)dilation, + CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); + + if (groups > 1) { + cudnnSetConvolutionGroupCount(conv_desc, groups); + } + + // Find best convolution algorithm + cudnnConvolutionFwdAlgo_t algo; + cudnnGetConvolutionForwardAlgorithm(cudnn_handle, input_desc, weight_desc, conv_desc, output_desc, + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); + + // Get workspace size + size_t workspace_size; + cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle, input_desc, weight_desc, conv_desc, output_desc, algo, &workspace_size); + + // Allocate workspace + void* workspace = nullptr; + if (workspace_size > 0) { + cudaMalloc(&workspace, workspace_size); + } + + // Perform convolution + const float alpha = 1.0f, beta = 0.0f; + cudnnStatus_t conv_status = cudnnConvolutionForward( + cudnn_handle, + &alpha, + input_desc, input->data_ptr(), + weight_desc, weight->data_ptr(), + conv_desc, algo, + 
workspace, workspace_size, + &beta, + output_desc, output_handle->mutable_data_ptr()); + + if (conv_status != CUDNN_STATUS_SUCCESS) { + std::cerr << "cuDNN convolution failed: " << cudnnGetErrorString(conv_status) << std::endl; + if (workspace) cudaFree(workspace); + cudnnDestroyTensorDescriptor(input_desc); + cudnnDestroyTensorDescriptor(output_desc); + cudnnDestroyTensorDescriptor(bias_desc); + cudnnDestroyFilterDescriptor(weight_desc); + cudnnDestroyConvolutionDescriptor(conv_desc); + aoti_torch_delete_tensor_object(output_handle); + return Error::Internal; + } + + // Add bias if present + if (bias && *bias) { + auto bias_sizes = (*bias)->sizes(); + cudnnSetTensorNdDescriptor(bias_desc, CUDNN_DATA_FLOAT, 4, + (int*)bias_sizes.data(), + (int*)(*bias)->strides().data()); + + cudnnAddTensor(cudnn_handle, &alpha, bias_desc, (*bias)->data_ptr(), + &alpha, output_desc, output_handle->mutable_data_ptr()); + } + + // Cleanup + if (workspace) cudaFree(workspace); + cudnnDestroyTensorDescriptor(input_desc); + cudnnDestroyTensorDescriptor(output_desc); + cudnnDestroyTensorDescriptor(bias_desc); + cudnnDestroyFilterDescriptor(weight_desc); + cudnnDestroyConvolutionDescriptor(conv_desc); + + *ret0 = output_handle; + + std::cout << "aoti_torch_cuda_convolution completed successfully" << std::endl; + return Error::Ok; +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch \ No newline at end of file diff --git a/backends/aoti/runtime/shims/cuda_ops.h b/backends/aoti/runtime/shims/cuda_ops.h new file mode 100644 index 00000000000..699c87322d2 --- /dev/null +++ b/backends/aoti/runtime/shims/cuda_ops.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include "tensor_attribute.h" + +namespace executorch { +namespace backends { +namespace aoti { + +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// CUDA addmm operation: out = beta * self + alpha * (mat1 @ mat2) +AOTITorchError aoti_torch_cuda_addmm_out( + AtenTensorHandle out, + AtenTensorHandle self, + AtenTensorHandle mat1, + AtenTensorHandle mat2, + double beta, + double alpha); + +// CUDA convolution operation +AOTITorchError aoti_torch_cuda_convolution( + AtenTensorHandle input, + AtenTensorHandle weight, + AtenTensorHandle* bias, + const int64_t* stride, + int64_t stride_len_, + const int64_t* padding, + int64_t padding_len_, + const int64_t* dilation, + int64_t dilation_len_, + int32_t transposed, + const int64_t* output_padding, + int64_t output_padding_len_, + int64_t groups, + AtenTensorHandle* ret0); + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch \ No newline at end of file diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 83030647691..2d2bf940833 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -174,10 +174,11 @@ AOTITorchError aoti_torch_empty_strided( for (int i = 0; i < ndim; i++) { sizes[i] = sizes_ptr[i]; } + // ETensor creation auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr); - // Store the tensor + // Store the tensor so it doesn't get destroyed tensors.insert(tensor); *ret_new_tensor = tensor.get(); is_tensor_own_memory[tensor.get()] = true; diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index a75af9ae128..eb3d0e22371 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -107,7 +107,7 @@ AOTITorchError aoti_torch_get_device_type( AOTITensorHandle tensor, int32_t* ret_device_type) { // Let's assume all tensors AOTI using are on CUDA device - *ret_device_type = aoti_torch_device_type_cuda(); // CUDA device type + *ret_device_type = aoti_torch_device_type_cuda(); std::cout << "getting device_type from tensor " << tensor << " = " << *ret_device_type << std::endl; return Error::Ok; diff --git a/export_aoti.py b/export_aoti.py index 229d6e567e3..3ca5287b3b9 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -122,6 +122,11 @@ def get_model_and_inputs( def export_model(model, example_inputs, output_filename="aoti_model.pte"): """Export model through the AOTI pipeline.""" + all_one_input = tuple( + torch.ones_like(example_input) for example_input in example_inputs + ) + + print("label", model(*all_one_input)) print(f"Starting export process...") From df1bec55defe32519bb22481fe1ed52fde2e722b Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 11:25:22 -0700 Subject: [PATCH 19/50] remove extra code --- backends/aoti/runtime/shims/cuda_ops.cpp | 282 ----------------------- backends/aoti/runtime/shims/cuda_ops.h | 54 ----- 2 files changed, 336 deletions(-) delete mode 100644 backends/aoti/runtime/shims/cuda_ops.cpp delete mode 100644 backends/aoti/runtime/shims/cuda_ops.h diff --git a/backends/aoti/runtime/shims/cuda_ops.cpp b/backends/aoti/runtime/shims/cuda_ops.cpp deleted file mode 100644 index d89f1d36e7e..00000000000 --- a/backends/aoti/runtime/shims/cuda_ops.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "cuda_ops.h" -#include "memory.h" -#include "tensor_attribute.h" -#include -#include -#include - -namespace executorch { -namespace backends { -namespace aoti { - -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - -// Global cuDNN handle -static cudnnHandle_t cudnn_handle = nullptr; - -// Initialize cuDNN handle -static void init_cudnn() { - if (cudnn_handle == nullptr) { - cudnnCreate(&cudnn_handle); - } -} - -extern "C" { - -AOTITorchError aoti_torch_cuda_addmm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat1, - AtenTensorHandle mat2, - double beta, - double alpha) { - - std::cout << "aoti_torch_cuda_addmm_out called with beta=" << beta << ", alpha=" << alpha << std::endl; - - // Get tensor dimensions - auto mat1_sizes = mat1->sizes(); - auto mat2_sizes = mat2->sizes(); - auto self_sizes = self->sizes(); - auto out_sizes = out->sizes(); - - // mat1: [M, K], mat2: [K, N], result: [M, N] - int64_t M = mat1_sizes[0]; - int64_t K = mat1_sizes[1]; - int64_t N = mat2_sizes[1]; - - std::cout << "ADDMM: mat1[" << M << "," << K << "] @ mat2[" << K << "," << N << "] -> out[" << M << "," << N << "]" << std::endl; - - // Use cuBLAS for matrix multiplication - cublasHandle_t cublas_handle; - cublasStatus_t cublas_status = cublasCreate(&cublas_handle); - if (cublas_status != CUBLAS_STATUS_SUCCESS) { - std::cerr << "Failed to create cuBLAS handle" << std::endl; - return Error::Internal; - } - - // Set cuBLAS to use tensor op math for better performance - cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); - - const float f_alpha = static_cast(alpha); - const float f_beta = static_cast(beta); - - // Perform: out = beta * self + alpha * (mat1 @ mat2) - // First: out = beta * self (copy self to out and scale) - if (beta != 0.0) { - Error copy_err = aoti_torch_copy_(out, self, 0); - if (copy_err != Error::Ok) { - cublasDestroy(cublas_handle); - return copy_err; - } - - // Scale by beta if not 1.0 - if (beta != 1.0) { - cublas_status = cublasSscal(cublas_handle, M * N, &f_beta, - static_cast(out->mutable_data_ptr()), 1); - if (cublas_status != CUBLAS_STATUS_SUCCESS) { - std::cerr << "cuBLAS scale failed" << std::endl; - cublasDestroy(cublas_handle); - return Error::Internal; - } - } - } else { - // Zero out the output tensor - cudaMemset(out->mutable_data_ptr(), 0, M * N * sizeof(float)); - } - - // Then: out += alpha * (mat1 @ mat2) - // cuBLAS uses column-major, so we compute: C = alpha * A^T * B^T + beta * C - // Which gives us: out = alpha * mat1 @ mat2 + beta * out - const float gemm_beta = 1.0f; // Since we already handled the beta scaling above - - cublas_status = cublasSgemm( - cublas_handle, - CUBLAS_OP_N, CUBLAS_OP_N, // No transpose for column-major interpretation - N, M, K, // Dimensions swapped for column-major - &f_alpha, // alpha - static_cast(mat2->data_ptr()), N, // B matrix (mat2) - static_cast(mat1->data_ptr()), K, // A matrix (mat1) - &gemm_beta, // beta (1.0 since we pre-scaled) - static_cast(out->mutable_data_ptr()), N // C matrix (out) - ); - - if (cublas_status != CUBLAS_STATUS_SUCCESS) { - std::cerr << "cuBLAS GEMM failed: " << cublas_status << std::endl; - cublasDestroy(cublas_handle); - return Error::Internal; - } - - cublasDestroy(cublas_handle); - - std::cout << "aoti_torch_cuda_addmm_out completed successfully" << std::endl; - return Error::Ok; -} - 
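The Sgemm call above relies on the usual row-major trick for column-major cuBLAS: for row-major buffers, out = mat1 @ mat2 is computed as the column-major product out^T = mat2^T @ mat1^T, which is why mat2 is passed in cuBLAS's A slot with leading dimension N. A minimal sketch under those assumptions (float32, device pointers allocated and filled elsewhere; rowmajor_sgemm is a hypothetical helper):

#include <cublas_v2.h>

// Computes row-major C (MxN) = A (MxK) * B (KxN).
static cublasStatus_t rowmajor_sgemm(
    cublasHandle_t handle, int M, int N, int K,
    const float* d_a, const float* d_b, float* d_c) {
  const float alpha = 1.0f, beta = 0.0f;
  return cublasSgemm(
      handle, CUBLAS_OP_N, CUBLAS_OP_N,
      N, M, K,       // dimensions of the transposed (column-major) product
      &alpha,
      d_b, N,        // cuBLAS "A" = B^T, leading dimension N
      d_a, K,        // cuBLAS "B" = A^T, leading dimension K
      &beta,
      d_c, N);       // cuBLAS "C" = C^T, leading dimension N
}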
-AOTITorchError aoti_torch_cuda_convolution( - AtenTensorHandle input, - AtenTensorHandle weight, - AtenTensorHandle* bias, - const int64_t* stride, - int64_t stride_len_, - const int64_t* padding, - int64_t padding_len_, - const int64_t* dilation, - int64_t dilation_len_, - int32_t transposed, - const int64_t* output_padding, - int64_t output_padding_len_, - int64_t groups, - AtenTensorHandle* ret0) { - - std::cout << "aoti_torch_cuda_convolution called" << std::endl; - - init_cudnn(); - - // Get input dimensions - auto input_sizes = input->sizes(); - auto weight_sizes = weight->sizes(); - - int batch_size = input_sizes[0]; - int input_channels = input_sizes[1]; - int input_height = input_sizes[2]; - int input_width = input_sizes[3]; - - int output_channels = weight_sizes[0]; - int kernel_height = weight_sizes[2]; - int kernel_width = weight_sizes[3]; - - // Calculate output dimensions - int output_height = (input_height + 2 * padding[0] - dilation[0] * (kernel_height - 1) - 1) / stride[0] + 1; - int output_width = (input_width + 2 * padding[1] - dilation[1] * (kernel_width - 1) - 1) / stride[1] + 1; - - std::cout << "Conv2d: input[" << batch_size << "," << input_channels << "," << input_height << "," << input_width << "]" - << " -> output[" << batch_size << "," << output_channels << "," << output_height << "," << output_width << "]" << std::endl; - - // Create output tensor - std::vector output_sizes = {batch_size, output_channels, output_height, output_width}; - - AOTITensorHandle output_handle; - Error create_err = aoti_torch_empty_strided( - output_sizes.size(), - output_sizes.data(), - nullptr, // use default strides - 6, // float32 dtype - 1, // cuda device - 0, // device index - &output_handle); - - if (create_err != Error::Ok) { - std::cerr << "Failed to create output tensor for convolution" << std::endl; - return create_err; - } - - // Setup cuDNN descriptors - cudnnTensorDescriptor_t input_desc, output_desc, bias_desc; - cudnnFilterDescriptor_t weight_desc; - cudnnConvolutionDescriptor_t conv_desc; - - cudnnCreateTensorDescriptor(&input_desc); - cudnnCreateTensorDescriptor(&output_desc); - cudnnCreateTensorDescriptor(&bias_desc); - cudnnCreateFilterDescriptor(&weight_desc); - cudnnCreateConvolutionDescriptor(&conv_desc); - - // Set tensor descriptors - cudnnSetTensorNdDescriptor(input_desc, CUDNN_DATA_FLOAT, 4, - (int*)input_sizes.data(), - (int*)input->strides().data()); - - cudnnSetTensorNdDescriptor(output_desc, CUDNN_DATA_FLOAT, 4, - (int*)output_sizes.data(), - (int*)output_handle->strides().data()); - - cudnnSetFilterNdDescriptor(weight_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, - (int*)weight_sizes.data()); - - // Set convolution descriptor - cudnnSetConvolutionNdDescriptor(conv_desc, 2, - (int*)padding, (int*)stride, (int*)dilation, - CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); - - if (groups > 1) { - cudnnSetConvolutionGroupCount(conv_desc, groups); - } - - // Find best convolution algorithm - cudnnConvolutionFwdAlgo_t algo; - cudnnGetConvolutionForwardAlgorithm(cudnn_handle, input_desc, weight_desc, conv_desc, output_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); - - // Get workspace size - size_t workspace_size; - cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle, input_desc, weight_desc, conv_desc, output_desc, algo, &workspace_size); - - // Allocate workspace - void* workspace = nullptr; - if (workspace_size > 0) { - cudaMalloc(&workspace, workspace_size); - } - - // Perform convolution - const float alpha = 1.0f, beta = 0.0f; - cudnnStatus_t 
conv_status = cudnnConvolutionForward( - cudnn_handle, - &alpha, - input_desc, input->data_ptr(), - weight_desc, weight->data_ptr(), - conv_desc, algo, - workspace, workspace_size, - &beta, - output_desc, output_handle->mutable_data_ptr()); - - if (conv_status != CUDNN_STATUS_SUCCESS) { - std::cerr << "cuDNN convolution failed: " << cudnnGetErrorString(conv_status) << std::endl; - if (workspace) cudaFree(workspace); - cudnnDestroyTensorDescriptor(input_desc); - cudnnDestroyTensorDescriptor(output_desc); - cudnnDestroyTensorDescriptor(bias_desc); - cudnnDestroyFilterDescriptor(weight_desc); - cudnnDestroyConvolutionDescriptor(conv_desc); - aoti_torch_delete_tensor_object(output_handle); - return Error::Internal; - } - - // Add bias if present - if (bias && *bias) { - auto bias_sizes = (*bias)->sizes(); - cudnnSetTensorNdDescriptor(bias_desc, CUDNN_DATA_FLOAT, 4, - (int*)bias_sizes.data(), - (int*)(*bias)->strides().data()); - - cudnnAddTensor(cudnn_handle, &alpha, bias_desc, (*bias)->data_ptr(), - &alpha, output_desc, output_handle->mutable_data_ptr()); - } - - // Cleanup - if (workspace) cudaFree(workspace); - cudnnDestroyTensorDescriptor(input_desc); - cudnnDestroyTensorDescriptor(output_desc); - cudnnDestroyTensorDescriptor(bias_desc); - cudnnDestroyFilterDescriptor(weight_desc); - cudnnDestroyConvolutionDescriptor(conv_desc); - - *ret0 = output_handle; - - std::cout << "aoti_torch_cuda_convolution completed successfully" << std::endl; - return Error::Ok; -} - -} // extern "C" - -} // namespace aoti -} // namespace backends -} // namespace executorch \ No newline at end of file diff --git a/backends/aoti/runtime/shims/cuda_ops.h b/backends/aoti/runtime/shims/cuda_ops.h deleted file mode 100644 index 699c87322d2..00000000000 --- a/backends/aoti/runtime/shims/cuda_ops.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include -#include "tensor_attribute.h" - -namespace executorch { -namespace backends { -namespace aoti { - -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - -extern "C" { - -// CUDA addmm operation: out = beta * self + alpha * (mat1 @ mat2) -AOTITorchError aoti_torch_cuda_addmm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat1, - AtenTensorHandle mat2, - double beta, - double alpha); - -// CUDA convolution operation -AOTITorchError aoti_torch_cuda_convolution( - AtenTensorHandle input, - AtenTensorHandle weight, - AtenTensorHandle* bias, - const int64_t* stride, - int64_t stride_len_, - const int64_t* padding, - int64_t padding_len_, - const int64_t* dilation, - int64_t dilation_len_, - int32_t transposed, - const int64_t* output_padding, - int64_t output_padding_len_, - int64_t groups, - AtenTensorHandle* ret0); - -} // extern "C" - -} // namespace aoti -} // namespace backends -} // namespace executorch \ No newline at end of file From db8a40070c37ffb04edef4907afc3a8a7afcf4e3 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 13:25:08 -0700 Subject: [PATCH 20/50] solved crash when destroying backend --- backends/aoti/runtime/aoti_backend.cpp | 52 ++++++++++++-------------- export_aoti.py | 4 +- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 4c065fbeeb6..425e078c549 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -166,33 +166,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { size_t n_inputs; AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs); - // for (int i = 0; i < n_inputs; i++) { - // const char* input_name; - // AOTInductorModelContainerGetInputName( - // handle->container_handle, i, &input_name); - // ET_LOG(Debug, "AOTIBackend %d-th input name %s", i, input_name); - // } - - // AOTInductorModelContainerGetNumConstants( - // handle->container_handle, &n_constants); - // size_t n_user_inputs = n_inputs - n_constants; - - // if (n_user_inputs != n_inputs) { - // ET_LOG( - // Error, - // "number of user input does not match number of inputs. - // n_user_inputs %zd, n_constant %zd, n_inputs %zd. 
Exit.", - // n_user_inputs, - // n_constants, - // n_inputs); - // return Error::InvalidArgument; - // } - - // ET_LOG( - // Debug, - // "AOTIBackend n_inputs %zd generated, where %zd is constant input, - // %zd is user input", n_inputs, n_constants, n_user_inputs); - size_t n_outputs; AOTInductorModelContainerGetNumOutputs( handle->container_handle, &n_outputs); @@ -381,8 +354,29 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { void destroy(DelegateHandle* handle_) const override { ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - dlclose(handle->so_handle); - AOTInductorModelContainerDelete(handle->container_handle); + + // Delete the container BEFORE closing the shared library + if (handle->container_handle != nullptr) { + AOTIRuntimeError delete_result = + AOTInductorModelContainerDelete(handle->container_handle); + if (delete_result != Error::Ok) { + ET_LOG( + Error, + "AOTInductorModelContainerDelete failed with error code %d", + delete_result); + } else { + ET_LOG( + Debug, + "AOTIBackend container_handle %p deleted", + handle->container_handle); + } + } + + // Now close the shared library + if (handle->so_handle != nullptr) { + dlclose(handle->so_handle); + } + free(handle); cleanup_memory(); cleanup_tensor_metadata(); diff --git a/export_aoti.py b/export_aoti.py index 3ca5287b3b9..ff78ce3be95 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -41,7 +41,7 @@ def forward(self, x: torch.Tensor): class Linear(torch.nn.Module): def __init__(self): super(Linear, self).__init__() - self.linear = nn.Linear(3, 5) + self.linear = nn.Linear(7, 101) def forward(self, x: torch.Tensor): return self.linear(x) @@ -76,7 +76,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): }, "linear": { "model_class": Linear, - "input_shapes": [(4, 3)], + "input_shapes": [(127, 7)], "device": "cuda", "description": "Simple linear layer model", }, From ece2776c20ea46087e4d9d4093d8602795b6e0e1 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 14:42:50 -0700 Subject: [PATCH 21/50] change to use to_edge_transform_and_lower --- backends/aoti/runtime/aoti_backend.cpp | 7 +------ export_aoti.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 425e078c549..453613d47f8 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -352,7 +352,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { } void destroy(DelegateHandle* handle_) const override { - ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; // Delete the container BEFORE closing the shared library @@ -364,11 +363,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { Error, "AOTInductorModelContainerDelete failed with error code %d", delete_result); - } else { - ET_LOG( - Debug, - "AOTIBackend container_handle %p deleted", - handle->container_handle); } } @@ -380,6 +374,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { free(handle); cleanup_memory(); cleanup_tensor_metadata(); + ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); } }; diff --git a/export_aoti.py b/export_aoti.py index ff78ce3be95..bfcf38408d7 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -21,7 +21,7 @@ import torch from executorch.backends.aoti.aoti_partitioner 
import AotiPartitioner -from executorch.exir import to_edge +from executorch.exir import to_edge_transform_and_lower from torch import nn from torch.export import export from torchvision import models @@ -135,13 +135,17 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"): aten_dialect = export(model, example_inputs) # 2. to_edge: Make optimizations for Edge devices - print("Step 2: Converting to Edge program...") - edge_program = to_edge(aten_dialect) - print(edge_program.exported_program().graph.print_tabular()) + # print("Step 2: Converting to Edge program...") + # edge_program = to_edge(aten_dialect) + # print(edge_program.exported_program().graph.print_tabular()) - print("Step 3: Converting to backend...") - edge_program = edge_program.to_backend(AotiPartitioner([])) - print("To backend done.") + # print("Step 3: Converting to backend...") + # edge_program = edge_program.to_backend(AotiPartitioner([])) + # print("To backend done.") + + edge_program = to_edge_transform_and_lower( + aten_dialect, partitioner=[AotiPartitioner([])] + ) # 3. to_executorch: Convert the graph to an ExecuTorch program print("Step 4: Converting to ExecuTorch program...") From 8e42a30d96b4cad123a5296f10de183f93a58c02 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 23:09:03 -0700 Subject: [PATCH 22/50] use aoti decomposition on lowable graph --- backends/aoti/aoti_partitioner.py | 349 +++++++++++---------- backends/arm/third-party/serialization_lib | 1 + export_aoti.py | 3 + 3 files changed, 186 insertions(+), 167 deletions(-) create mode 160000 backends/arm/third-party/serialization_lib diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index f72b97f0253..8a17a5364ae 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -7,7 +7,7 @@ # pyre-unsafe import operator -from typing import cast, final, List +from typing import Callable, cast, Dict, final, List, Optional, Set, Tuple import torch from executorch.backends.aoti.aoti_backend import AotiBackend # usort: skip @@ -24,167 +24,168 @@ from torch.fx.passes.operator_support import OperatorSupportBase -supported_fallback_operators = [] - -inductor_fallback_ops: dict[str, dict[str, list[str]]] = { - "aten._adaptive_avg_pool2d_backward.default": {}, - "aten._adaptive_avg_pool2d.default": {}, - "aten._adaptive_avg_pool3d_backward.default": {}, - "aten._adaptive_avg_pool3d.default": {}, - "aten._addmm_activation.default": {}, - "aten._cdist_backward.default": {}, - "aten._cdist_forward.default": {}, - "aten._cudnn_rnn.default": {}, - "aten._dyn_quant_matmul_4bit.default": {}, - "aten._dyn_quant_pack_4bit_weight.default": {}, - "aten._efficient_attention_backward.default": {}, - "aten._efficient_attention_forward.default": {}, - "aten._efficientzerotensor.default": {}, - "aten._embedding_bag_dense_backward.default": {}, - "aten._embedding_bag_forward_only.default": {}, - "aten._embedding_bag_per_sample_weights_backward.default": {}, - "aten._embedding_bag.default": {}, - "aten._fft_c2c.default": {}, - "aten._fft_r2c.default": {}, - "aten._flash_attention_backward.default": {}, - "aten._flash_attention_forward.default": {}, - "aten._fused_moving_avg_obs_fq_helper_functional.default": {}, - "aten._fused_moving_avg_obs_fq_helper.default": {}, - "aten._fused_rms_norm.default": {}, - "aten._histogramdd_from_bin_cts.default": {}, - "aten._int_mm.out": {}, - "aten._pdist_backward.default": {}, - "aten._pdist_forward.default": {}, - 
"aten._scaled_dot_product_attention_math_for_mps.default": {}, - "aten._scaled_dot_product_cudnn_attention_backward.default": {}, - "aten._scaled_dot_product_cudnn_attention.default": {}, - "aten._scaled_dot_product_efficient_attention_backward.default": {}, - "aten._scaled_dot_product_efficient_attention.default": {}, - "aten._scaled_dot_product_flash_attention_backward.default": {}, - "aten._scaled_dot_product_flash_attention_for_cpu_backward.default": {}, - "aten._scaled_dot_product_flash_attention_for_cpu.default": {}, - "aten._scaled_dot_product_flash_attention.default": {}, - "aten._scaled_dot_product_fused_attention_overrideable_backward.default": {}, - "aten._scaled_dot_product_fused_attention_overrideable.default": {}, - "aten._scaled_mm.default": {}, - "aten._scaled_mm.out": {}, - "aten._segment_reduce_backward.default": {}, - "aten._thnn_fused_lstm_cell.default": {}, - "aten._to_sparse.default": {}, - "aten._trilinear.default": {}, - "aten._weight_int4pack_mm.default": {}, - "aten._weight_int8pack_mm.default": {}, - "aten.abs.default": {}, - "aten.adaptive_max_pool2d_backward.default": {}, - "aten.adaptive_max_pool2d.default": {}, - "aten.adaptive_max_pool3d_backward.default": {}, - "aten.adaptive_max_pool3d.default": {}, - "aten.add.Scalar": {}, - "aten.add.Tensor": {}, - "aten.addbmm.default": {}, - "aten.addmm.out": {}, - "aten.addmv.default": {}, - "aten.angle.default": {}, - "aten.avg_pool2d_backward.default": {}, - "aten.avg_pool2d.default": {}, - "aten.avg_pool3d_backward.default": {}, - "aten.avg_pool3d.default": {}, - "aten.baddbmm.out": {}, - "aten.bernoulli_.float": {}, - "aten.bernoulli_.Tensor": {}, - "aten.bmm.out": {}, - "aten.bucketize.Tensor": {}, - "aten.cat.default": {}, - "aten.cholesky_inverse.default": {}, - "aten.cholesky_solve.default": {}, - "aten.convolution_backward.default": {}, - "aten.convolution.default": {}, - "aten.cummax.default": {}, - "aten.cummin.default": {}, - "aten.cumprod.default": {}, - "aten.cumsum.default": {}, - "aten.exponential.default": {}, - "aten.fill_.Scalar": {}, - "aten.fractional_max_pool2d_backward.default": {}, - "aten.fractional_max_pool2d.default": {}, - "aten.fractional_max_pool3d_backward.default": {}, - "aten.fractional_max_pool3d.default": {}, - "aten.gcd.default": {}, - "aten.geqrf.default": {}, - "aten.grid_sampler_2d_backward.default": {}, - "aten.hann_window.default": {}, - "aten.histc.default": {}, - "aten.histogram.bin_ct": {}, - "aten.index_put.default": {}, - "aten.index_reduce.default": {}, - "aten.index.Tensor": {}, - "aten.kthvalue.default": {}, - "aten.logcumsumexp.default": {}, - "aten.lu_unpack.default": {}, - "aten.masked_scatter_backward.default": {}, - "aten.masked_scatter.default": {}, - "aten.masked_select.default": {}, - "aten.max_pool2d_with_indices_backward.default": {}, - "aten.max_pool2d_with_indices.default": {}, - "aten.max_pool3d_with_indices_backward.default": {}, - "aten.max_pool3d_with_indices.default": {}, - "aten.max_unpool2d.default": {}, - "aten.max_unpool3d.default": {}, - "aten.median.default": {}, - "aten.mm.out": {}, - "aten.mode.default": {}, - "aten.mul.Scalar": {}, - "aten.mul.Tensor": {}, - "aten.nanmedian.default": {}, - "aten.narrow.default": {}, - "aten.native_dropout.default": {}, - "aten.nonzero.default": {}, - "aten.normal_functional.default": {}, - "aten.ormqr.default": {}, - "aten.pad.default": {}, - "aten.permute.default": {}, - "aten.polar.default": {}, - "aten.pow.Scalar": {}, - "aten.pow.Tensor_Scalar": {}, - "aten.pow.Tensor_Tensor": {}, - "aten.rand.default": {}, 
- "aten.rand.generator": {}, - "aten.randint.default": {}, - "aten.randint.generator": {}, - "aten.randint.low_out": {}, - "aten.randint.low": {}, - "aten.randn.default": {}, - "aten.randn.generator": {}, - "aten.randperm.default": {}, - "aten.repeat_interleave.Tensor": {}, - "aten.replication_pad1d_backward.default": {}, - "aten.replication_pad2d_backward.default": {}, - "aten.reshape.default": {}, - "aten.resize_.default": {}, - "aten.resize_as_.default": {}, - "aten.scatter_reduce.two_out": {}, - "aten.scatter.src_out": {}, - "aten.scatter.value_out": {}, - "aten.searchsorted.Scalar": {}, - "aten.searchsorted.Tensor": {}, - "aten.segment_reduce.default": {}, - "aten.set_.source_Tensor": {}, - "aten.slice.Tensor": {}, - "aten.soft_margin_loss_backward.default": {}, - "aten.sort.default": {}, - "aten.sort.stable": {}, - "aten.squeeze.dim": {}, - "aten.to_sparse.default": {}, - "aten.topk.default": {}, - "aten.triangular_solve.default": {}, - "aten.uniform.default": {}, - "aten.upsample_bicubic2d_backward.default": {}, - "aten.upsample_linear1d_backward.default": {}, - "aten.upsample_trilinear3d_backward.default": {}, - "aten.view_as_complex.default": {}, - "aten.view_as_real.default": {}, - "aten.view.dtype": {}, - "aten._weight_int4pack_mm_with_scales_and_zeros.default": {}, +# exist fallback operators in et namespace; should map to inductor_fallback_ops +supported_fallback_operators: Dict[str, Dict[str, List[str]]] = {} + +inductor_fallback_ops: Set[str] = { + "aten._adaptive_avg_pool2d_backward.default", + "aten._adaptive_avg_pool2d.default", + "aten._adaptive_avg_pool3d_backward.default", + "aten._adaptive_avg_pool3d.default", + "aten._addmm_activation.default", + "aten._cdist_backward.default", + "aten._cdist_forward.default", + "aten._cudnn_rnn.default", + "aten._dyn_quant_matmul_4bit.default", + "aten._dyn_quant_pack_4bit_weight.default", + "aten._efficient_attention_backward.default", + "aten._efficient_attention_forward.default", + "aten._efficientzerotensor.default", + "aten._embedding_bag_dense_backward.default", + "aten._embedding_bag_forward_only.default", + "aten._embedding_bag_per_sample_weights_backward.default", + "aten._embedding_bag.default", + "aten._fft_c2c.default", + "aten._fft_r2c.default", + "aten._flash_attention_backward.default", + "aten._flash_attention_forward.default", + "aten._fused_moving_avg_obs_fq_helper_functional.default", + "aten._fused_moving_avg_obs_fq_helper.default", + "aten._fused_rms_norm.default", + "aten._histogramdd_from_bin_cts.default", + "aten._int_mm.out", + "aten._pdist_backward.default", + "aten._pdist_forward.default", + "aten._scaled_dot_product_attention_math_for_mps.default", + "aten._scaled_dot_product_cudnn_attention_backward.default", + "aten._scaled_dot_product_cudnn_attention.default", + "aten._scaled_dot_product_efficient_attention_backward.default", + "aten._scaled_dot_product_efficient_attention.default", + "aten._scaled_dot_product_flash_attention_backward.default", + "aten._scaled_dot_product_flash_attention_for_cpu_backward.default", + "aten._scaled_dot_product_flash_attention_for_cpu.default", + "aten._scaled_dot_product_flash_attention.default", + "aten._scaled_dot_product_fused_attention_overrideable_backward.default", + "aten._scaled_dot_product_fused_attention_overrideable.default", + "aten._scaled_mm.default", + "aten._scaled_mm.out", + "aten._segment_reduce_backward.default", + "aten._thnn_fused_lstm_cell.default", + "aten._to_sparse.default", + "aten._trilinear.default", + "aten._weight_int4pack_mm.default", + 
"aten._weight_int8pack_mm.default", + "aten.abs.default", + "aten.adaptive_max_pool2d_backward.default", + "aten.adaptive_max_pool2d.default", + "aten.adaptive_max_pool3d_backward.default", + "aten.adaptive_max_pool3d.default", + "aten.add.Scalar", + "aten.add.Tensor", + "aten.addbmm.default", + "aten.addmm.out", + "aten.addmv.default", + "aten.angle.default", + "aten.avg_pool2d_backward.default", + "aten.avg_pool2d.default", + "aten.avg_pool3d_backward.default", + "aten.avg_pool3d.default", + "aten.baddbmm.out", + "aten.bernoulli_.float", + "aten.bernoulli_.Tensor", + "aten.bmm.out", + "aten.bucketize.Tensor", + "aten.cat.default", + "aten.cholesky_inverse.default", + "aten.cholesky_solve.default", + "aten.convolution_backward.default", + "aten.convolution.default", + "aten.cummax.default", + "aten.cummin.default", + "aten.cumprod.default", + "aten.cumsum.default", + "aten.exponential.default", + "aten.fill_.Scalar", + "aten.fractional_max_pool2d_backward.default", + "aten.fractional_max_pool2d.default", + "aten.fractional_max_pool3d_backward.default", + "aten.fractional_max_pool3d.default", + "aten.gcd.default", + "aten.geqrf.default", + "aten.grid_sampler_2d_backward.default", + "aten.hann_window.default", + "aten.histc.default", + "aten.histogram.bin_ct", + "aten.index_put.default", + "aten.index_reduce.default", + "aten.index.Tensor", + "aten.kthvalue.default", + "aten.logcumsumexp.default", + "aten.lu_unpack.default", + "aten.masked_scatter_backward.default", + "aten.masked_scatter.default", + "aten.masked_select.default", + "aten.max_pool2d_with_indices_backward.default", + "aten.max_pool2d_with_indices.default", + "aten.max_pool3d_with_indices_backward.default", + "aten.max_pool3d_with_indices.default", + "aten.max_unpool2d.default", + "aten.max_unpool3d.default", + "aten.median.default", + "aten.mm.out", + "aten.mode.default", + "aten.mul.Scalar", + "aten.mul.Tensor", + "aten.nanmedian.default", + "aten.narrow.default", + "aten.native_dropout.default", + "aten.nonzero.default", + "aten.normal_functional.default", + "aten.ormqr.default", + "aten.pad.default", + "aten.permute.default", + "aten.polar.default", + "aten.pow.Scalar", + "aten.pow.Tensor_Scalar", + "aten.pow.Tensor_Tensor", + "aten.rand.default", + "aten.rand.generator", + "aten.randint.default", + "aten.randint.generator", + "aten.randint.low_out", + "aten.randint.low", + "aten.randn.default", + "aten.randn.generator", + "aten.randperm.default", + "aten.repeat_interleave.Tensor", + "aten.replication_pad1d_backward.default", + "aten.replication_pad2d_backward.default", + "aten.reshape.default", + "aten.resize_.default", + "aten.resize_as_.default", + "aten.scatter_reduce.two_out", + "aten.scatter.src_out", + "aten.scatter.value_out", + "aten.searchsorted.Scalar", + "aten.searchsorted.Tensor", + "aten.segment_reduce.default", + "aten.set_.source_Tensor", + "aten.slice.Tensor", + "aten.soft_margin_loss_backward.default", + "aten.sort.default", + "aten.sort.stable", + "aten.squeeze.dim", + "aten.to_sparse.default", + "aten.topk.default", + "aten.triangular_solve.default", + "aten.uniform.default", + "aten.upsample_bicubic2d_backward.default", + "aten.upsample_linear1d_backward.default", + "aten.upsample_trilinear3d_backward.default", + "aten.view_as_complex.default", + "aten.view_as_real.default", + "aten.view.dtype", + "aten._weight_int4pack_mm_with_scales_and_zeros.default", } @@ -193,13 +194,9 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: supported = node.op == "call_function" and ( node.target 
== operator.getitem
             or node.target._op not in inductor_fallback_ops
+            or node.target._op in supported_fallback_operators
         )
 
-        # if node.op == "call_function" and node.target != operator.getitem:
-        #     print(node.target._op)
-        #     print(supported)
-        #     print('------------------')
-
         return supported
 
     def is_node_supported_custom(self, node: torch.fx.Node) -> bool:
@@ -248,3 +245,21 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        """
+        Return a list of operations that should not be decomposed, so that the AOT compiler can handle them.
+        """
+        do_not_decompose = set()
+        op_support = AOTISupportedOperators()
+
+        for node in ep.graph.nodes:
+            if (
+                node.op == "call_function"
+                and isinstance(node.target, torch._ops.OpOverload)
+                and op_support.is_node_supported(None, node)
+            ):
+                do_not_decompose.add(node.target)
+        return list(do_not_decompose), None
diff --git a/backends/arm/third-party/serialization_lib b/backends/arm/third-party/serialization_lib
new file mode 160000
index 00000000000..187af0d41fe
--- /dev/null
+++ b/backends/arm/third-party/serialization_lib
@@ -0,0 +1 @@
+Subproject commit 187af0d41fe75d08d2a7ec84c1b4d24b9b641ed2
diff --git a/export_aoti.py b/export_aoti.py
index bfcf38408d7..d60c6eccad1 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -143,6 +143,9 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"):
     # edge_program = edge_program.to_backend(AotiPartitioner([]))
     # print("To backend done.")
 
+    # the aoti part should be decomposed by the internal torch._inductor.aot_compile
+    # we should preserve the lowerable part and wait for the aoti backend to handle it
+    # Q: maybe need to turn on fallback_random? 
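+    # (Editor's sketch, not part of the original change: under this flow,
+    #  to_edge_transform_and_lower() consults AotiPartitioner.ops_to_not_decompose()
+    #  and preserves those ops while decomposing the rest, so end to end:
+    #
+    #      ep = export(model, example_inputs)
+    #      edge = to_edge_transform_and_lower(ep, partitioner=[AotiPartitioner([])])
+    #      pte_bytes = edge.to_executorch().buffer
+    #
+    #  ops_to_not_decompose is the method added to the partitioner in this
+    #  patch; the preservation behavior is assumed from that API, not verified.)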
edge_program = to_edge_transform_and_lower( aten_dialect, partitioner=[AotiPartitioner([])] ) From aa94acf6ef97a8be852b05d5d199b176d8412f07 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Fri, 22 Aug 2025 13:48:08 -0700 Subject: [PATCH 23/50] update test script to support raw aoti --- backends/aoti/aoti_partitioner.py | 2 + export_and_run_aoti.sh | 28 ++++--- export_aoti.py | 126 +++++++++++++++++++++++++----- 3 files changed, 129 insertions(+), 27 deletions(-) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index 8a17a5364ae..d490b261da6 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -196,6 +196,8 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: or node.target._op not in inductor_fallback_ops or node.target._op in supported_fallback_operators ) + if supported and node.target != operator.getitem: + print(f"op {node.target._op} is supported: {supported}") return supported diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 7aa4950c790..54f1d0b5092 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -10,7 +10,7 @@ # ./export_and_run_aoti.sh conv2d inference # Uses inference mode # ./export_and_run_aoti.sh conv2d --mode=inference # Alternative syntax # -# Available modes: reinstall_all (default), reinstall_aot, reinstall_runtime, inference +# Available modes: reinstall_all (default), reinstall_aot, reinstall_runtime, inference, export_aoti_only # model_arg: argument to pass to export_aoti.py set -e # Exit on any error @@ -26,7 +26,7 @@ for arg in "$@"; do MODE="${arg#*=}" shift ;; - reinstall_all|reinstall_aot|reinstall_runtime|inference) + reinstall_all|reinstall_aot|reinstall_runtime|inference|export_aoti_only) # If it's the second argument and a valid mode, use it as mode if [[ "$arg" == "$2" ]]; then MODE="$arg" @@ -37,17 +37,18 @@ done # Validate mode case "$MODE" in - reinstall_all|reinstall_aot|reinstall_runtime|inference) + reinstall_all|reinstall_aot|reinstall_runtime|inference|export_aoti_only) # Valid mode, continue ;; *) echo "Error: Unknown mode '$MODE'" - echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference" + echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference, export_aoti_only" echo "" echo "Usage examples:" - echo " ./export_and_run_aoti.sh conv2d # Uses default mode" - echo " ./export_and_run_aoti.sh conv2d inference # Positional mode" - echo " ./export_and_run_aoti.sh conv2d --mode=inference # GNU-style mode" + echo " ./export_and_run_aoti.sh conv2d # Uses default mode" + echo " ./export_and_run_aoti.sh conv2d inference # Positional mode" + echo " ./export_and_run_aoti.sh conv2d --mode=inference # GNU-style mode" + echo " ./export_and_run_aoti.sh conv2d export_aoti_only # Export AOTI only (no runtime)" exit 1 ;; esac @@ -94,8 +95,13 @@ install_executorch() { } export_aoti_model() { + local use_aoti_only=$1 echo "Exporting AOTI model..." 
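+    # (Editor's usage sketch, hedged: "linear" is one of the models registered
+    #  in export_aoti.py, and export_aoti_only is the mode added in this patch,
+    #  e.g.
+    #      ./export_and_run_aoti.sh linear export_aoti_only
+    #  which produces a standalone aoti.so via torch._inductor.aot_compile
+    #  instead of an ExecuTorch .pte.)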
- python export_aoti.py $MODEL_ARG + if [[ "$use_aoti_only" == "--aoti_only" ]]; then + python export_aoti.py $MODEL_ARG --aoti_only + else + python export_aoti.py $MODEL_ARG + fi } clean_install_executorch() { @@ -151,9 +157,13 @@ case "$MODE" in export_aoti_model run_inference ;; + "export_aoti_only") + echo "Mode: export_aoti_only - Export model using pure AOTI only (no runtime or installation)" + export_aoti_model "--aoti_only" + ;; *) echo "Error: Unknown mode '$MODE'" - echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference" + echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference, export_aoti_only" exit 1 ;; esac diff --git a/export_aoti.py b/export_aoti.py index d60c6eccad1..69cb0782c1b 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3 """ Unified export script for AOTI backend. -Usage: python export_aoti.py +Usage: + python export_aoti.py # Uses export_model_to_et_aoti + python export_aoti.py --aoti_only # Uses export_model_to_pure_aoti Supported models: - mv2: MobileNetV2 model @@ -10,6 +12,7 @@ - add: Simple tensor addition model """ +import argparse import copy import os @@ -66,6 +69,25 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): return x + y +class DepthwiseConv(nn.Module): + def __init__(self): + super().__init__() + # 32 input channels, 32 output channels, groups=32 for depthwise + self.conv = nn.Conv2d( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=32, + bias=False, + ) + + def forward(self, x): + return self.conv(x) + + # Model registry mapping model names to their configurations MODEL_REGISTRY: Dict[str, Dict[str, Any]] = { "mv2": { @@ -86,6 +108,12 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): "device": "cuda", "description": "Single Conv2d layer model", }, + "depthwise_conv": { + "model_class": DepthwiseConv, + "input_shapes": [(1, 32, 112, 112)], + "device": "cuda", + "description": "Single Depthwise Conv2d layer model", + }, "add": { "model_class": Add, "input_shapes": [(10,), (10,)], @@ -120,7 +148,7 @@ def get_model_and_inputs( return model, example_inputs -def export_model(model, example_inputs, output_filename="aoti_model.pte"): +def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.pte"): """Export model through the AOTI pipeline.""" all_one_input = tuple( torch.ones_like(example_input) for example_input in example_inputs @@ -135,14 +163,6 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"): aten_dialect = export(model, example_inputs) # 2. to_edge: Make optimizations for Edge devices - # print("Step 2: Converting to Edge program...") - # edge_program = to_edge(aten_dialect) - # print(edge_program.exported_program().graph.print_tabular()) - - # print("Step 3: Converting to backend...") - # edge_program = edge_program.to_backend(AotiPartitioner([])) - # print("To backend done.") - # aoti part should be decomposed by the internal torch._inductor.aot_compile # we should preserve the lowerable part and waiting for aoti backend handle that # Q: maybe need to turn on fallback_random? @@ -163,21 +183,91 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"): print(f"Export completed successfully! 
Output saved to {output_filename}") +def export_model_to_pure_aoti(model, example_inputs): + """Export model through the AOTI pipeline.""" + all_one_input = tuple( + torch.ones_like(example_input) for example_input in example_inputs + ) + + print("label", model(*all_one_input)) + + print(f"Starting export process...") + + # 1. torch.export: Defines the program with the ATen operator set. + print("Step 1: Converting to ATen dialect...") + aten_dialect = export(model, example_inputs) + + # 2. torch._inductor.aot_compile to aoti delegate + aten_dialect_module = aten_dialect.module() + + output_path = os.path.join(os.getcwd(), "aoti.so") + + options: dict[str, Any] = { + "aot_inductor.package_constants_in_so": True, + "aot_inductor.output_path": output_path, + "aot_inductor.debug_compile": True, + "aot_inductor.repro_level": 3, + "aot_inductor.debug_intermediate_value_printer": "3", + "max_autotune": True, + "max_autotune_gemm_backends": "TRITON", + "max_autotune_conv_backends": "TRITON", + } + + so_path = torch._inductor.aot_compile(aten_dialect_module, example_inputs, options=options) # type: ignore[arg-type] + + assert so_path == output_path, f"Expected {output_path} but got {so_path}" + + check_call( + f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {output_path}", + shell=True, + ) + + def main(): - if len(sys.argv) != 2: - available_models = ", ".join(MODEL_REGISTRY.keys()) - print("Usage: python export_aoti.py ") - print(f"Available models: {available_models}") + # Set up argument parser + parser = argparse.ArgumentParser( + description="Unified export script for AOTI backend", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # Add model name as positional argument + parser.add_argument( + "model_name", + help="Name of the model to export", + choices=list(MODEL_REGISTRY.keys()), + metavar="model_name", + ) + + # Add the --aoti_only flag + parser.add_argument( + "--aoti_only", + action="store_true", + help="Use export_model_to_pure_aoti instead of export_model_to_et_aoti", + ) + + # Parse arguments + args = parser.parse_args() + + # Show available models and descriptions in help + if len(sys.argv) == 1: + parser.print_help() + print(f"\nAvailable models: {', '.join(MODEL_REGISTRY.keys())}") print("\nModel descriptions:") for name, config in MODEL_REGISTRY.items(): print(f" {name}: {config['description']}") sys.exit(1) - model_name = sys.argv[1] - try: - model, example_inputs = get_model_and_inputs(model_name) - export_model(model, example_inputs) + model, example_inputs = get_model_and_inputs(args.model_name) + + # Choose export function based on --aoti_only flag + if args.aoti_only: + print("Using export_model_to_pure_aoti...") + export_model_to_pure_aoti(model, example_inputs) + else: + print("Using export_model_to_et_aoti...") + export_model_to_et_aoti(model, example_inputs) + except ValueError as e: print(f"Error: {e}") sys.exit(1) From 189871e14c296a3ec8e199de75cf697394d4ec79 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 25 Aug 2025 10:36:16 -0700 Subject: [PATCH 24/50] temp commit --- backends/aoti/aoti_backend.py | 2 ++ backends/aoti/aoti_partitioner.py | 14 +++++------ exir/emit/_emit_program.py | 12 +++++++++ export_aoti.py | 42 ++++++++++++++++++++++++++++++- 4 files changed, 62 insertions(+), 8 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index a0793bdb80f..d248ccfae82 100644 
--- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -62,4 +62,6 @@ def preprocess( shell=True, ) + print("so_path", so_path) + return PreprocessResult(so_path.encode("utf-8")) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index d490b261da6..6dfe888fec8 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -191,13 +191,13 @@ class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: - supported = node.op == "call_function" and ( - node.target == operator.getitem - or node.target._op not in inductor_fallback_ops - or node.target._op in supported_fallback_operators - ) - if supported and node.target != operator.getitem: - print(f"op {node.target._op} is supported: {supported}") + # supported = node.op == "call_function" and ( + # node.target == operator.getitem + # or str(node.target._op) not in inductor_fallback_ops + # or str(node.target._op) in supported_fallback_operators + # ) + + supported = node.op == "call_function" return supported diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index 0618871bd40..61997e97687 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -156,8 +156,13 @@ def emit_program( instruction_id_to_num_outs_map = {} program_state = _ProgramState() + print( + "111111111111111111111111111111111111111111111111111111111111111111111111111111" + ) + # emit each entry point in order according to name. for name, exported_program in sorted(methods.items()): + print(name) # create empty state emitter_state = _EmitterState( values=[], @@ -169,6 +174,8 @@ def emit_program( emit_mutable_buffer_names=emit_mutable_buffer_names, ) + print("222222222222222222222222222222222222222222222222222222222222222222222") + gm = _remove_non_user_outputs(exported_program) emitter = _TopLevelEmitter( @@ -176,6 +183,9 @@ def emit_program( ) emitter.run() + + print("333333333333333333333333333333333333333333333333333333333333333333333") + plans.append(emitter.plan()) debug_handle_map[name] = emitter.debug_handle_map @@ -192,6 +202,8 @@ def emit_program( if prim_getters is not None: plans.extend(emitter._emit_prim_getters(prim_getters)) + print("333333333333333333333333333333333333333333333333333333333333333333333") + return EmitterOutput( debug_handle_map=debug_handle_map, method_to_delegate_debug_id_map=method_to_delegate_debug_id_map, diff --git a/export_aoti.py b/export_aoti.py index 69cb0782c1b..a3924750362 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -24,11 +24,13 @@ import torch from executorch.backends.aoti.aoti_partitioner import AotiPartitioner -from executorch.exir import to_edge_transform_and_lower +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import to_edge_transform_and_lower, to_edge from torch import nn from torch.export import export from torchvision import models from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +from torchvision.models.resnet import ResNet18_Weights # Model classes @@ -41,6 +43,15 @@ def forward(self, x: torch.Tensor): return self.mv2(x) +class ResNet18(torch.nn.Module): + def __init__(self): + super(ResNet18, self).__init__() + self.resnet18 = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1) + + def forward(self, x: torch.Tensor): + return self.resnet18(x) + + class Linear(torch.nn.Module): def __init__(self): super(Linear, self).__init__() @@ -88,6 +99,15 @@ def 
forward(self, x):
         return self.conv(x)
 
 
+class BatchNorm(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.bn = nn.BatchNorm2d(num_features=16)
+
+    def forward(self, x):
+        return self.bn(x)
+
+
 # Model registry mapping model names to their configurations
 MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {
     "mv2": {
@@ -96,6 +116,12 @@ def forward(self, x):
         "device": "cuda",
         "description": "MobileNetV2 model",
     },
+    "resnet18": {
+        "model_class": ResNet18,
+        "input_shapes": [(1, 3, 224, 224)],
+        "device": "cpu",
+        "description": "ResNet18 model",
+    },
     "linear": {
         "model_class": Linear,
         "input_shapes": [(127, 7)],
@@ -120,6 +146,12 @@ def forward(self, x):
         "device": "cuda",
         "description": "Simple tensor addition model",
     },
+    "batchnorm": {
+        "model_class": BatchNorm,
+        "input_shapes": [(1, 16, 32, 32)],
+        "device": "cuda",
+        "description": "Single BatchNorm2d layer model",
+    },
 }
 
 
@@ -162,14 +194,22 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p
     print("Step 1: Converting to ATen dialect...")
     aten_dialect = export(model, example_inputs)
 
+    # print(aten_dialect)
+    # exit(0)
+
     # 2. to_edge: Make optimizations for Edge devices
     # the aoti part should be decomposed by the internal torch._inductor.aot_compile
     # we should preserve the lowerable part and wait for the aoti backend to handle it
     # Q: maybe need to turn on fallback_random?
+
     edge_program = to_edge_transform_and_lower(
         aten_dialect, partitioner=[AotiPartitioner([])]
     )
 
+    # edge_program = to_edge(aten_dialect)
+
+    print(edge_program.exported_program())
+
     # 3. to_executorch: Convert the graph to an ExecuTorch program
     print("Step 4: Converting to ExecuTorch program...")
     executorch_program = edge_program.to_executorch()
From 3ec8024897fd32f070004bdccebc7d58b8101c12 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Mon, 25 Aug 2025 10:38:16 -0700
Subject: [PATCH 25/50] merge to 0825 main

---
 extension/llm/tokenizers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index 4ed91cc545e..f09feca1584 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit 4ed91cc545e9ed7098e53747656eb7eff24eb305
+Subproject commit f09feca15849a790c05b3b7855e7c62ce26ba94b

From 041f2b618176f49bc3826548f7e5e4de0509c7b6 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Mon, 25 Aug 2025 12:11:26 -0700
Subject: [PATCH 26/50] temp commit

---
 backends/arm/third-party/serialization_lib |  1 -
 install_requirements.py                    | 50 ----------------------
 2 files changed, 51 deletions(-)
 delete mode 160000 backends/arm/third-party/serialization_lib

diff --git a/backends/arm/third-party/serialization_lib b/backends/arm/third-party/serialization_lib
deleted file mode 160000
index 187af0d41fe..00000000000
--- a/backends/arm/third-party/serialization_lib
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 187af0d41fe75d08d2a7ec84c1b4d24b9b641ed2
diff --git a/install_requirements.py b/install_requirements.py
index 7e6d3010f93..0e0084fe3dd 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -12,58 +12,8 @@
 from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible
 
-<<<<<<< HEAD
 # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
 TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
-=======
-def python_is_compatible():
-    # Scrape the version range from pyproject.toml, which should be in the current directory. 
- version_specifier = None - with open("pyproject.toml", "r") as file: - for line in file: - if line.startswith("requires-python"): - match = re.search(r'"([^"]*)"', line) - if match: - version_specifier = match.group(1) - break - - if not version_specifier: - print( - "WARNING: Skipping python version check: version range not found", - file=sys.stderr, - ) - return False - - # Install the packaging module if necessary. - try: - import packaging - except ImportError: - subprocess.run( - [sys.executable, "-m", "pip", "install", "packaging"], check=True - ) - # Compare the current python version to the range in version_specifier. Exits - # with status 1 if the version is not compatible, or with status 0 if the - # version is compatible or the logic itself fails. - try: - import packaging.specifiers - import packaging.version - - python_version = packaging.version.parse(platform.python_version()) - version_range = packaging.specifiers.SpecifierSet(version_specifier) - if python_version not in version_range: - print( - f'ERROR: ExecuTorch does not support python version {python_version}: must satisfy "{version_specifier}"', - file=sys.stderr, - ) - return False - except Exception as e: - print(f"WARNING: Skipping python version check: {e}", file=sys.stderr) - return True - - -# The pip repository that hosts nightly torch packages. -TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cu126" ->>>>>>> fe438f9c92 (Add export_add.py) # Supported CUDA versions - modify this to add/remove supported versions # Format: tuple of (major, minor) version numbers From 8c5bb3bc6de9efb5c9e435c4b65e222adb0509b7 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 25 Aug 2025 23:17:33 -0700 Subject: [PATCH 27/50] use env var to control debug mode --- backends/aoti/aoti_backend.py | 3 - backends/aoti/runtime/shims/memory.cpp | 417 +++++++++++++++--- backends/aoti/runtime/shims/memory.h | 8 + .../aoti/runtime/shims/tensor_attribute.h | 8 + exir/program/_program.py | 9 + export_and_run_aoti.sh | 42 +- export_aoti.py | 9 +- 7 files changed, 430 insertions(+), 66 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index d248ccfae82..f785da00783 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -45,9 +45,6 @@ def preprocess( options: dict[str, typing.Any] = { "aot_inductor.package_constants_in_so": True, "aot_inductor.output_path": output_path, - "aot_inductor.debug_compile": True, - "aot_inductor.repro_level": 3, - "aot_inductor.debug_intermediate_value_printer": "3", "max_autotune": True, "max_autotune_gemm_backends": "TRITON", "max_autotune_conv_backends": "TRITON", diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 2d2bf940833..77a1d26b040 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -166,8 +166,8 @@ AOTITorchError aoti_torch_empty_strided( throw std::runtime_error( "Need to implement empty_strided for non-CUDA non-CPU"); } - std::cout << "Allocated " << nbytes << " bytes at " << ptr << ", sizes_ptr " - << sizes_ptr << std::endl; + std::cout << "////Allocated " << nbytes << " bytes at " << ptr + << ", sizes_ptr " << sizes_ptr << std::endl; // ETensor sizes std::vector sizes(ndim); @@ -175,8 +175,31 @@ AOTITorchError aoti_torch_empty_strided( sizes[i] = sizes_ptr[i]; } + std::cout << "Sizes: "; + for (int i = 0; i < ndim; i++) { + std::cout << sizes[i] << ", "; + } + + std::cout << std::endl; + + // ETensor strides + std::vector 
<int64_t> strides(ndim);
+  if (strides_ptr != nullptr) {
+    // Use provided strides
+    for (int i = 0; i < ndim; i++) {
+      strides[i] = strides_ptr[i];
+    }
+  } else {
+    // Calculate strides from sizes, assume it is in contiguous memory format
+    strides[ndim - 1] = 1; // Last dimension has stride 1
+    for (int i = ndim - 2; i >= 0; i--) {
+      strides[i] = strides[i + 1] * sizes_ptr[i + 1];
+    }
+  }
+  std::cout << std::endl;
+
   // ETensor creation
-  auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr);
+  auto tensor = executorch::extension::from_blob(ptr, sizes, strides);
 
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
@@ -269,76 +292,259 @@ AOTITorchError aoti_torch_copy_(
     AOTITensorHandle self,
     AOTITensorHandle src,
     int32_t non_blocking) {
-  // check if size is the same
+  std::cout << "aoti_torch_copy_ called: self=" << self << ", src=" << src
+            << std::endl;
+
+  // assert same dim for now
   if (self->dim() != src->dim()) {
-    std::cout << "self.dim() " << self->dim() << ", src.dim() " << src->dim()
-              << std::endl;
-    throw std::runtime_error("self.dim() != src.dim()");
-  }
-  std::cout << "self->data_ptr(): " << self->data_ptr()
-            << " sizes: " << self->sizes().data() << std::endl;
-  std::cout << "src->data_ptr(): " << src->data_ptr()
-            << " sizes: " << src->sizes().data() << std::endl;
-  for (int i = 0; i < self->dim(); i++) {
-    if (self->sizes()[i] != src->sizes()[i]) {
-      std::cout << "self.sizes()[i] " << self->sizes()[i] << ", src.sizes()[i] "
-                << src->sizes()[i] << std::endl;
-      throw std::runtime_error("size mismatch");
+    std::cout << "Error: dimension mismatch. self.dim()=" << self->dim()
+              << ", src.dim()=" << src->dim() << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // only support float32 for now
+  int32_t self_dtype, src_dtype;
+  aoti_torch_get_dtype(self, &self_dtype);
+  aoti_torch_get_dtype(src, &src_dtype);
+
+  if (self_dtype != 6 || src_dtype != 6) { // 6 = float32
+    std::cout << "Error: Only float32 tensors supported. 
Got self.dtype="
+              << self_dtype << ", src.dtype=" << src_dtype << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Get stride information for layout validation
+  int64_t* self_strides;
+  int64_t* src_strides;
+  aoti_torch_get_strides(self, &self_strides);
+  aoti_torch_get_strides(src, &src_strides);
+
+  auto self_sizes = self->sizes();
+  auto src_sizes = src->sizes();
+
+  // only contiguous or channels-last layouts are allowed for ETensor
+  bool self_is_contiguous = true;
+  bool src_is_contiguous = true;
+  bool self_is_channels_last = false;
+  bool src_is_channels_last = false;
+
+  // Check if contiguous (strides decrease from left to right)
+  int64_t expected_stride = 1;
+  for (int i = self->dim() - 1; i >= 0; i--) {
+    if (self_strides[i] != expected_stride) {
+      self_is_contiguous = false;
     }
+    expected_stride *= self_sizes[i];
   }
-  int size = src->nbytes();
-  // should check for device
+  expected_stride = 1;
+  for (int i = src->dim() - 1; i >= 0; i--) {
+    if (src_strides[i] != expected_stride) {
+      src_is_contiguous = false;
+    }
+    expected_stride *= src_sizes[i];
+  }
+
+  // Check if channels-last (4D: NHWC, strides in order [H*W*C, 1, W*C, C])
+  if (self->dim() == 4 && !self_is_contiguous) {
+    int64_t N = self_sizes[0], H = self_sizes[1], W = self_sizes[2],
+            C = self_sizes[3];
+    if ((self_strides[0] == H * W * C || N <= 1) && (self_strides[1] == W * C || H <= 1) &&
+        (self_strides[2] == C || W <= 1) && (self_strides[3] == 1 || C <= 1)) {
+      self_is_channels_last = true;
+    }
+  }
+
+  if (src->dim() == 4 && !src_is_contiguous) {
+    int64_t N = src_sizes[0], H = src_sizes[1], W = src_sizes[2],
+            C = src_sizes[3];
+    if ((src_strides[0] == H * W * C || N <= 1) && (src_strides[1] == W * C || H <= 1) &&
+        (src_strides[2] == C || W <= 1) && (src_strides[3] == 1 || C <= 1)) {
+      src_is_channels_last = true;
+    }
+  }
+
+  // Validate layout assumptions
+  if (!self_is_contiguous && !self_is_channels_last) {
+    std::cout << "Error: self tensor must be contiguous or channels-last. "
+              << "Got strides: [";
+    for (int i = 0; i < self->dim(); i++) {
+      std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : "");
+    }
+    std::cout << "]" << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  if (!src_is_contiguous && !src_is_channels_last) {
+    std::cout << "Error: src tensor must be contiguous or channels-last. "
+              << "Got strides: [";
+    for (int i = 0; i < src->dim(); i++) {
+      std::cout << src_strides[i] << (i < src->dim() - 1 ? ", " : "");
+    }
+    std::cout << "]" << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Determine device locations
   cudaPointerAttributes srcAttributes, dstAttributes;
   cudaError_t err;
-  // Get attributes of the source pointer
+
   err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr());
   checkCudaError(err, "Failed to get source pointer attributes");
-  // Get attributes of the destination pointer
+
   err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr());
   checkCudaError(err, "Failed to get destination pointer attributes");
+
   bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice;
   bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice;
-  // Determine the memory locations and perform the appropriate copy
-  if (srcIsDevice && dstIsDevice) {
-    // Device to Device copy
-    err = cudaMemcpy(
-        self->mutable_data_ptr(),
-        src->data_ptr(),
-        size,
-        cudaMemcpyDeviceToDevice);
-    checkCudaError(err, "Failed to copy from device to device");
-  } else if (srcIsDevice && !dstIsDevice) {
-    // Device to Host copy
+
+  std::cout << "Copy layout: src="
+            << (src_is_contiguous ? 
"contiguous" : "channels-last") << " (" + << (srcIsDevice ? "GPU" : "CPU") << ") -> " + << "dst=" << (self_is_contiguous ? "contiguous" : "channels-last") + << " (" << (dstIsDevice ? "GPU" : "CPU") << ")" << std::endl; + + size_t total_bytes = src->nbytes(); + + // Check if we can do a simple memcpy (same layout) + bool same_layout = (self_is_contiguous && src_is_contiguous) || + (self_is_channels_last && src_is_channels_last); + + if (same_layout) { + std::cout << "Same layout - doing direct copy of " << total_bytes + << " bytes" << std::endl; + + // Simple copy since layouts match + if (srcIsDevice && dstIsDevice) { + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyDeviceToDevice); + checkCudaError(err, "Failed to copy from device to device"); + } else if (srcIsDevice && !dstIsDevice) { + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyDeviceToHost); + checkCudaError(err, "Failed to copy from device to host"); + } else if (!srcIsDevice && dstIsDevice) { + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyHostToDevice); + checkCudaError(err, "Failed to copy from host to device"); + } else { + std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes); + } + } else { + // Layout conversion needed (contiguous <-> channels-last) + std::cout << "Layout conversion needed - doing element-wise copy" + << std::endl; + + if (self->dim() != 4) { + std::cout << "Error: Layout conversion only supported for 4D tensors" + << std::endl; + return Error::NotImplemented; + } + + // Get data to host for processing + size_t total_elements = total_bytes / sizeof(float); + float* src_host_data = nullptr; + float* dst_host_data = nullptr; + bool need_free_src = false; + bool need_free_dst = false; + + if (srcIsDevice) { + src_host_data = new float[total_elements]; + err = cudaMemcpy( + src_host_data, src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost); + checkCudaError(err, "Failed to copy src to host"); + need_free_src = true; + } else { + src_host_data = static_cast(src->data_ptr()); + } + + if (dstIsDevice) { + dst_host_data = new float[total_elements]; + need_free_dst = true; + } else { + dst_host_data = static_cast(self->mutable_data_ptr()); + } + + // Perform layout conversion (4D NCHW <-> NHWC) + int64_t N = self_sizes[0], C = self_sizes[1], H = self_sizes[2], + W = self_sizes[3]; + + for (int64_t n = 0; n < N; n++) { + for (int64_t c = 0; c < C; c++) { + for (int64_t h = 0; h < H; h++) { + for (int64_t w = 0; w < W; w++) { + size_t src_offset, dst_offset; + + if (src_is_contiguous) { + // Source is NCHW + src_offset = n * C * H * W + c * H * W + h * W + w; + } else { + // Source is NHWC + src_offset = n * H * W * C + h * W * C + w * C + c; + } + + if (self_is_contiguous) { + // Destination is NCHW + dst_offset = n * C * H * W + c * H * W + h * W + w; + } else { + // Destination is NHWC + dst_offset = n * H * W * C + h * W * C + w * C + c; + } + + dst_host_data[dst_offset] = src_host_data[src_offset]; + } + } + } + } + + // Copy result back to device if needed + if (dstIsDevice) { + err = cudaMemcpy( + self->mutable_data_ptr(), + dst_host_data, + total_bytes, + cudaMemcpyHostToDevice); + checkCudaError(err, "Failed to copy result to device"); + } + + // Clean up temporary buffers + if (need_free_src) + delete[] src_host_data; + if (need_free_dst) + delete[] dst_host_data; + } + + // Verify the copy by checking first element + float src_first, dst_first; + if 
+    for (int64_t n = 0; n < N; n++) {
+      for (int64_t c = 0; c < C; c++) {
+        for (int64_t h = 0; h < H; h++) {
+          for (int64_t w = 0; w < W; w++) {
+            size_t src_offset, dst_offset;
+
+            if (src_is_contiguous) {
+              // Source is NCHW
+              src_offset = n * C * H * W + c * H * W + h * W + w;
+            } else {
+              // Source is NHWC
+              src_offset = n * H * W * C + h * W * C + w * C + c;
+            }
+
+            if (self_is_contiguous) {
+              // Destination is NCHW
+              dst_offset = n * C * H * W + c * H * W + h * W + w;
+            } else {
+              // Destination is NHWC
+              dst_offset = n * H * W * C + h * W * C + w * C + c;
+            }
+
+            dst_host_data[dst_offset] = src_host_data[src_offset];
+          }
+        }
+      }
+    }
+
+    // Copy result back to device if needed
+    if (dstIsDevice) {
+      err = cudaMemcpy(
+          self->mutable_data_ptr(),
+          dst_host_data,
+          total_bytes,
+          cudaMemcpyHostToDevice);
+      checkCudaError(err, "Failed to copy result to device");
+    }
+
+    // Clean up temporary buffers
+    if (need_free_src)
+      delete[] src_host_data;
+    if (need_free_dst)
+      delete[] dst_host_data;
+  }
+
+  // Verify the copy by checking first element
+  float src_first, dst_first;
+  if (srcIsDevice) {
     err = cudaMemcpy(
-        self->mutable_data_ptr(),
-        src->data_ptr(),
-        size,
-        cudaMemcpyDeviceToHost);
-    std::cout << "Device to Host copy, self data: "
-              << ((float*)self->data_ptr())[0] << std::endl;
-    checkCudaError(err, "Failed to copy from device to host");
-  } else if (!srcIsDevice && dstIsDevice) {
-    // Host to Device copy
+        &src_first, src->data_ptr(), sizeof(float), cudaMemcpyDeviceToHost);
+    checkCudaError(err, "Failed to copy first src element");
+  } else {
+    src_first = static_cast<float*>(src->data_ptr())[0];
+  }
+
+  if (dstIsDevice) {
     err = cudaMemcpy(
-        self->mutable_data_ptr(),
-        src->data_ptr(),
-        size,
-        cudaMemcpyHostToDevice);
-    std::cout << "Host to Device copy, src data: "
-              << ((float*)src->data_ptr())[0] << std::endl;
-    checkCudaError(err, "Failed to copy from host to device");
-  } else if (!srcIsDevice && !dstIsDevice) {
-    // Host to Host copy
-    std::cout << "Host to Host copy, src data: " << ((float*)src->data_ptr())[0]
-              << std::endl;
-    std::memcpy(self->mutable_data_ptr(), src->data_ptr(), size);
+        &dst_first, self->data_ptr(), sizeof(float), cudaMemcpyDeviceToHost);
+    checkCudaError(err, "Failed to copy first dst element");
   } else {
-    std::cerr << "Error: Unknown memory type. self: " << dstAttributes.type
-              << ", src: " << srcAttributes.type << std::endl;
-    throw std::runtime_error("Unknown memory type");
+    dst_first = static_cast<float*>(self->data_ptr())[0];
   }
-  // print first value of src and self
+
+  std::cout << "Copy verification: src[0]=" << src_first
+            << ", dst[0]=" << dst_first << std::endl;
+  std::cout << "aoti_torch_copy_ completed successfully" << std::endl;
+
   return Error::Ok;
 }
 
@@ -385,6 +591,103 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(
   return Error::Ok;
 }
 
+AOTITorchError aoti_torch__reinterpret_tensor(
+    AOTITensorHandle self,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    AOTITensorHandle* ret_new_tensor) {
+  std::cout << "aoti_torch__reinterpret_tensor called with tensor " << self
+            << ", ndim: " << ndim << ", storage_offset: " << storage_offset
+            << std::endl;
+
+  for (int i = 0; i < ndim; i++) {
+    std::cout << "sizes[" << i << "]: " << sizes_ptr[i] << std::endl;
+  }
+  for (int i = 0; i < ndim; i++) {
+    std::cout << "strides[" << i << "]: " << strides_ptr[i] << std::endl;
+  }
+
+  // Check if storage_offset is not 0 - return error if not
+  if (storage_offset != 0) {
+    std::cout
+        << "Error: aoti_torch__reinterpret_tensor does not support non-zero storage_offset: "
+        << storage_offset << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Check if dimensions match
+  if (self->dim() != ndim) {
+    std::cout << "Error: tensor dimension mismatch. 
self->dim(): " + << self->dim() << ", provided ndim: " << ndim << std::endl; + return Error::InvalidArgument; + } + + // Get tensor properties from the input tensor + int32_t dtype; + AOTITorchError dtype_err = aoti_torch_get_dtype(self, &dtype); + if (dtype_err != Error::Ok) { + std::cout << "Error: failed to get dtype from input tensor" << std::endl; + return dtype_err; + } + + int32_t device_type; + AOTITorchError device_type_err = + aoti_torch_get_device_type(self, &device_type); + if (device_type_err != Error::Ok) { + std::cout << "Error: failed to get device_type from input tensor" + << std::endl; + return device_type_err; + } + + int32_t device_index; + AOTITorchError device_index_err = + aoti_torch_get_device_index(self, &device_index); + if (device_index_err != Error::Ok) { + std::cout << "Error: failed to get device_index from input tensor" + << std::endl; + return device_index_err; + } + + std::cout << "Creating new tensor with dtype: " << dtype + << ", device_type: " << device_type + << ", device_index: " << device_index << std::endl; + + // Create new tensor with the provided sizes and strides using + // aoti_torch_empty_strided + AOTITorchError create_err = aoti_torch_empty_strided( + ndim, + sizes_ptr, + strides_ptr, + dtype, + device_type, + device_index, + ret_new_tensor); + + if (create_err != Error::Ok) { + std::cout << "Error: failed to create new tensor with empty_strided" + << std::endl; + return create_err; + } + + // Copy data from source tensor to new tensor + AOTITorchError copy_err = aoti_torch_copy_(*ret_new_tensor, self, 0); + if (copy_err != Error::Ok) { + std::cout << "Error: failed to copy data from source tensor to new tensor" + << std::endl; + // Clean up the created tensor on failure + aoti_torch_delete_tensor_object(*ret_new_tensor); + *ret_new_tensor = nullptr; + return copy_err; + } + + std::cout << "Successfully created reinterpreted tensor " << *ret_new_tensor + << " from source tensor " << self << std::endl; + + return Error::Ok; +} + // Cleanup function for clearing global state void cleanup_memory() { is_tensor_own_memory.clear(); diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 996c729b4be..0b8af138c90 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -92,6 +92,14 @@ AOTITorchError aoti_torch_create_cuda_stream_guard( AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTITorchError aoti_torch__reinterpret_tensor( + AOTITensorHandle self, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + AOTITensorHandle* ret_new_tensor); + // Utility functions void checkCudaError(cudaError_t err, const char* msg); void cleanup_memory(); diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h index 3ed966f99dc..f419f7db632 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/runtime/shims/tensor_attribute.h @@ -56,6 +56,14 @@ AOTITorchError aoti_torch_get_storage_size( AOTITensorHandle tensor, int64_t* ret_size); +AOTITorchError aoti_torch_get_device_type( + AOTITensorHandle tensor, + int32_t* ret_device_type); + +AOTITorchError aoti_torch_get_device_index( + AOTITensorHandle tensor, + int32_t* ret_device_index); + // Utility functions for device and layout information int32_t aoti_torch_device_type_cpu(); int32_t aoti_torch_device_type_cuda(); diff --git a/exir/program/_program.py b/exir/program/_program.py 
index af94399a3ed..760056e32bb 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1697,8 +1697,17 @@ def to_executorch( # noqa (FLAKE8) C901 after it has been transformed to the ExecuTorch backend. """ config = config if config else ExecutorchBackendConfig() + + def exported_program_to_device(exported_program, device): + for _, param in exported_program.named_parameters(): + param.data = param.data.to(device) + for _, buffer in exported_program.named_buffers(): + buffer.data = buffer.data.to(device) + return exported_program + execution_programs: Dict[str, ExportedProgram] = {} for name, program in self._edge_programs.items(): + program = exported_program_to_device(program, "cpu") if config.do_quant_fusion_and_const_prop: if program.graph_signature.backward_signature is not None: raise Exception( diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 54f1d0b5092..e850c2bb6bb 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -18,14 +18,19 @@ set -e # Exit on any error # Parse command line arguments MODE="reinstall_all" MODEL_ARG="$1" +DEBUG_MODE=false -# Parse arguments for mode +# Parse arguments for mode and debug flag for arg in "$@"; do case $arg in --mode=*) MODE="${arg#*=}" shift ;; + --debug) + DEBUG_MODE=true + shift + ;; reinstall_all|reinstall_aot|reinstall_runtime|inference|export_aoti_only) # If it's the second argument and a valid mode, use it as mode if [[ "$arg" == "$2" ]]; then @@ -49,6 +54,7 @@ case "$MODE" in echo " ./export_and_run_aoti.sh conv2d inference # Positional mode" echo " ./export_and_run_aoti.sh conv2d --mode=inference # GNU-style mode" echo " ./export_and_run_aoti.sh conv2d export_aoti_only # Export AOTI only (no runtime)" + echo " ./export_and_run_aoti.sh conv2d --mode=inference --debug # With debug options enabled" exit 1 ;; esac @@ -130,10 +136,30 @@ run_inference() { ./cmake-out/executor_runner --model_path aoti_model.pte } +# Set up environment variables based on debug flag +if [[ "$DEBUG_MODE" == true ]]; then + echo "Setting debug environment variables..." 
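+    # (Editor's note, hedged: these variables replace the aot_inductor.debug_compile,
+    #  repro_level, and debug_intermediate_value_printer options removed from
+    #  aoti_backend.py in this same patch; the exact environment variable names
+    #  are taken from this script and are not verified against a given PyTorch
+    #  nightly.)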
+ export AOT_INDUCTOR_DEBUG_COMPILE="1" + export AOTINDUCTOR_REPRO_LEVEL=3 + export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + echo "Debug variables set:" + echo " AOT_INDUCTOR_DEBUG_COMPILE=$AOT_INDUCTOR_DEBUG_COMPILE" + echo " AOTINDUCTOR_REPRO_LEVEL=$AOTINDUCTOR_REPRO_LEVEL" + echo " AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=$AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER" +else + # Ensure debug variables are unset for non-debug modes + unset AOT_INDUCTOR_DEBUG_COMPILE + unset AOTINDUCTOR_REPRO_LEVEL + unset AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER +fi + # Execute based on mode case "$MODE" in "reinstall_all") - echo "Mode: reinstall_all - Full reinstall and run" + echo "Mode: $MODE - Full reinstall and run" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi install_executorch export_aoti_model clean_install_executorch @@ -142,23 +168,35 @@ case "$MODE" in ;; "reinstall_aot") echo "Mode: reinstall_aot - Reinstall AOT components and run e2e" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi install_executorch export_aoti_model run_inference ;; "reinstall_runtime") echo "Mode: reinstall_runtime - Rebuild runtime and run e2e" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi export_aoti_model build_runtime run_inference ;; "inference") echo "Mode: inference - Export model and run inference only" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi export_aoti_model run_inference ;; "export_aoti_only") echo "Mode: export_aoti_only - Export model using pure AOTI only (no runtime or installation)" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi export_aoti_model "--aoti_only" ;; *) diff --git a/export_aoti.py b/export_aoti.py index a3924750362..2550f33a55a 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -24,8 +24,9 @@ import torch from executorch.backends.aoti.aoti_partitioner import AotiPartitioner -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.exir import to_edge_transform_and_lower, to_edge + +# from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import to_edge, to_edge_transform_and_lower from torch import nn from torch.export import export from torchvision import models @@ -119,7 +120,7 @@ def forward(self, x): "resnet18": { "model_class": ResNet18, "input_shapes": [(1, 3, 224, 224)], - "device": "cpu", + "device": "cuda", "description": "ResNet18 model", }, "linear": { @@ -247,7 +248,7 @@ def export_model_to_pure_aoti(model, example_inputs): "aot_inductor.output_path": output_path, "aot_inductor.debug_compile": True, "aot_inductor.repro_level": 3, - "aot_inductor.debug_intermediate_value_printer": "3", + "aot_inductor.debug_intermediate_value_printer": "2", "max_autotune": True, "max_autotune_gemm_backends": "TRITON", "max_autotune_conv_backends": "TRITON", From 687bcdf8c939406f03c7bb184efdd5cd9d6ce369 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 26 Aug 2025 14:08:38 -0700 Subject: [PATCH 28/50] enable dump runtime intermediate output for aoti delegated part --- .gitignore | 1 + backends/aoti/CMakeLists.txt | 3 +- backends/aoti/runtime/aoti_backend.cpp | 2 + backends/aoti/runtime/shims/memory.cpp | 135 +++++++----- .../aoti/runtime/shims/tensor_attribute.cpp | 
7 + .../aoti/runtime/shims/tensor_attribute.h | 2 + backends/aoti/runtime/shims/utils.cpp | 202 ++++++++++++++++++ backends/aoti/runtime/shims/utils.h | 39 ++++ backends/aoti/runtime/targets.bzl | 2 + export_and_run_aoti.sh | 64 ++++-- 10 files changed, 393 insertions(+), 64 deletions(-) create mode 100644 backends/aoti/runtime/shims/utils.cpp create mode 100644 backends/aoti/runtime/shims/utils.h diff --git a/.gitignore b/.gitignore index 78268c70d8c..92b68cbc2d7 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ tokenizer.json *kernel.cpp *wrapper_metadata.json *wrapper.cpp +aoti_intermediate_output.txt # Editor temporaries *.idea diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 6922d5e9356..ca26f30d73e 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -31,7 +31,8 @@ set(_aoti_sources runtime/aoti_backend.cpp runtime/aoti_model_container.cpp runtime/shims/memory.cpp - runtime/shims/tensor_attribute.cpp) + runtime/shims/tensor_attribute.cpp + runtime/shims/utils.cpp) add_library(aoti_backend STATIC ${_aoti_sources}) target_include_directories( aoti_backend diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 453613d47f8..03c46c03bdd 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -29,6 +29,7 @@ #include "aoti_model_container.h" #include "shims/memory.h" #include "shims/tensor_attribute.h" +#include "shims/utils.h" // Include CUDA AOTI shims #include @@ -374,6 +375,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { free(handle); cleanup_memory(); cleanup_tensor_metadata(); + cleanup_aoti_tensor_output(); ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); } }; diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 77a1d26b040..09a773dd43e 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -322,67 +322,106 @@ AOTITorchError aoti_torch_copy_( auto self_sizes = self->sizes(); auto src_sizes = src->sizes(); - // contiguous or channel-last layouts allowed in ettensor + // Check if tensors have the same tensor schema (sizes, strides, dtype) + bool same_schema = true; + + // Check sizes match + for (int i = 0; i < self->dim(); i++) { + if (self_sizes[i] != src_sizes[i]) { + same_schema = false; + break; + } + } + + // Check strides match (only if sizes match) + if (same_schema) { + for (int i = 0; i < self->dim(); i++) { + if (self_strides[i] != src_strides[i]) { + same_schema = false; + break; + } + } + } + + // Declare layout variables for both cases bool self_is_contiguous = true; bool src_is_contiguous = true; bool self_is_channels_last = false; bool src_is_channels_last = false; - // Check if contiguous (strides decrease from left to right) - int64_t expected_stride = 1; - for (int i = self->dim() - 1; i >= 0; i--) { - if (self_strides[i] != expected_stride) { - self_is_contiguous = false; + if (same_schema) { + std::cout << "Same tensor schema detected - enabling naive copy" + << std::endl; + // For same schema, we don't need to check memory formats - just use direct + // copy + } else { + // Different strides: check memory format and only support contiguous <-> + // channels-last conversion + std::cout + << "Different tensor schemas - checking memory format compatibility" + << std::endl; + + // Check if contiguous (strides decrease from left to right) + int64_t expected_stride = 1; + for (int i = 
self->dim() - 1; i >= 0; i--) { + if (self_strides[i] != expected_stride) { + self_is_contiguous = false; + } + expected_stride *= self_sizes[i]; } - expected_stride *= self_sizes[i]; - } - expected_stride = 1; - for (int i = src->dim() - 1; i >= 0; i--) { - if (src_strides[i] != expected_stride) { - src_is_contiguous = false; + expected_stride = 1; + for (int i = src->dim() - 1; i >= 0; i--) { + if (src_strides[i] != expected_stride) { + src_is_contiguous = false; + } + expected_stride *= src_sizes[i]; } - expected_stride *= src_sizes[i]; - } - // Check if channels-last (4D: NHWC, strides in order [H*W*C, 1, W*C, C]) - if (self->dim() == 4 && !self_is_contiguous) { - int64_t N = self_sizes[0], H = self_sizes[1], W = self_sizes[2], - C = self_sizes[3]; - if ((self_strides[0] == H * W * C || N <= 1) && (self_strides[1] == W * C || H <= 1) && - (self_strides[2] == C || W == 1) && (self_strides[3] == 1 || C == 1)) { - self_is_channels_last = true; + // Check if channels-last (4D: NHWC, strides in order [H*W*C, 1, W*C, C]) + if (self->dim() == 4 && !self_is_contiguous) { + int64_t N = self_sizes[0], H = self_sizes[1], W = self_sizes[2], + C = self_sizes[3]; + if ((self_strides[0] == H * W * C || N <= 1) && + (self_strides[1] == W * C || H <= 1) && + (self_strides[2] == C || W == 1) && + (self_strides[3] == 1 || C == 1)) { + self_is_channels_last = true; + } } - } - if (src->dim() == 4 && !src_is_contiguous) { - int64_t N = src_sizes[0], H = src_sizes[1], W = src_sizes[2], - C = src_sizes[3]; - if ((src_strides[0] == H * W * C || N <= 1) &&( src_strides[1] == W * C || H <= 1) && - (src_strides[2] == C || W <= 1) && (src_strides[3] == 1 || C <= 1)) { - src_is_channels_last = true; + if (src->dim() == 4 && !src_is_contiguous) { + int64_t N = src_sizes[0], H = src_sizes[1], W = src_sizes[2], + C = src_sizes[3]; + if ((src_strides[0] == H * W * C || N <= 1) && + (src_strides[1] == W * C || H <= 1) && + (src_strides[2] == C || W <= 1) && (src_strides[3] == 1 || C <= 1)) { + src_is_channels_last = true; + } } - } - // Validate layout assumptions - if (!self_is_contiguous && !self_is_channels_last) { - std::cout << "Error: self tensor must be contiguous or channels-last. " - << "Got strides: ["; - for (int i = 0; i < self->dim(); i++) { - std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : ""); + // Validate layout assumptions only when schemas differ + if (!self_is_contiguous && !self_is_channels_last) { + std::cout + << "Error: self tensor must be contiguous or channels-last for stride conversion. " + << "Got strides: ["; + for (int i = 0; i < self->dim(); i++) { + std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : ""); + } + std::cout << "]" << std::endl; + return Error::InvalidArgument; } - std::cout << "]" << std::endl; - return Error::InvalidArgument; - } - if (!src_is_contiguous && !src_is_channels_last) { - std::cout << "Error: src tensor must be contiguous or channels-last. " - << "Got strides: ["; - for (int i = 0; i < src->dim(); i++) { - std::cout << src_strides[i] << (i < src->dim() - 1 ? ", " : ""); + if (!src_is_contiguous && !src_is_channels_last) { + std::cout + << "Error: src tensor must be contiguous or channels-last for stride conversion. " + << "Got strides: ["; + for (int i = 0; i < src->dim(); i++) { + std::cout << src_strides[i] << (i < src->dim() - 1 ? 
", " : ""); + } + std::cout << "]" << std::endl; + return Error::InvalidArgument; } - std::cout << "]" << std::endl; - return Error::InvalidArgument; } // Determine device locations @@ -406,11 +445,7 @@ AOTITorchError aoti_torch_copy_( size_t total_bytes = src->nbytes(); - // Check if we can do a simple memcpy (same layout) - bool same_layout = (self_is_contiguous && src_is_contiguous) || - (self_is_channels_last && src_is_channels_last); - - if (same_layout) { + if (same_schema) { std::cout << "Same layout - doing direct copy of " << total_bytes << " bytes" << std::endl; diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index eb3d0e22371..dcea848597e 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -123,6 +123,13 @@ AOTITorchError aoti_torch_get_device_index( return Error::Ok; } +AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim) { + *ret_dim = tensor->dim(); + std::cout << "getting dim from tensor " << tensor << " = " << *ret_dim + << std::endl; + return Error::Ok; +} + int32_t aoti_torch_device_type_cpu() { // Let's say cpu is 0 for ET as well return 0; diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h index f419f7db632..ab4f8037ebf 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/runtime/shims/tensor_attribute.h @@ -64,6 +64,8 @@ AOTITorchError aoti_torch_get_device_index( AOTITensorHandle tensor, int32_t* ret_device_index); +AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim); + // Utility functions for device and layout information int32_t aoti_torch_device_type_cpu(); int32_t aoti_torch_device_type_cuda(); diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp new file mode 100644 index 00000000000..10882c16cf4 --- /dev/null +++ b/backends/aoti/runtime/shims/utils.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "utils.h" +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +namespace internal { +// Constants for file operations +const char* const TENSOR_OUTPUT_FILENAME = + "/home/gasoonjia/executorch/aoti_intermediate_output.txt"; +} // namespace internal + +extern "C" { + +void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) { + printf("Printing tensor handle: %p\n", self); + + if (!self) { + throw std::runtime_error("Tensor handle is null"); + } + + printf("Tensor handle is not null\n"); + + // Get dtype and check if it's float32 (dtype 6 in PyTorch) + int32_t dtype = 0; + if (aoti_torch_get_dtype(self, &dtype) != AOTI_TORCH_SUCCESS) { + throw std::runtime_error("Failed to get tensor dtype"); + } + + printf("Tensor dtype is: %d\n", dtype); + + if (dtype != 6) { // 6 is the dtype code for float32 in PyTorch + throw std::runtime_error( + "Tensor dtype is not float32. 
Expected dtype 6, got: " + + std::to_string(dtype)); + } + + printf("Tensor dtype is float32\n"); + + // Get data pointer + void* data_ptr = nullptr; + if (aoti_torch_get_data_ptr(self, &data_ptr) != AOTI_TORCH_SUCCESS || + !data_ptr) { + throw std::runtime_error("Failed to get tensor data pointer"); + } + + printf("Tensor data pointer is %p not null\n", data_ptr); + + // Get dimensions + int64_t dim = 0; + if (aoti_torch_get_dim(self, &dim) != AOTI_TORCH_SUCCESS) { + throw std::runtime_error("Failed to get tensor dimensions"); + } + + printf("Tensor dimensions are: %ld\n", dim); + + // Get sizes + int64_t* sizes = nullptr; + if (aoti_torch_get_sizes(self, &sizes) != AOTI_TORCH_SUCCESS || !sizes) { + throw std::runtime_error("Failed to get tensor sizes"); + } + + printf("Tensor sizes are: %ld\n", sizes); + + // Calculate total number of elements + int64_t total_elements = 1; + for (int i = 0; i < dim; i++) { + total_elements *= sizes[i]; + } + + printf("Total elements in tensor: %ld\n", total_elements); + + // Check device type to handle CUDA tensors properly + int32_t device_type = 0; + if (aoti_torch_get_device_type(self, &device_type) != AOTI_TORCH_SUCCESS) { + throw std::runtime_error("Failed to get tensor device type"); + } + + printf("Tensor device type: %d\n", device_type); + + AtenTensorHandle cpu_tensor = nullptr; + const float* float_data = nullptr; + bool need_cleanup = false; + + // Check if tensor is on CUDA (device_type 1 is CUDA) + if (device_type == 1) { + printf("Tensor is on CUDA, copying to CPU...\n"); + + // Get strides for creating CPU tensor + int64_t* strides = nullptr; + if (aoti_torch_get_strides(self, &strides) != AOTI_TORCH_SUCCESS || + !strides) { + throw std::runtime_error("Failed to get tensor strides"); + } + + // Create a CPU tensor with same shape and layout + if (aoti_torch_empty_strided( + dim, sizes, strides, dtype, 0, -1, &cpu_tensor) != + AOTI_TORCH_SUCCESS) { + throw std::runtime_error("Failed to create CPU tensor"); + } + + // Copy data from CUDA to CPU tensor + if (aoti_torch_copy_(cpu_tensor, self, 0) != AOTI_TORCH_SUCCESS) { + aoti_torch_delete_tensor_object(cpu_tensor); + throw std::runtime_error("Failed to copy tensor from CUDA to CPU"); + } + + // Get CPU data pointer + void* cpu_data_ptr = nullptr; + if (aoti_torch_get_data_ptr(cpu_tensor, &cpu_data_ptr) != + AOTI_TORCH_SUCCESS || + !cpu_data_ptr) { + aoti_torch_delete_tensor_object(cpu_tensor); + throw std::runtime_error("Failed to get CPU tensor data pointer"); + } + + float_data = static_cast(cpu_data_ptr); + need_cleanup = true; + printf("Successfully copied CUDA tensor to CPU\n"); + } else { + // Tensor is already on CPU, use original data pointer + printf("Tensor is on CPU, using original data pointer\n"); + float_data = static_cast(data_ptr); + } + + // Open file for writing (append mode to not overwrite previous outputs) + printf("Writing tensor to file: %s\n", internal::TENSOR_OUTPUT_FILENAME); + + std::ofstream output_file( + internal::TENSOR_OUTPUT_FILENAME, std::ios::out | std::ios::app); + if (!output_file.is_open()) { + if (need_cleanup) { + aoti_torch_delete_tensor_object(cpu_tensor); + } + throw std::runtime_error( + "Failed to open output file: " + + std::string(internal::TENSOR_OUTPUT_FILENAME)); + } + + printf("Successfully opened file for writing\n"); + + // Write message and tensor info to file + output_file << "=== " << msg << " ===" << std::endl; + output_file << "Device type: " << device_type << std::endl; + output_file << "Dimensions: " << dim << std::endl; + 
output_file << "Sizes: ["; + for (int i = 0; i < dim; i++) { + output_file << sizes[i]; + if (i < dim - 1) + output_file << ", "; + } + output_file << "]" << std::endl; + output_file << "Total elements: " << total_elements << std::endl; + output_file << "Data content:" << std::endl; + + // Write tensor data to file (now safe to access) + for (int64_t i = 0; i < total_elements; i++) { + output_file << float_data[i] << " "; + if (i < total_elements - 1) { + output_file << ", "; + // Add newline every 10 elements for readability + if ((i + 1) % 10 == 0) { + output_file << std::endl; + } + } + } + output_file << std::endl << std::endl; + + // Clean up CPU tensor if we created one + if (need_cleanup) { + aoti_torch_delete_tensor_object(cpu_tensor); + printf("Cleaned up temporary CPU tensor\n"); + } + + // File will be automatically closed when output_file goes out of scope +} + +// Function to cleanup the tensor output file (to be called from +// aoti_backend.cpp) +void cleanup_aoti_tensor_output() { + // No cleanup needed since file is opened and closed on each call +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h new file mode 100644 index 00000000000..6bcd34efcfb --- /dev/null +++ b/backends/aoti/runtime/shims/utils.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// Type definitions +using AOTITensorHandle = Tensor*; +using AOTITorchError = Error; + +// Utility function for printing tensor information +void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg); + +// Cleanup function for tensor output file (called during backend destruction) +void cleanup_aoti_tensor_output(); + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl index 28c9e893721..2c87ad68a2c 100644 --- a/backends/aoti/runtime/targets.bzl +++ b/backends/aoti/runtime/targets.bzl @@ -8,11 +8,13 @@ def define_common_targets(): "aoti_model_container.cpp", "shims/memory.cpp", "shims/tensor_attribute.cpp", + "shims/utils.cpp", ], headers = [ "aoti_model_container.h", "shims/memory.h", "shims/tensor_attribute.h", + "shims/utils.h", ], # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) link_whole = True, diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index e850c2bb6bb..ebb1a44239e 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -3,14 +3,19 @@ # Script to export and run AOTI with different modes # Usage: # ./export_and_run_aoti.sh [mode] -# ./export_and_run_aoti.sh --mode= +# ./export_and_run_aoti.sh --mode= [--debug] [--dump] # # Examples: -# ./export_and_run_aoti.sh conv2d # Uses default mode (reinstall_all) -# ./export_and_run_aoti.sh conv2d inference # Uses inference mode -# ./export_and_run_aoti.sh conv2d --mode=inference # Alternative syntax +# ./export_and_run_aoti.sh conv2d # Uses default mode (reinstall_all) +# ./export_and_run_aoti.sh conv2d inference # Uses inference 
mode +# ./export_and_run_aoti.sh conv2d --mode=inference # Alternative syntax +# ./export_and_run_aoti.sh conv2d --mode=inference --dump # With AOTI intermediate output dumping +# ./export_and_run_aoti.sh conv2d --mode=inference --debug --dump # With both debug and dump # # Available modes: reinstall_all (default), reinstall_aot, reinstall_runtime, inference, export_aoti_only +# Flags: +# --debug: Enable debug mode with extensive logging +# --dump: Enable AOTI intermediate output dumping to aoti_intermediate_output.txt # model_arg: argument to pass to export_aoti.py set -e # Exit on any error @@ -19,6 +24,7 @@ set -e # Exit on any error MODE="reinstall_all" MODEL_ARG="$1" DEBUG_MODE=false +DUMP_MODE=false # Parse arguments for mode and debug flag for arg in "$@"; do @@ -31,6 +37,10 @@ for arg in "$@"; do DEBUG_MODE=true shift ;; + --dump) + DUMP_MODE=true + shift + ;; reinstall_all|reinstall_aot|reinstall_runtime|inference|export_aoti_only) # If it's the second argument and a valid mode, use it as mode if [[ "$arg" == "$2" ]]; then @@ -87,6 +97,7 @@ cleanup_temp_files() { rm -f *kernel.cpp rm -f *wrapper_metadata.json rm -f *wrapper.cpp + rm -f aoti_intermediate_output.txt echo "Cleanup completed." } @@ -121,12 +132,25 @@ build_runtime() { rm -rf cmake-out mkdir -p cmake-out cd cmake-out - cmake -DEXECUTORCH_BUILD_AOTI=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ - -DEXECUTORCH_LOG_LEVEL=Debug \ - -DCMAKE_BUILD_TYPE=Debug \ - .. + + if [[ "$DEBUG_MODE" == true ]]; then + echo "Building with debug configuration..." + cmake -DEXECUTORCH_BUILD_AOTI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_LOG_LEVEL=Debug \ + -DCMAKE_BUILD_TYPE=Debug \ + .. + else + echo "Building with release configuration..." + cmake -DEXECUTORCH_BUILD_AOTI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_LOG_LEVEL=Info \ + -DCMAKE_BUILD_TYPE=Release \ + .. + fi + cd .. cmake --build cmake-out -j9 } @@ -136,18 +160,32 @@ run_inference() { ./cmake-out/executor_runner --model_path aoti_model.pte } -# Set up environment variables based on debug flag +# Set up environment variables based on debug and dump flags if [[ "$DEBUG_MODE" == true ]]; then echo "Setting debug environment variables..." export AOT_INDUCTOR_DEBUG_COMPILE="1" export AOTINDUCTOR_REPRO_LEVEL=3 - export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + + # Set intermediate value printer based on dump flag + if [[ "$DUMP_MODE" == true ]]; then + export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + echo "AOTI intermediate output dumping enabled (AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2)" + else + export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="3" + fi + echo "Debug variables set:" echo " AOT_INDUCTOR_DEBUG_COMPILE=$AOT_INDUCTOR_DEBUG_COMPILE" echo " AOTINDUCTOR_REPRO_LEVEL=$AOTINDUCTOR_REPRO_LEVEL" echo " AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=$AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER" +elif [[ "$DUMP_MODE" == true ]]; then + # Only dump mode enabled (without debug) + echo "Setting AOTI intermediate output dumping..." 
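+    # Dump-only path: enable the intermediate-value printer without the
+    # compile-debug flags above. Example invocation (from the usage notes):
+    #   ./export_and_run_aoti.sh conv2d --mode=inference --dump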
+ export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + echo "AOTI intermediate output dumping enabled (AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2)" + echo " AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=$AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER" else - # Ensure debug variables are unset for non-debug modes + # Ensure debug variables are unset for non-debug/non-dump modes unset AOT_INDUCTOR_DEBUG_COMPILE unset AOTINDUCTOR_REPRO_LEVEL unset AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER From eca93d1f9f884aadab74f6c6c04a0e2b55cdc4e0 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 28 Aug 2025 00:52:26 -0700 Subject: [PATCH 29/50] resnet18 works --- .gitignore | 2 +- backends/aoti/runtime/shims/memory.cpp | 179 ++++++++++++------ .../aoti/runtime/shims/tensor_attribute.cpp | 16 ++ compare_outputs.py | 154 +++++++++++++++ .../executor_runner/executor_runner.cpp | 145 ++++---------- export_and_run_aoti.sh | 17 ++ export_aoti.py | 82 +++++++- 7 files changed, 427 insertions(+), 168 deletions(-) create mode 100755 compare_outputs.py diff --git a/.gitignore b/.gitignore index 92b68cbc2d7..2e9b9c948a2 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,7 @@ tokenizer.json *kernel.cpp *wrapper_metadata.json *wrapper.cpp -aoti_intermediate_output.txt +aoti_debug_data* # Editor temporaries *.idea diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 09a773dd43e..25e750edb3c 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -26,6 +26,79 @@ namespace aoti { using executorch::runtime::Error; using executorch::runtime::etensor::Tensor; +namespace { // Internal namespace for utility functions + +// Version 1: For use with int64_t sizes (e.g., from blob creation functions) +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +// Contiguous format means strides decrease from left to right: +// For NCHW: strides = [C*H*W, H*W, W, 1] +bool is_tensor_contiguous( + int64_t ndim, + const int64_t* sizes, + const int64_t* strides) { + int64_t expected_stride = 1; + for (int i = ndim - 1; i >= 0; i--) { + if (strides[i] != expected_stride) { + return false; + } + expected_stride *= sizes[i]; + } + return true; +} + +// Check if tensor is in channels-last format (NHWC for 4D tensors) +// Channels-last format for 4D: strides = [H*W*C, 1, W*C, C] +bool is_tensor_channels_last( + int64_t ndim, + const int64_t* sizes, + const int64_t* strides) { + if (ndim != 4) { + return false; // Channels-last only defined for 4D tensors + } + + int64_t N = sizes[0], C = sizes[1], H = sizes[2], W = sizes[3]; + + // Check NHWC format: strides = [H*W*C, 1, W*C, C] + // Handle edge cases where dimensions might be 1 + return (strides[0] == H * W * C || N <= 1) && (strides[1] == 1 || C <= 1) && + (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); +} + +// Version 2: For use with ExecutorTorch tensors (int32_t sizes) +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +bool is_tensor_contiguous( + int64_t ndim, + const int32_t* sizes, + const int64_t* strides) { + int64_t expected_stride = 1; + for (int i = ndim - 1; i >= 0; i--) { + if (strides[i] != expected_stride) { + return false; + } + expected_stride *= sizes[i]; + } + return true; +} + +// Check if tensor is in channels-last format (NHWC for 4D tensors) +bool is_tensor_channels_last( + int64_t ndim, + const int32_t* sizes, + const int64_t* strides) { + if (ndim != 4) { + return false; // Channels-last only defined for 
4D tensors
+  }
+
+  int64_t N = sizes[0], C = sizes[1], H = sizes[2], W = sizes[3];
+
+  // Check NHWC format: strides = [H*W*C, 1, W*C, C]
+  // Handle edge cases where dimensions might be 1
+  return (strides[0] == H * W * C || N <= 1) && (strides[1] == 1 || C <= 1) &&
+      (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1);
+}
+
+} // anonymous namespace
+
 // Global storage for tensors and their metadata
 std::unordered_set<std::shared_ptr<Tensor>> tensors;
 std::unordered_map<Tensor*, bool> is_tensor_own_memory;
@@ -47,7 +120,21 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
     int64_t opaque_metadata_size) {
   std::cout << "Creating tensor from data blob " << data << " - ndim: " << ndim
             << ", dtype: " << dtype << ", device_type: " << device_type
-            << std::endl;
+            << ", storage_offset: " << storage_offset << std::endl;
+
+  // Only float32 tensors are supported
+  if (dtype != 6) { // 6 = float32
+    std::cout << "ERROR: Only float32 tensors are supported. Got dtype: "
+              << dtype << " (expected: 6 for float32)" << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Storage offset must always be 0
+  if (storage_offset != 0) {
+    std::cout << "ERROR: Storage offset must be 0. Got storage_offset: "
+              << storage_offset << std::endl;
+    return Error::InvalidArgument;
+  }
 
   // Convert sizes to the format expected by ExecuTorch
   std::vector<int32_t> sizes(ndim);
@@ -58,31 +145,15 @@
 
   // check the tensor format
   // Only support contiguous format for now
-  int64_t expected_stride = 1;
-  for (int i = ndim - 1; i >= 0; --i) {
-    if (strides_ptr[i] != expected_stride) {
-      std::cout
-          << "aoti_torch_create_tensor_from_blob_v2 failed since input stride is not in contiguous format. Return with Error"
-          << std::endl;
-      return Error::InvalidArgument;
-    }
-    expected_stride *= sizes_ptr[i];
+  if (!is_tensor_contiguous(ndim, sizes_ptr, strides_ptr)) {
+    std::cout
+        << "aoti_torch_create_tensor_from_blob_v2 failed since input stride is not in contiguous format. 
Return with Error" + << std::endl; + return Error::InvalidArgument; } - // Adjust data pointer by storage_offset if needed + // Since storage_offset is guaranteed to be 0, use data pointer directly void* adjusted_data = data; - if (storage_offset > 0) { - // Calculate byte offset based on dtype size - size_t dtype_size = - 4; // Assuming float32 for now, you may need to handle other dtypes - if (dtype == 6) { // float32 - dtype_size = 4; - } else { - std::cout << "Error: Unhandled dtype " << dtype << std::endl; - return Error::NotImplemented; - } - adjusted_data = static_cast(data) + (storage_offset * dtype_size); - } // Create ExecutorTorch tensor that wraps the existing memory // Note: We're NOT copying the data, just wrapping it @@ -362,42 +433,21 @@ AOTITorchError aoti_torch_copy_( << std::endl; // Check if contiguous (strides decrease from left to right) - int64_t expected_stride = 1; - for (int i = self->dim() - 1; i >= 0; i--) { - if (self_strides[i] != expected_stride) { - self_is_contiguous = false; - } - expected_stride *= self_sizes[i]; - } + self_is_contiguous = + is_tensor_contiguous(self->dim(), self_sizes.data(), self_strides); - expected_stride = 1; - for (int i = src->dim() - 1; i >= 0; i--) { - if (src_strides[i] != expected_stride) { - src_is_contiguous = false; - } - expected_stride *= src_sizes[i]; - } + src_is_contiguous = + is_tensor_contiguous(src->dim(), src_sizes.data(), src_strides); - // Check if channels-last (4D: NHWC, strides in order [H*W*C, 1, W*C, C]) - if (self->dim() == 4 && !self_is_contiguous) { - int64_t N = self_sizes[0], H = self_sizes[1], W = self_sizes[2], - C = self_sizes[3]; - if ((self_strides[0] == H * W * C || N <= 1) && - (self_strides[1] == W * C || H <= 1) && - (self_strides[2] == C || W == 1) && - (self_strides[3] == 1 || C == 1)) { - self_is_channels_last = true; - } + // Check if channels-last (4D: NHWC format) + if (!self_is_contiguous) { + self_is_channels_last = + is_tensor_channels_last(self->dim(), self_sizes.data(), self_strides); } - if (src->dim() == 4 && !src_is_contiguous) { - int64_t N = src_sizes[0], H = src_sizes[1], W = src_sizes[2], - C = src_sizes[3]; - if ((src_strides[0] == H * W * C || N <= 1) && - (src_strides[1] == W * C || H <= 1) && - (src_strides[2] == C || W <= 1) && (src_strides[3] == 1 || C <= 1)) { - src_is_channels_last = true; - } + if (!src_is_contiguous) { + src_is_channels_last = + is_tensor_channels_last(src->dim(), src_sizes.data(), src_strides); } // Validate layout assumptions only when schemas differ @@ -409,17 +459,27 @@ AOTITorchError aoti_torch_copy_( std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : ""); } std::cout << "]" << std::endl; + std::cout << "self_sizes: ["; + for (int i = 0; i < self->dim(); i++) { + std::cout << self_sizes[i] << (i < self->dim() - 1 ? ", " : ""); + } + std::cout << "]" << std::endl; return Error::InvalidArgument; } if (!src_is_contiguous && !src_is_channels_last) { std::cout - << "Error: src tensor must be contiguous or channels-last for stride conversion. " + << "Error: src tensor must be contiguous or channels-last for stride conversion. \n" << "Got strides: ["; for (int i = 0; i < src->dim(); i++) { std::cout << src_strides[i] << (i < src->dim() - 1 ? ", " : ""); } std::cout << "]" << std::endl; + std::cout << "src_sizes: ["; + for (int i = 0; i < self->dim(); i++) { + std::cout << src_sizes[i] << (i < self->dim() - 1 ? 
", " : ""); + } + std::cout << "]" << std::endl; return Error::InvalidArgument; } } @@ -667,6 +727,13 @@ AOTITorchError aoti_torch__reinterpret_tensor( return dtype_err; } + if (dtype != 6) { // 6 = float32 + std::cout + << "ERROR: Only float32 tensors are supported in reinterpret_tensor. Got dtype: " + << dtype << " (expected: 6 for float32)" << std::endl; + return Error::InvalidArgument; + } + int32_t device_type; AOTITorchError device_type_err = aoti_torch_get_device_type(self, &device_type); diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index dcea848597e..1ffcdba381d 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -45,6 +45,14 @@ AOTITorchError aoti_torch_get_storage_offset( int64_t* ret_storage_offset) { // Storage offset is always 0 in ET *ret_storage_offset = 0; + + // ASSERTION: Storage offset must always be 0 + if (*ret_storage_offset != 0) { + std::cout << "ERROR: Storage offset must be 0. Got storage_offset: " + << *ret_storage_offset << std::endl; + return Error::InvalidArgument; + } + return Error::Ok; } @@ -73,6 +81,14 @@ AOTITorchError aoti_torch_get_dtype( AOTITensorHandle tensor, int32_t* ret_dtype) { *ret_dtype = static_cast(tensor->scalar_type()); + + // ASSERTION: Only float32 tensors are supported + if (*ret_dtype != 6) { // 6 = float32 + std::cout << "ERROR: Only float32 tensors are supported. Got dtype: " + << *ret_dtype << " (expected: 6 for float32)" << std::endl; + return Error::InvalidArgument; + } + return Error::Ok; } diff --git a/compare_outputs.py b/compare_outputs.py new file mode 100755 index 00000000000..e83b701f73a --- /dev/null +++ b/compare_outputs.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +Comparison script to calculate max absolute tolerance (atol) and max relative tolerance (rtol) +between runtime outputs and label outputs. 
+"""
+
+import os
+import sys
+
+import numpy as np
+
+
+def read_csv_file(filepath):
+    """Read a comma-separated values file and return as numpy array."""
+    try:
+        with open(filepath, "r") as f:
+            content = f.read().strip()
+            if not content:
+                print(f"Warning: {filepath} is empty")
+                return np.array([])
+
+            # Split by comma and convert to float
+            values = [float(x.strip()) for x in content.split(",") if x.strip()]
+            return np.array(values)
+    except FileNotFoundError:
+        print(f"Error: {filepath} not found")
+        return None
+    except ValueError as e:
+        print(f"Error parsing {filepath}: {e}")
+        return None
+
+
+def calculate_tolerances(runtime_outputs, label_outputs):
+    """Calculate max absolute and relative tolerances."""
+    if runtime_outputs is None or label_outputs is None:
+        return None, None
+
+    if len(runtime_outputs) == 0 or len(label_outputs) == 0:
+        print("Warning: One of the output arrays is empty")
+        return None, None
+
+    if len(runtime_outputs) != len(label_outputs):
+        print(
+            f"Warning: Array lengths don't match: runtime={len(runtime_outputs)}, label={len(label_outputs)}"
+        )
+        # Truncate both arrays to the shorter length before comparing
+        min_len = min(len(runtime_outputs), len(label_outputs))
+        runtime_outputs = runtime_outputs[:min_len]
+        label_outputs = label_outputs[:min_len]
+
+    # Calculate absolute differences
+    abs_diff = np.abs(runtime_outputs - label_outputs)
+    max_atol = np.max(abs_diff)
+
+    # Calculate relative differences (avoid division by zero)
+    # rel_diff = |a - b| / max(|a|, |b|, eps) where eps is a small number
+    eps = 1e-8
+    denominator = np.maximum(
+        np.maximum(np.abs(runtime_outputs), np.abs(label_outputs)), eps
+    )
+    rel_diff = abs_diff / denominator
+    max_rtol = np.max(rel_diff)
+
+    return max_atol, max_rtol
+
+
+def main():
+    """Main function to compare outputs and print tolerances."""
+    # File paths
+    runtime_file = "aoti_debug_data/final_runtime_output.txt"
+    label_file = "aoti_debug_data/label_output.txt"
+
+    print("=" * 60)
+    print("AOTI Runtime vs Label Output Comparison")
+    print("=" * 60)
+
+    # Check if files exist
+    if not os.path.exists(runtime_file):
+        print(f"Error: {runtime_file} not found")
+        sys.exit(1)
+
+    if not os.path.exists(label_file):
+        print(f"Error: {label_file} not found")
+        sys.exit(1)
+
+    # Read the files
+    print(f"Reading runtime outputs from: {runtime_file}")
+    runtime_outputs = read_csv_file(runtime_file)
+
+    print(f"Reading label outputs from: {label_file}")
+    label_outputs = read_csv_file(label_file)
+
+    if runtime_outputs is None or label_outputs is None:
+        print("Failed to read one or both files")
+        sys.exit(1)
+
+    print(f"Runtime outputs shape: {runtime_outputs.shape}")
+    print(f"Label outputs shape: {label_outputs.shape}")
+
+    if runtime_outputs.shape != label_outputs.shape:
+        print("Error: Output shapes don't match")
+        sys.exit(1)
+
+    # Calculate tolerances
+    max_atol, max_rtol = calculate_tolerances(runtime_outputs, label_outputs)
+
+    if max_atol is None or max_rtol is None:
+        print("Failed to calculate tolerances")
+        sys.exit(1)
+
+    # Print results
+    print("-" * 60)
+    print("COMPARISON RESULTS:")
+    print(f"Max Absolute Tolerance (atol): {max_atol:.10f}")
+    print(f"Max Relative Tolerance (rtol): {max_rtol:.10f}")
+    print("-" * 60)
+
+    # Print some statistics
+    print("ADDITIONAL STATISTICS:")
+    print(f"Total elements compared: {len(runtime_outputs)}")
+    print(
+        f"Runtime output range: [{np.min(runtime_outputs):.6f}, {np.max(runtime_outputs):.6f}]"
+    )
+    print(
+        f"Label output range: [{np.min(label_outputs):.6f}, 
{np.max(label_outputs):.6f}]" + ) + + # Calculate mean absolute difference + abs_diff = np.abs(runtime_outputs - label_outputs) + mean_atol = np.mean(abs_diff) + print(f"Mean Absolute Tolerance: {mean_atol:.10f}") + + # Check if outputs are close within common tolerances + is_close_1e5 = np.allclose( + runtime_outputs, + label_outputs, + atol=1e-5, + rtol=1e-5, + ) + is_close_1e6 = np.allclose( + runtime_outputs, + label_outputs, + atol=1e-6, + rtol=1e-6, + ) + + print(f"Close within atol=1e-5, rtol=1e-5: {is_close_1e5}") + print(f"Close within atol=1e-6, rtol=1e-6: {is_close_1e6}") + + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 5ce872eec8e..4a4b659c748 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -1,7 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. * Copyright 2024-2025 Arm Limited and/or its affiliates. + * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. @@ -50,16 +50,6 @@ DEFINE_string( model_path, "model.pte", "Model serialized in flatbuffer format."); -DEFINE_string(inputs, "", "Comma-separated list of input files"); -DEFINE_string( - output_file, - "", - "Base name of output file. If not empty output will be written to the file(s)."); - -DEFINE_bool( - print_all_output, - false, - "Prints all output. By default only first and last 100 elements are printed."); DEFINE_uint32(num_executions, 1, "Number of times to run the model."); #ifdef ET_EVENT_TRACER_ENABLED DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path."); @@ -69,8 +59,6 @@ DEFINE_int32( -1, "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); -using executorch::aten::ScalarType; -using executorch::aten::Tensor; using executorch::extension::FileDataLoader; using executorch::runtime::Error; using executorch::runtime::EValue; @@ -83,8 +71,6 @@ using executorch::runtime::MethodMeta; using executorch::runtime::Program; using executorch::runtime::Result; using executorch::runtime::Span; -using executorch::runtime::Tag; -using executorch::runtime::TensorInfo; /// Helper to manage resources for ETDump generation class EventTraceManager { @@ -171,43 +157,6 @@ int main(int argc, char** argv) { "FileDataLoader::from() failed: 0x%" PRIx32, (uint32_t)loader.error()); - std::vector inputs_storage; - std::vector> input_buffers; - - std::stringstream list_of_input_files(FLAGS_inputs); - std::string path; - - // First reserve memory for number of vector elements to avoid vector - // reallocations when emplacing back. - std::vector file_paths; - while (std::getline(list_of_input_files, path, ',')) { - file_paths.push_back(std::move(path)); - } - inputs_storage.reserve(file_paths.size()); - - for (const auto& file_path : file_paths) { - std::ifstream input_file_handle( - file_path, std::ios::binary | std::ios::ate); - - if (!input_file_handle) { - ET_LOG(Error, "Failed to open input file: %s\n", file_path.c_str()); - return 1; - } - - std::streamsize file_size = input_file_handle.tellg(); - input_file_handle.seekg(0, std::ios::beg); - - // Reserve memory for actual file contents. 
- inputs_storage.emplace_back(file_size, '\0'); - - if (!input_file_handle.read(&inputs_storage.back()[0], file_size)) { - ET_LOG(Error, "Failed to read input file: %s\n", file_path.c_str()); - return 1; - } - - input_buffers.emplace_back(&inputs_storage.back()[0], file_size); - } - // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. Result program = Program::load(&loader.get()); @@ -306,8 +255,7 @@ int main(int argc, char** argv) { // Run the model. for (uint32_t i = 0; i < FLAGS_num_executions; i++) { ET_LOG(Debug, "Preparing inputs."); - // Allocate input tensors and set all of their elements to 1 or to the - // contents of input_buffers if available. The `inputs` + // Allocate input tensors and set all of their elements to 1. The `inputs` // variable owns the allocated memory and must live past the last call to // `execute()`. // @@ -315,8 +263,7 @@ int main(int argc, char** argv) { // because inputs whose space gets reused by memory planning (if // any such inputs exist) will not be preserved for the next // execution. - auto inputs = executorch::extension::prepare_input_tensors( - *method, {}, input_buffers); + auto inputs = executorch::extension::prepare_input_tensors(*method); ET_CHECK_MSG( inputs.ok(), "Could not prepare inputs: 0x%" PRIx32, @@ -348,69 +295,47 @@ int main(int argc, char** argv) { std::vector outputs(method->outputs_size()); ET_LOG(Info, "%zu outputs: ", outputs.size()); Error status = method->get_outputs(outputs.data(), outputs.size()); + ET_CHECK(status == Error::Ok); - if (FLAGS_output_file.size() > 0) { - for (int i = 0; i < outputs.size(); ++i) { - if (outputs[i].isTensor()) { - Tensor tensor = outputs[i].toTensor(); - - char out_filename[255]; - snprintf(out_filename, 255, "%s-%d.bin", FLAGS_output_file.c_str(), i); - ET_LOG(Info, "Writing output to file: %s", out_filename); - FILE* out_file = fopen(out_filename, "wb"); - fwrite(tensor.const_data_ptr(), 1, tensor.nbytes(), out_file); - fclose(out_file); - } - } + // Open file to dump outputs + std::ofstream output_file("aoti_debug_data/final_runtime_output.txt"); + if (!output_file.is_open()) { + ET_LOG(Error, "Failed to open output file for dumping"); } - if (FLAGS_print_all_output) { - for (int i = 0; i < outputs.size(); ++i) { - if (outputs[i].isTensor()) { - Tensor tensor = outputs[i].toTensor(); - - for (int j = 0; j < tensor.numel(); ++j) { - if (tensor.scalar_type() == ScalarType::Int) { - printf( - "Output[%d][%d]: (int) %d\n", - i, - j, - tensor.const_data_ptr()[j]); - } else if (tensor.scalar_type() == ScalarType::Float) { - printf( - "Output[%d][%d]: (float) %f\n", - i, - j, - tensor.const_data_ptr()[j]); - } else if (tensor.scalar_type() == ScalarType::Char) { - printf( - "Output[%d][%d]: (char) %d\n", - i, - j, - tensor.const_data_ptr()[j]); - } else if (tensor.scalar_type() == ScalarType::Bool) { - printf( - "Output[%d][%d]: (bool) %s (0x%x)\n", - i, - j, - tensor.const_data_ptr()[j] ? "true " : "false", - tensor.const_data_ptr()[j]); - } - } - } else { - printf("Output[%d]: Not Tensor\n", i); + // Print the first and last 100 elements of long lists of scalars. 
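+    // (evalue_edge_items is ExecuTorch's ostream manipulator for EValue
+    // printing; streaming it first bounds each long tensor to its leading
+    // and trailing 100 elements in the std::cout dump below.)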
+    std::cout << executorch::extension::evalue_edge_items(100);
+    for (int i = 0; i < outputs.size(); ++i) {
+      std::cout << "Output " << i << ": " << outputs[i] << std::endl;
+
+      // Also dump to file - extract tensor data and write comma-separated values
+      if (output_file.is_open() && outputs[i].isTensor()) {
+        auto tensor = outputs[i].toTensor();
+        const void* data_ptr = tensor.const_data_ptr();
+
+        // Assumes the output tensor is float32; other dtypes are not handled
+        const float* float_data = static_cast<const float*>(data_ptr);
+        size_t num_elements = tensor.numel();
+
+        for (size_t j = 0; j < num_elements; ++j) {
+          if (j > 0)
+            output_file << ",";
+          output_file << float_data[j];
         }
-      }
-    } else {
-      // Print the first and last 100 elements of long lists of scalars.
-      std::cout << executorch::extension::evalue_edge_items(100);
-      for (int i = 0; i < outputs.size(); ++i) {
-        std::cout << "OutputX " << i << ": " << outputs[i] << std::endl;
+        if (i < outputs.size() - 1)
+          output_file << ",";
       }
     }
+
+    if (output_file.is_open()) {
+      output_file.close();
+      ET_LOG(
+          Info,
+          "Runtime outputs dumped to aoti_debug_data/final_runtime_output.txt");
+    }
+
   if (tracer.get_event_tracer()) {
     // Dump ETDump data containing profiling/debugging data to file specified in
     // command line flag.
diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh
index ebb1a44239e..7a60cb66be5 100644
--- a/export_and_run_aoti.sh
+++ b/export_and_run_aoti.sh
@@ -160,6 +160,11 @@ run_inference() {
     ./cmake-out/executor_runner --model_path aoti_model.pte
 }
 
+compare_outputs() {
+    echo "Comparing runtime outputs with label outputs..."
+    python compare_outputs.py
+}
+
 # Set up environment variables based on debug and dump flags
 if [[ "$DEBUG_MODE" == true ]]; then
     echo "Setting debug environment variables..."
@@ -169,7 +174,10 @@ if [[ "$DEBUG_MODE" == true ]]; then
     # Set intermediate value printer based on dump flag
     if [[ "$DUMP_MODE" == true ]]; then
         export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2"
+        export INDUCTOR_PROVENANCE=1
+        export TORCH_TRACE="/home/gasoonjia/executorch/aoti_debug_data"
         echo "AOTI intermediate output dumping enabled (AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2)"
+        echo "Eager-AOTI relationship extraction enabled (INDUCTOR_PROVENANCE=1), output to $TORCH_TRACE"
     else
         export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="3"
     fi
@@ -182,13 +190,18 @@ elif [[ "$DUMP_MODE" == true ]]; then
     # Only dump mode enabled (without debug)
     echo "Setting AOTI intermediate output dumping..."
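+    # TORCH_TRACE points PyTorch's structured compile logging at a local
+    # directory (readable with tlparse), and INDUCTOR_PROVENANCE=1 records the
+    # eager-op-to-generated-kernel mapping alongside it. The hard-coded path
+    # below is specific to this development machine.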
export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + export INDUCTOR_PROVENANCE=1 + export TORCH_TRACE="/home/gasoonjia/executorch/aoti_debug_data" echo "AOTI intermediate output dumping enabled (AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2)" echo " AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=$AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER" + echo "Eager-AOTI relationship extration enabled (INDUCTOR_PROVENANCE=1), output to $TORCH_TRACE" else # Ensure debug variables are unset for non-debug/non-dump modes unset AOT_INDUCTOR_DEBUG_COMPILE unset AOTINDUCTOR_REPRO_LEVEL unset AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER + unset INDUCTOR_PROVENANCE + unset TORCH_TRACE fi # Execute based on mode @@ -203,6 +216,7 @@ case "$MODE" in clean_install_executorch build_runtime run_inference + compare_outputs ;; "reinstall_aot") echo "Mode: reinstall_aot - Reinstall AOT components and run e2e" @@ -212,6 +226,7 @@ case "$MODE" in install_executorch export_aoti_model run_inference + compare_outputs ;; "reinstall_runtime") echo "Mode: reinstall_runtime - Rebuild runtime and run e2e" @@ -221,6 +236,7 @@ case "$MODE" in export_aoti_model build_runtime run_inference + compare_outputs ;; "inference") echo "Mode: inference - Export model and run inference only" @@ -229,6 +245,7 @@ case "$MODE" in fi export_aoti_model run_inference + compare_outputs ;; "export_aoti_only") echo "Mode: export_aoti_only - Export model using pure AOTI only (no runtime or installation)" diff --git a/export_aoti.py b/export_aoti.py index 2550f33a55a..9f9d4ce8e6c 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -34,6 +34,12 @@ from torchvision.models.resnet import ResNet18_Weights +# for maintaing precision of 32-bit float as much as possible +torch.backends.cuda.matmul.allow_tf32 = False +torch.backends.cudnn.allow_tf32 = False +torch.backends.cudnn.conv.fp32_precision = "fp32" + + # Model classes class MV2(torch.nn.Module): def __init__(self): @@ -109,6 +115,56 @@ def forward(self, x): return self.bn(x) +class SingleResNetBlock(nn.Module): + def __init__(self, in_channels=64, out_channels=64, stride=1): + super().__init__() + self.conv1 = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) + self.bn1 = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False + ) + self.bn2 = nn.BatchNorm2d(out_channels) + + # Skip connection - identity mapping if same channels, 1x1 conv if different + self.skip_connection = None + if stride != 1 or in_channels != out_channels: + self.skip_connection = nn.Sequential( + nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + identity = x + + # First conv block + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + # Second conv block + out = self.conv2(out) + out = self.bn2(out) + + # Skip connection + if self.skip_connection is not None: + identity = self.skip_connection(x) + + out += identity + out = self.relu(out) + + return out + + # Model registry mapping model names to their configurations MODEL_REGISTRY: Dict[str, Dict[str, Any]] = { "mv2": { @@ -153,6 +209,12 @@ def forward(self, x): "device": "cuda", "description": "Single BatchNorm2d layer model", }, + "single_resnet_block": { + "model_class": SingleResNetBlock, + "input_shapes": [(1, 64, 8, 8)], + "device": "cuda", + "description": "Single ResNet block with 
skip connection", + }, } @@ -187,7 +249,25 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p torch.ones_like(example_input) for example_input in example_inputs ) - print("label", model(*all_one_input)) + label_output = model(*all_one_input) + print("label", label_output) + + # Create directory if it doesn't exist + os.makedirs("aoti_debug_data", exist_ok=True) + + # Dump label to file + with open("aoti_debug_data/label_output.txt", "w") as f: + if isinstance(label_output, tuple): + # Multiple outputs + all_elements = [] + for tensor in label_output: + if tensor.numel() > 0: + all_elements.extend(tensor.flatten().tolist()) + f.write(",".join(map(str, all_elements))) + else: + # Single output + if label_output.numel() > 0: + f.write(",".join(map(str, label_output.flatten().tolist()))) print(f"Starting export process...") From 7962fb348fa092e8355aac2edd898d97a38f24e6 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 2 Sep 2025 22:35:01 -0700 Subject: [PATCH 30/50] centralize def type --- backends/aoti/runtime/shims/memory.cpp | 3 - backends/aoti/runtime/shims/memory.h | 67 +++++++------------ .../aoti/runtime/shims/tensor_attribute.cpp | 3 - .../aoti/runtime/shims/tensor_attribute.h | 11 +-- backends/aoti/runtime/shims/types.h | 45 +++++++++++++ backends/aoti/runtime/shims/utils.cpp | 2 +- backends/aoti/runtime/shims/utils.h | 14 +--- backends/aoti/runtime/targets.bzl | 1 + 8 files changed, 76 insertions(+), 70 deletions(-) create mode 100644 backends/aoti/runtime/shims/types.h diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 25e750edb3c..cbf52932268 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -23,9 +23,6 @@ namespace executorch { namespace backends { namespace aoti { -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - namespace { // Internal namespace for utility functions // Version 1: For use with int64_t sizes (e.g., from blob creation functions) diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 0b8af138c90..57058397972 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -8,66 +8,49 @@ #pragma once -#include -#include -#include #include #include #include #include #include #include +#include "types.h" namespace executorch { namespace backends { namespace aoti { -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - extern "C" { -// Type definitions -using AOTITensorHandle = Tensor*; -using AOTIRuntimeError = Error; -using AOTITorchError = Error; - -struct CUDAStreamGuardOpaque { - cudaStream_t original_stream; - int device_index; - cudaEvent_t sync_event; -}; -using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*; - // Global storage declarations extern std::unordered_map is_tensor_own_memory; extern std::unordered_set> tensors; // Memory-related operations -AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor, - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size); - -AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, 
- int32_t device_index, - AOTITensorHandle* ret_new_tensor); +// AOTITorchError aoti_torch_create_tensor_from_blob_v2( +// void* data, +// int64_t ndim, +// const int64_t* sizes_ptr, +// const int64_t* strides_ptr, +// int64_t storage_offset, +// int32_t dtype, +// int32_t device_type, +// int32_t device_index, +// AOTITensorHandle* ret_new_tensor, +// int32_t layout, +// const uint8_t* opaque_metadata, +// int64_t opaque_metadata_size); + +// AOTITorchError aoti_torch_create_tensor_from_blob( +// void* data, +// int64_t ndim, +// const int64_t* sizes_ptr, +// const int64_t* strides_ptr, +// int64_t storage_offset, +// int32_t dtype, +// int32_t device_type, +// int32_t device_index, +// AOTITensorHandle* ret_new_tensor); AOTITorchError aoti_torch_empty_strided( int64_t ndim, diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index 1ffcdba381d..8e0097cd8bd 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -13,9 +13,6 @@ namespace executorch { namespace backends { namespace aoti { -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - // Global storage for tensor metadata std::unordered_map> tensor_to_sizes; std::unordered_map> tensor_to_strides; diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h index ab4f8037ebf..387056a30fd 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/runtime/shims/tensor_attribute.h @@ -8,8 +8,7 @@ #pragma once -#include -#include +#include "types.h" #include #include @@ -17,16 +16,8 @@ namespace executorch { namespace backends { namespace aoti { -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - extern "C" { -// Type definitions -using AOTITensorHandle = Tensor*; -using AOTIRuntimeError = Error; -using AOTITorchError = Error; - // Global storage for tensor metadata extern std::unordered_map> tensor_to_sizes; extern std::unordered_map> tensor_to_strides; diff --git a/backends/aoti/runtime/shims/types.h b/backends/aoti/runtime/shims/types.h new file mode 100644 index 00000000000..312d05a4d33 --- /dev/null +++ b/backends/aoti/runtime/shims/types.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// Common using declarations for ExecuTorch types
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+
+// Common AOTI type aliases
+// Note: AOTITensorHandle is aliased to Tensor* for ExecuTorch compatibility
+using AOTITensorHandle = Tensor*;
+using AOTIRuntimeError = Error;
+using AOTITorchError = Error;
+
+// CUDA-specific types
+struct CUDAStreamGuardOpaque {
+  cudaStream_t original_stream;
+  int device_index;
+  cudaEvent_t sync_event;
+};
+using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*;
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp
index 10882c16cf4..e81e141e7fd 100644
--- a/backends/aoti/runtime/shims/utils.cpp
+++ b/backends/aoti/runtime/shims/utils.cpp
@@ -25,7 +25,7 @@ const char* const TENSOR_OUTPUT_FILENAME =
 
 extern "C" {
 
-void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) {
+void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg) {
   printf("Printing tensor handle: %p\n", self);
 
   if (!self) {
diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h
index 6bcd34efcfb..c0c2a59be0a 100644
--- a/backends/aoti/runtime/shims/utils.h
+++ b/backends/aoti/runtime/shims/utils.h
@@ -8,26 +8,18 @@
 
 #pragma once
 
-#include 
-#include 
-#include 
+#include 
 #include 
+#include "types.h"
 
 namespace executorch {
 namespace backends {
 namespace aoti {
 
-using executorch::runtime::Error;
-using executorch::runtime::etensor::Tensor;
-
 extern "C" {
 
-// Type definitions
-using AOTITensorHandle = Tensor*;
-using AOTITorchError = Error;
-
 // Utility function for printing tensor information
-void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg);
+void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg);
 
 // Cleanup function for tensor output file (called during backend destruction)
 void cleanup_aoti_tensor_output();
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl
index 2c87ad68a2c..d57a187366f 100644
--- a/backends/aoti/runtime/targets.bzl
+++ b/backends/aoti/runtime/targets.bzl
@@ -14,6 +14,7 @@ def define_common_targets():
             "aoti_model_container.h",
             "shims/memory.h",
             "shims/tensor_attribute.h",
+            "shims/types.h",
             "shims/utils.h",
         ],
         # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)

From 23de936d876230c992d3f4d08fb804bf965fbfcc Mon Sep 17 00:00:00 2001
From: gasoonjia 
Date: Wed, 3 Sep 2025 12:12:39 -0700
Subject: [PATCH 31/50] enabling llama31

---
 export_aoti.py       | 31 +++++++++++++++++++++++++++++++
 requirements-dev.txt |  1 +
 2 files changed, 32 insertions(+)

diff --git a/export_aoti.py b/export_aoti.py
index 9f9d4ce8e6c..66824bbc3b6 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -32,6 +32,7 @@
 from torchvision import models
 from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
 from torchvision.models.resnet import ResNet18_Weights
+from transformers import AutoModelForCausalLM
 
 
 # for maintaing precision of 32-bit float as much as possible
@@ -165,6 +166,30 @@ def forward(self, x):
         return out
 
 
+class Llama31(torch.nn.Module):
+    def __init__(self, model_id="meta-llama/Meta-Llama-3.1-8B"):
+        super(Llama31, self).__init__()
+        # Load Llama 3.1 model from HF
+        self.model = 
AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.float32, + device_map="cuda", + # trust_remote_code=True, + use_cache=False, # Turn off KV cache + ) + self.model.eval() + + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): + # Disable KV cache for inference + with torch.no_grad(): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + use_cache=False, # Explicitly turn off KV cache + ) + return outputs.logits + + # Model registry mapping model names to their configurations MODEL_REGISTRY: Dict[str, Dict[str, Any]] = { "mv2": { @@ -215,6 +240,12 @@ def forward(self, x): "device": "cuda", "description": "Single ResNet block with skip connection", }, + "llama31": { + "model_class": Llama31, + "input_shapes": [(1, 128)], # batch_size=1, sequence_length=128 + "device": "cuda", + "description": "Llama 3.1 model with KV cache disabled", + }, } diff --git a/requirements-dev.txt b/requirements-dev.txt index 8c8f518a5ea..964bdecef76 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,3 +10,4 @@ certifi # Imported by resolve_buck.py. lintrunner==0.12.7 lintrunner-adapters==0.12.6 patchelf +transformers From b792c7d973465cc24081e937f97fa1a73110051f Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 4 Sep 2025 00:07:23 -0700 Subject: [PATCH 32/50] add llama31 for test --- ...jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json | 1 + config.yaml | 5 + export_aoti.py | 15 +- load_saved_config_example.py | 111 +++++++++++++ .../2025-09-03/14-45-08/.hydra/config.yaml | 73 ++++++++ outputs/2025-09-03/14-45-08/.hydra/hydra.yaml | 154 +++++++++++++++++ .../2025-09-03/14-45-08/.hydra/overrides.yaml | 3 + outputs/2025-09-03/14-45-08/export_llm.log | 40 +++++ .../2025-09-03/15-17-23/.hydra/config.yaml | 74 +++++++++ outputs/2025-09-03/15-17-23/.hydra/hydra.yaml | 157 ++++++++++++++++++ .../2025-09-03/15-17-23/.hydra/overrides.yaml | 6 + outputs/2025-09-03/15-17-23/export_llm.log | 38 +++++ .../2025-09-03/15-30-13/.hydra/config.yaml | 74 +++++++++ outputs/2025-09-03/15-30-13/.hydra/hydra.yaml | 157 ++++++++++++++++++ .../2025-09-03/15-30-13/.hydra/overrides.yaml | 6 + outputs/2025-09-03/15-30-13/export_llm.log | 0 .../2025-09-03/16-25-46/.hydra/config.yaml | 74 +++++++++ outputs/2025-09-03/16-25-46/.hydra/hydra.yaml | 157 ++++++++++++++++++ .../2025-09-03/16-25-46/.hydra/overrides.yaml | 6 + outputs/2025-09-03/16-25-46/export_llm.log | 0 .../2025-09-03/16-29-28/.hydra/config.yaml | 74 +++++++++ outputs/2025-09-03/16-29-28/.hydra/hydra.yaml | 157 ++++++++++++++++++ .../2025-09-03/16-29-28/.hydra/overrides.yaml | 6 + outputs/2025-09-03/16-29-28/export_llm.log | 0 .../2025-09-03/16-30-46/.hydra/config.yaml | 73 ++++++++ outputs/2025-09-03/16-30-46/.hydra/hydra.yaml | 156 +++++++++++++++++ .../2025-09-03/16-30-46/.hydra/overrides.yaml | 5 + outputs/2025-09-03/16-30-46/export_llm.log | 40 +++++ saved_llm_config.yaml | 73 ++++++++ 29 files changed, 1732 insertions(+), 3 deletions(-) create mode 100644 clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json create mode 100644 config.yaml create mode 100644 load_saved_config_example.py create mode 100644 outputs/2025-09-03/14-45-08/.hydra/config.yaml create mode 100644 outputs/2025-09-03/14-45-08/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/14-45-08/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/14-45-08/export_llm.log create mode 100644 outputs/2025-09-03/15-17-23/.hydra/config.yaml create mode 100644 outputs/2025-09-03/15-17-23/.hydra/hydra.yaml 
create mode 100644 outputs/2025-09-03/15-17-23/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/15-17-23/export_llm.log create mode 100644 outputs/2025-09-03/15-30-13/.hydra/config.yaml create mode 100644 outputs/2025-09-03/15-30-13/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/15-30-13/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/15-30-13/export_llm.log create mode 100644 outputs/2025-09-03/16-25-46/.hydra/config.yaml create mode 100644 outputs/2025-09-03/16-25-46/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/16-25-46/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/16-25-46/export_llm.log create mode 100644 outputs/2025-09-03/16-29-28/.hydra/config.yaml create mode 100644 outputs/2025-09-03/16-29-28/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/16-29-28/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/16-29-28/export_llm.log create mode 100644 outputs/2025-09-03/16-30-46/.hydra/config.yaml create mode 100644 outputs/2025-09-03/16-30-46/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/16-30-46/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/16-30-46/export_llm.log create mode 100644 saved_llm_config.yaml diff --git a/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json b/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json new file mode 100644 index 00000000000..8c1a9f6d812 --- /dev/null +++ b/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json @@ -0,0 +1 @@ +{"nodes": [{"name": "buf6", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf5"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf7"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf8", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf5"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf9"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf12", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf11"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf13"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf14", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf11"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf15"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf18", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, 
"kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf19"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf39", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf38"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf40"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf41", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf38"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf42"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf45", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf44"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf46"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf47", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf44"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf48"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf50", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf51"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf72", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf71"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf73"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf74", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf71"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf75"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf78", "node": {"target": "aten::slice_copy.Tensor", 
"inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf77"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf79"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf80", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf77"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf81"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf83", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf84"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf104", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf103"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf105"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf106", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf103"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf107"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf110", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf109"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf111"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf112", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf109"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf113"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf115", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": 
"buf116"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf137", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf136"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf138"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf139", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf136"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf140"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf143", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf142"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf144"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf145", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf142"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf146"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf148", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf149"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf169", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf168"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf170"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf171", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf168"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf172"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf175", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf174"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 
1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf176"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf177", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf174"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf178"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf180", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf181"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf202", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf201"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf203"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf204", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf201"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf205"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf208", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf207"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf209"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf210", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf207"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf211"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf213", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf214"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf234", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": 
{"as_tensor": {"name": "buf233"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf235"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf236", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf233"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf237"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf240", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf239"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf241"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf242", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf239"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf243"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf245", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf246"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf267", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf266"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf268"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf269", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf266"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf270"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf273", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf272"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": 
"buf274"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf275", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf272"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf276"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf278", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf279"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf299", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf298"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf300"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf301", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf298"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf302"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf305", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf304"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf306"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf307", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf304"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf308"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf310", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf311"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf332", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf331"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", 
"arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf333"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf334", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf331"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf335"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf338", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf337"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf339"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf340", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf337"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf341"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf343", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf344"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf364", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf363"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf365"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf366", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf363"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf367"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf370", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf369"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf371"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf372", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": 
{"as_tensor": {"name": "buf369"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf373"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf375", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf376"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf397", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf396"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf398"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf399", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf396"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf400"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf403", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf402"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf404"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf405", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf402"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf406"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf408", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf409"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf429", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf428"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf430"}}], "metadata": {}, 
"is_hop_single_tensor_return": null}}, {"name": "buf431", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf428"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf432"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf435", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf434"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf436"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf437", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf434"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf438"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf440", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf441"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf462", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf461"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf463"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf464", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf461"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf465"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf468", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf467"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf469"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf470", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf467"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": 
{"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf471"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf473", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf474"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf494", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf493"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf495"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf496", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf493"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf497"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf500", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf499"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf501"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf502", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf499"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf503"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf505", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf506"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf527", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf526"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf528"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf529", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf526"}}, 
"kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf530"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf533", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf532"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf534"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf535", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf532"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf536"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf538", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf539"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf559", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf558"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf560"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf561", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf558"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf562"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf565", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf564"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf566"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf567", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf564"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf568"}}], "metadata": {}, "is_hop_single_tensor_return": 
null}}, {"name": "buf570", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf571"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf592", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf591"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf593"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf594", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf591"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf595"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf598", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf597"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf599"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf600", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf597"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf601"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf603", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf604"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf624", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf623"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf625"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf626", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf623"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": 
"step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf627"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf630", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf629"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf631"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf632", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf629"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf633"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf635", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf636"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf657", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf656"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf658"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf659", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf656"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf660"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf663", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf662"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf664"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf665", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf662"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf666"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf668", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": 
{"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf669"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf689", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf688"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf690"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf691", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf688"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf692"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf695", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf694"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf696"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf697", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf694"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf698"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf700", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf701"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf722", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf721"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf723"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf724", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf721"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf725"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf728", "node": 
{"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf727"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf729"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf730", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf727"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf731"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf733", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf734"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf754", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf753"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf755"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf756", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf753"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf757"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf760", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf759"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf761"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf762", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf759"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf763"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf765", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, 
"kind": 1}], "outputs": [{"as_tensor": {"name": "buf766"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf787", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf786"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf788"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf789", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf786"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf790"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf793", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf792"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf794"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf795", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf792"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf796"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf798", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf799"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf819", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf818"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf820"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf821", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf818"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf822"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf825", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf824"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, 
{"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf826"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf827", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf824"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf828"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf830", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf831"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf852", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf851"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf853"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf854", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf851"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf855"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf858", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf857"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf859"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf860", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf857"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf861"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf863", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf864"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf884", "node": {"target": 
"aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf883"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf885"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf886", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf883"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf887"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf890", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf889"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf891"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf892", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf889"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf893"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf895", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf896"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf917", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf916"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf918"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf919", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf916"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf920"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf923", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf922"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 
1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf924"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf925", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf922"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf926"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf928", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf929"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf949", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf948"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf950"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf951", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf948"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf952"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf955", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf954"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf956"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf957", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf954"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf958"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf960", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf961"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf982", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf981"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", 
"arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf983"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf984", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf981"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf985"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf988", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf987"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf989"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf990", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf987"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf991"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf993", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf994"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1014", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1013"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1015"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1016", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1013"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1017"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1020", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1019"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1021"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1022", "node": {"target": 
"aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1019"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1023"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1025", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1026"}}], "metadata": {}, "is_hop_single_tensor_return": null}}]} \ No newline at end of file diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000000..7fbd565cff5 --- /dev/null +++ b/config.yaml @@ -0,0 +1,5 @@ +base: + model_class: llama3_2 + checkpoint: /home/gasoonjia//consolidated.00.pth + params: /home/gasoonjia/executorch/params.json + metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' diff --git a/export_aoti.py b/export_aoti.py index 66824bbc3b6..c1c24d212ef 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -242,7 +242,7 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): }, "llama31": { "model_class": Llama31, - "input_shapes": [(1, 128)], # batch_size=1, sequence_length=128 + "input_shapes": [(1, 32)], # batch_size=1, sequence_length=128 "device": "cuda", "description": "Llama 3.1 model with KV cache disabled", }, @@ -269,7 +269,14 @@ def get_model_and_inputs( model = model_class().to(device).eval() # Create example inputs (support multiple inputs) - example_inputs = tuple(torch.randn(*shape, device=device) for shape in input_shapes) + example_inputs = tuple( + ( + torch.randint(0, 10000, size=shape, device=device) + if model_name == "llama31" + else torch.randn(*shape, device=device) + ) + for shape in input_shapes + ) return model, example_inputs @@ -304,7 +311,9 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p # 1. torch.export: Defines the program with the ATen operator set. print("Step 1: Converting to ATen dialect...") - aten_dialect = export(model, example_inputs) + with torch.no_grad(): + # from torch.export._trace import _export + aten_dialect = export(model, example_inputs, strict=False) # print(aten_dialect) # exit(0) diff --git a/load_saved_config_example.py b/load_saved_config_example.py new file mode 100644 index 00000000000..95c2c9a07bd --- /dev/null +++ b/load_saved_config_example.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +""" +Example script showing how to load a saved LLM config and use it. 
+""" + +import os +import sys + +# Add the executorch path to import modules +sys.path.append("/home/gasoonjia/executorch") + +from executorch.examples.models.llama.export_llama_lib import export_llama +from executorch.extension.llm.export.config.llm_config import LlmConfig +from executorch.extension.llm.export.export_llm import ( + load_config_from_file, + save_config_to_file, +) + + +def load_and_use_saved_config(): + """Load a previously saved config and use it for export.""" + + # Method 1: Load from a saved YAML file + try: + config_obj = load_config_from_file("used_config_llama3.yaml") + print("✓ Successfully loaded config from used_config_llama3.yaml") + + # Optional: Modify the loaded config + print("Original quantization mode:", config_obj.quantization.qmode) + config_obj.quantization.qmode = "8da4w" # Change quantization + config_obj.debug.verbose = True # Enable verbose logging + print("Modified quantization mode:", config_obj.quantization.qmode) + + # Use the config for export + print("Starting export with loaded config...") + output_file = export_llama(config_obj) + print(f"✓ Export completed! Output: {output_file}") + + except FileNotFoundError: + print("❌ Config file 'used_config_llama3.yaml' not found.") + print("First save a config by running the main export script.") + return False + + return True + + +def create_and_save_custom_config(): + """Create a custom config and save it.""" + + # Create a new config from scratch + custom_config = LlmConfig() + + # Configure the model + custom_config.base.model_class = "llama3" + custom_config.base.checkpoint = ( + "/path/to/your/checkpoint.pth" # Set your checkpoint path + ) + + # Configure model settings + custom_config.model.use_kv_cache = True + custom_config.model.use_sdpa_with_kv_cache = True + custom_config.model.dtype_override = "fp32" + + # Configure export settings + custom_config.export.max_seq_length = 2048 + custom_config.export.output_dir = "./outputs" + + # Configure backend + custom_config.backend.xnnpack.enabled = True + custom_config.backend.xnnpack.extended_ops = True + + # Configure quantization + custom_config.quantization.qmode = "8da4w" + + # Configure debug + custom_config.debug.verbose = True + + # Save the custom config + config_filename = "my_custom_llama_config.yaml" + save_config_to_file(custom_config, config_filename) + print(f"✓ Custom config saved to {config_filename}") + + # Load it back to verify + loaded_config = load_config_from_file(config_filename) + print("✓ Verified: Config loaded successfully") + + return loaded_config + + +def main(): + print("=== LLM Config Load/Save Examples ===\n") + + # Example 1: Try to load a previously saved config + print("1. Attempting to load saved config...") + success = load_and_use_saved_config() + + if not success: + print("\n2. Creating and saving a custom config...") + custom_config = create_and_save_custom_config() + + print("\n3. Using the custom config for export...") + try: + output_file = export_llama(custom_config) + print(f"✓ Export completed with custom config! 
+        except Exception as e:
+            print(f"❌ Export failed: {e}")
+            print("Make sure to set a valid checkpoint path in the config.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/outputs/2025-09-03/14-45-08/.hydra/config.yaml b/outputs/2025-09-03/14-45-08/.hydra/config.yaml new file mode 100644 index 00000000000..34a34cf92f9 --- /dev/null +++ b/outputs/2025-09-03/14-45-08/.hydra/config.yaml @@ -0,0 +1,73 @@ +base: + model_class: llama3_1 + params: /home/gasoonjia/Llama-3.1-8B/original/params.json + checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + checkpoint_dir: null + adapter_checkpoint: null + adapter_config: null + tokenizer_path: null + metadata: null + use_lora: 0 + fairseq2: false + preq_mode: null + preq_group_size: 32 + preq_embedding_quantize: 8,0 +model: + dtype_override: fp32 + enable_dynamic_shape: true + use_shared_embedding: false + use_sdpa_with_kv_cache: false + expand_rope_table: false + use_attention_sink: null + output_prune_map: null + input_prune_map: null + use_kv_cache: false + quantize_kv_cache: false + local_global_attention: null +export: + max_seq_length: 128 + max_context_length: 128 + output_dir: . + output_name: null + so_library: null + export_only: false + foundation_weights_file: null +debug: + profile_memory: false + profile_path: null + generate_etrecord: false + generate_full_logits: false + verbose: false +quantization: + qmode: null + embedding_quantize: null + pt2e_quantize: null + group_size: null + use_spin_quant: null + use_qat: false + calibration_tasks: null + calibration_limit: null + calibration_seq_length: null + calibration_data: Once upon a time +backend: + xnnpack: + enabled: false + extended_ops: false + coreml: + enabled: false + enable_state: false + preserve_sdpa: false + quantize: null + ios: 15 + compute_units: cpu_only + vulkan: + enabled: false + qnn: + enabled: false + use_sha: false + soc_model: SM8650 + use_qnn_sha: false + optimized_rotation_path: null + num_sharding: 0 + mps: + enabled: false diff --git a/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml b/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml new file mode 100644 index 00000000000..c2e16273566 --- /dev/null +++ b/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml @@ -0,0 +1,154 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ???
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - ++base.model_class=llama3_1 + - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json + job: + name: export_llm + chdir: null + override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json + id: ??? + num: ??? + config_name: llm_config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /home/gasoonjia/executorch + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: '' + schema: structured + provider: schema + output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/14-45-08 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml b/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml new file mode 100644 index 00000000000..acc7258c572 --- /dev/null +++ b/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml @@ -0,0 +1,3 @@ +- ++base.model_class=llama3_1 +- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth +- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json diff --git a/outputs/2025-09-03/14-45-08/export_llm.log b/outputs/2025-09-03/14-45-08/export_llm.log new file mode 100644 index 00000000000..574ad77780e --- /dev/null +++ b/outputs/2025-09-03/14-45-08/export_llm.log @@ -0,0 +1,40 @@ +[2025-09-03 14:45:08,888][root][INFO] - Applying quantizers: [] +[2025-09-03 14:45:17,670][root][INFO] - Checkpoint dtype: torch.bfloat16 +[2025-09-03 14:45:17,672][root][INFO] - Model after source transforms: Transformer( + (tok_embeddings): Embedding(128256, 4096) + (layers): ModuleList( + (0-31): 32 x TransformerBlock( + (attention): AttentionMHA( + (wq): Linear(in_features=4096, out_features=4096, bias=False) + (wk): Linear(in_features=4096, out_features=1024, bias=False) + (wv): Linear(in_features=4096, out_features=1024, bias=False) + (wo): Linear(in_features=4096, out_features=4096, bias=False) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + ) + (feed_forward): FeedForward( + (w1): Linear(in_features=4096, out_features=14336, bias=False) + (w2): Linear(in_features=14336, out_features=4096, bias=False) + (w3): Linear(in_features=4096, out_features=14336, bias=False) + ) + 
(attention_norm): RMSNorm() + (ffn_norm): RMSNorm() + ) + ) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + (norm): RMSNorm() + (output): Linear(in_features=4096, out_features=128256, bias=False) +) +[2025-09-03 14:45:17,673][root][INFO] - Exporting with: +[2025-09-03 14:45:17,674][root][INFO] - inputs: (tensor([[1, 2, 3]]),) +[2025-09-03 14:45:17,674][root][INFO] - kwargs: None +[2025-09-03 14:45:17,674][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) +[2025-09-03 14:45:33,074][root][INFO] - Running canonical pass: RemoveRedundantTransposes +[2025-09-03 14:45:33,152][root][INFO] - Using pt2e [] to quantizing the model... +[2025-09-03 14:45:33,152][root][INFO] - No quantizer provided, passing... +[2025-09-03 14:46:55,091][root][INFO] - Lowering model using following partitioner(s): +[2025-09-03 14:47:47,454][root][INFO] - Required memory for activation in bytes: [0, 26074624] +[2025-09-03 14:48:03,642][root][INFO] - Saved exported program to ./llama3_1.pte diff --git a/outputs/2025-09-03/15-17-23/.hydra/config.yaml b/outputs/2025-09-03/15-17-23/.hydra/config.yaml new file mode 100644 index 00000000000..74c7f49c21f --- /dev/null +++ b/outputs/2025-09-03/15-17-23/.hydra/config.yaml @@ -0,0 +1,74 @@ +base: + model_class: llama3_1 + params: /home/gasoonjia/Llama-3.1-8B/original/params.json + checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + checkpoint_dir: null + adapter_checkpoint: null + adapter_config: null + tokenizer_path: null + metadata: null + use_lora: 0 + fairseq2: false + preq_mode: null + preq_group_size: 32 + preq_embedding_quantize: 8,0 +model: + dtype_override: fp32 + enable_dynamic_shape: true + use_shared_embedding: false + use_sdpa_with_kv_cache: false + expand_rope_table: false + use_attention_sink: null + output_prune_map: null + input_prune_map: null + use_kv_cache: false + quantize_kv_cache: false + local_global_attention: null +export: + max_seq_length: 128 + max_context_length: 128 + output_dir: . 
+ output_name: null + so_library: null + export_only: false + foundation_weights_file: null +debug: + profile_memory: false + profile_path: null + generate_etrecord: false + generate_full_logits: false + verbose: false +quantization: + qmode: null + embedding_quantize: null + pt2e_quantize: null + group_size: null + use_spin_quant: null + use_qat: false + calibration_tasks: null + calibration_limit: null + calibration_seq_length: null + calibration_data: Once upon a time +backend: + xnnpack: + enabled: false + extended_ops: false + coreml: + enabled: false + enable_state: false + preserve_sdpa: false + quantize: null + ios: 15 + compute_units: cpu_only + vulkan: + enabled: false + qnn: + enabled: false + use_sha: false + soc_model: SM8650 + use_qnn_sha: false + optimized_rotation_path: null + num_sharding: 0 + mps: + enabled: false +save_exported_program: true diff --git a/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml b/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml new file mode 100644 index 00000000000..d224649ae3a --- /dev/null +++ b/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml @@ -0,0 +1,157 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - ++base.model_class=llama3_1 + - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json + - ++model.use_kv_cache=False + - ++model.use_sdpa_with_kv_cache=False + - ++save_exported_program=True + job: + name: export_llm + chdir: null + override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True + id: ??? + num: ??? + config_name: llm_config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /home/gasoonjia/executorch + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: '' + schema: structured + provider: schema + output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/15-17-23 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml b/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml new file mode 100644 index 00000000000..fccd73d94f1 --- /dev/null +++ b/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml @@ -0,0 +1,6 @@ +- ++base.model_class=llama3_1 +- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth +- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json +- ++model.use_kv_cache=False +- ++model.use_sdpa_with_kv_cache=False +- ++save_exported_program=True diff --git a/outputs/2025-09-03/15-17-23/export_llm.log b/outputs/2025-09-03/15-17-23/export_llm.log new file mode 100644 index 00000000000..9cdb4c31406 --- /dev/null +++ b/outputs/2025-09-03/15-17-23/export_llm.log @@ -0,0 +1,38 @@ +[2025-09-03 15:17:23,719][root][INFO] - Applying quantizers: [] +[2025-09-03 15:17:25,710][root][INFO] - Checkpoint dtype: torch.bfloat16 +[2025-09-03 15:17:25,711][root][INFO] - Model after source transforms: Transformer( + (tok_embeddings): Embedding(128256, 4096) + (layers): ModuleList( + (0-31): 32 x TransformerBlock( + (attention): AttentionMHA( + (wq): Linear(in_features=4096, out_features=4096, bias=False) + (wk): Linear(in_features=4096, out_features=1024, bias=False) + (wv): Linear(in_features=4096, out_features=1024, bias=False) + (wo): Linear(in_features=4096, out_features=4096, bias=False) + 
(rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + ) + (feed_forward): FeedForward( + (w1): Linear(in_features=4096, out_features=14336, bias=False) + (w2): Linear(in_features=14336, out_features=4096, bias=False) + (w3): Linear(in_features=4096, out_features=14336, bias=False) + ) + (attention_norm): RMSNorm() + (ffn_norm): RMSNorm() + ) + ) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + (norm): RMSNorm() + (output): Linear(in_features=4096, out_features=128256, bias=False) +) +[2025-09-03 15:17:25,712][root][INFO] - Exporting with: +[2025-09-03 15:17:25,712][root][INFO] - inputs: (tensor([[1, 2, 3]]),) +[2025-09-03 15:17:25,712][root][INFO] - kwargs: None +[2025-09-03 15:17:25,713][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) +[2025-09-03 15:17:39,308][root][INFO] - Running canonical pass: RemoveRedundantTransposes +[2025-09-03 15:17:39,376][root][INFO] - Using pt2e [] to quantizing the model... +[2025-09-03 15:17:39,377][root][INFO] - No quantizer provided, passing... +[2025-09-03 15:18:45,017][root][INFO] - Lowering model using following partitioner(s):
diff --git a/outputs/2025-09-03/16-30-46/.hydra/config.yaml b/outputs/2025-09-03/16-30-46/.hydra/config.yaml new file mode 100644 index 00000000000..34a34cf92f9 --- /dev/null +++ b/outputs/2025-09-03/16-30-46/.hydra/config.yaml @@ -0,0 +1,73 @@ +base: + model_class: llama3_1 + params: /home/gasoonjia/Llama-3.1-8B/original/params.json + checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + checkpoint_dir: null + adapter_checkpoint: null + adapter_config: null + tokenizer_path: null + metadata: null + use_lora: 0 + fairseq2: false + preq_mode: null + preq_group_size: 32 + preq_embedding_quantize: 8,0 +model: + dtype_override: fp32 + enable_dynamic_shape: true +
use_shared_embedding: false + use_sdpa_with_kv_cache: false + expand_rope_table: false + use_attention_sink: null + output_prune_map: null + input_prune_map: null + use_kv_cache: false + quantize_kv_cache: false + local_global_attention: null +export: + max_seq_length: 128 + max_context_length: 128 + output_dir: . + output_name: null + so_library: null + export_only: false + foundation_weights_file: null +debug: + profile_memory: false + profile_path: null + generate_etrecord: false + generate_full_logits: false + verbose: false +quantization: + qmode: null + embedding_quantize: null + pt2e_quantize: null + group_size: null + use_spin_quant: null + use_qat: false + calibration_tasks: null + calibration_limit: null + calibration_seq_length: null + calibration_data: Once upon a time +backend: + xnnpack: + enabled: false + extended_ops: false + coreml: + enabled: false + enable_state: false + preserve_sdpa: false + quantize: null + ios: 15 + compute_units: cpu_only + vulkan: + enabled: false + qnn: + enabled: false + use_sha: false + soc_model: SM8650 + use_qnn_sha: false + optimized_rotation_path: null + num_sharding: 0 + mps: + enabled: false diff --git a/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml b/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml new file mode 100644 index 00000000000..9960f35db88 --- /dev/null +++ b/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml @@ -0,0 +1,156 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - ++base.model_class=llama3_1 + - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json + - ++model.use_kv_cache=False + - ++model.use_sdpa_with_kv_cache=False + job: + name: export_llm + chdir: null + override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False + id: ??? + num: ??? + config_name: llm_config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /home/gasoonjia/executorch + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: '' + schema: structured + provider: schema + output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/16-30-46 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml b/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml new file mode 100644 index 00000000000..369364d85c9 --- /dev/null +++ b/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml @@ -0,0 +1,5 @@ +- ++base.model_class=llama3_1 +- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth +- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json +- ++model.use_kv_cache=False +- ++model.use_sdpa_with_kv_cache=False diff --git a/outputs/2025-09-03/16-30-46/export_llm.log b/outputs/2025-09-03/16-30-46/export_llm.log new file mode 100644 index 00000000000..ebb9f84e570 --- /dev/null +++ b/outputs/2025-09-03/16-30-46/export_llm.log @@ -0,0 +1,40 @@ +[2025-09-03 16:30:46,353][root][INFO] - Applying quantizers: [] +[2025-09-03 16:30:52,013][root][INFO] - Checkpoint dtype: torch.bfloat16 +[2025-09-03 16:30:52,014][root][INFO] - Model after source transforms: Transformer( + (tok_embeddings): Embedding(128256, 4096) + (layers): ModuleList( + (0-31): 32 x TransformerBlock( + (attention): AttentionMHA( + (wq): Linear(in_features=4096, out_features=4096, bias=False) + (wk): Linear(in_features=4096, out_features=1024, bias=False) + (wv): Linear(in_features=4096, out_features=1024, bias=False) + (wo): Linear(in_features=4096, out_features=4096, bias=False) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + ) + (feed_forward): FeedForward( + (w1): 
Linear(in_features=4096, out_features=14336, bias=False) + (w2): Linear(in_features=14336, out_features=4096, bias=False) + (w3): Linear(in_features=4096, out_features=14336, bias=False) + ) + (attention_norm): RMSNorm() + (ffn_norm): RMSNorm() + ) + ) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + (norm): RMSNorm() + (output): Linear(in_features=4096, out_features=128256, bias=False) +) +[2025-09-03 16:30:52,015][root][INFO] - Exporting with: +[2025-09-03 16:30:52,016][root][INFO] - inputs: (tensor([[1, 2, 3]]),) +[2025-09-03 16:30:52,016][root][INFO] - kwargs: None +[2025-09-03 16:30:52,016][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) +[2025-09-03 16:31:06,978][root][INFO] - Running canonical pass: RemoveRedundantTransposes +[2025-09-03 16:31:07,056][root][INFO] - Using pt2e [] to quantizing the model... +[2025-09-03 16:31:07,056][root][INFO] - No quantizer provided, passing... +[2025-09-03 16:32:22,170][root][INFO] - Lowering model using following partitioner(s): +[2025-09-03 16:33:19,737][root][INFO] - Required memory for activation in bytes: [0, 26074624] +[2025-09-03 16:33:33,215][root][INFO] - Saved exported program to ./llama3_1.pte diff --git a/saved_llm_config.yaml b/saved_llm_config.yaml new file mode 100644 index 00000000000..34a34cf92f9 --- /dev/null +++ b/saved_llm_config.yaml @@ -0,0 +1,73 @@ +base: + model_class: llama3_1 + params: /home/gasoonjia/Llama-3.1-8B/original/params.json + checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + checkpoint_dir: null + adapter_checkpoint: null + adapter_config: null + tokenizer_path: null + metadata: null + use_lora: 0 + fairseq2: false + preq_mode: null + preq_group_size: 32 + preq_embedding_quantize: 8,0 +model: + dtype_override: fp32 + enable_dynamic_shape: true + use_shared_embedding: false + use_sdpa_with_kv_cache: false + expand_rope_table: false + use_attention_sink: null + output_prune_map: null + input_prune_map: null + use_kv_cache: false + quantize_kv_cache: false + local_global_attention: null +export: + max_seq_length: 128 + max_context_length: 128 + output_dir: . 
+ output_name: null
+ so_library: null
+ export_only: false
+ foundation_weights_file: null
+debug:
+ profile_memory: false
+ profile_path: null
+ generate_etrecord: false
+ generate_full_logits: false
+ verbose: false
+quantization:
+ qmode: null
+ embedding_quantize: null
+ pt2e_quantize: null
+ group_size: null
+ use_spin_quant: null
+ use_qat: false
+ calibration_tasks: null
+ calibration_limit: null
+ calibration_seq_length: null
+ calibration_data: Once upon a time
+backend:
+ xnnpack:
+ enabled: false
+ extended_ops: false
+ coreml:
+ enabled: false
+ enable_state: false
+ preserve_sdpa: false
+ quantize: null
+ ios: 15
+ compute_units: cpu_only
+ vulkan:
+ enabled: false
+ qnn:
+ enabled: false
+ use_sha: false
+ soc_model: SM8650
+ use_qnn_sha: false
+ optimized_rotation_path: null
+ num_sharding: 0
+ mps:
+ enabled: false

From 01b306d2e5522ee9396cdf714eaf86440db0e84b Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 4 Sep 2025 00:11:40 -0700
Subject: [PATCH 33/50] remove unnecessary data from GitHub

Delete the Hydra output directories, saved example configs, and the
config load/save example script that were unnecessarily checked in, and
ignore the generated *wrapper.json and aoti_debug_data* files going
forward.
---
 .gitignore | 2 +
 config.yaml | 5 -
 export_and_run_aoti.sh | 1 +
 load_saved_config_example.py | 111 -------------
 .../2025-09-03/14-45-08/.hydra/config.yaml | 73 --------
 outputs/2025-09-03/14-45-08/.hydra/hydra.yaml | 154 -----------------
 .../2025-09-03/14-45-08/.hydra/overrides.yaml | 3 -
 outputs/2025-09-03/14-45-08/export_llm.log | 40 -----
 .../2025-09-03/15-17-23/.hydra/config.yaml | 74 ---------
 outputs/2025-09-03/15-17-23/.hydra/hydra.yaml | 157 ------------------
 .../2025-09-03/15-17-23/.hydra/overrides.yaml | 6 -
 outputs/2025-09-03/15-17-23/export_llm.log | 38 -----
 .../2025-09-03/15-30-13/.hydra/config.yaml | 74 ---------
 outputs/2025-09-03/15-30-13/.hydra/hydra.yaml | 157 ------------------
 .../2025-09-03/15-30-13/.hydra/overrides.yaml | 6 -
 outputs/2025-09-03/15-30-13/export_llm.log | 0
 .../2025-09-03/16-25-46/.hydra/config.yaml | 74 ---------
 outputs/2025-09-03/16-25-46/.hydra/hydra.yaml | 157 ------------------
 .../2025-09-03/16-25-46/.hydra/overrides.yaml | 6 -
 outputs/2025-09-03/16-25-46/export_llm.log | 0
 .../2025-09-03/16-29-28/.hydra/config.yaml | 74 ---------
 outputs/2025-09-03/16-29-28/.hydra/hydra.yaml | 157 ------------------
 .../2025-09-03/16-29-28/.hydra/overrides.yaml | 6 -
 outputs/2025-09-03/16-29-28/export_llm.log | 0
 .../2025-09-03/16-30-46/.hydra/config.yaml | 73 --------
 outputs/2025-09-03/16-30-46/.hydra/hydra.yaml | 156 -----------------
 .../2025-09-03/16-30-46/.hydra/overrides.yaml | 5 -
 outputs/2025-09-03/16-30-46/export_llm.log | 40 -----
 saved_llm_config.yaml | 73 --------
 29 files changed, 3 insertions(+), 1719 deletions(-)
 delete mode 100644 config.yaml
 delete mode 100644 load_saved_config_example.py
 delete mode 100644 outputs/2025-09-03/14-45-08/.hydra/config.yaml
 delete mode 100644 outputs/2025-09-03/14-45-08/.hydra/hydra.yaml
 delete mode 100644 outputs/2025-09-03/14-45-08/.hydra/overrides.yaml
 delete mode 100644 outputs/2025-09-03/14-45-08/export_llm.log
 delete mode 100644 outputs/2025-09-03/15-17-23/.hydra/config.yaml
 delete mode 100644 outputs/2025-09-03/15-17-23/.hydra/hydra.yaml
 delete mode 100644 outputs/2025-09-03/15-17-23/.hydra/overrides.yaml
 delete mode 100644 outputs/2025-09-03/15-17-23/export_llm.log
 delete mode 100644 outputs/2025-09-03/15-30-13/.hydra/config.yaml
 delete mode 100644 outputs/2025-09-03/15-30-13/.hydra/hydra.yaml
 delete mode 100644 outputs/2025-09-03/15-30-13/.hydra/overrides.yaml
 delete mode 100644 outputs/2025-09-03/15-30-13/export_llm.log
 delete mode 100644 outputs/2025-09-03/16-25-46/.hydra/config.yaml
 delete
mode 100644 outputs/2025-09-03/16-25-46/.hydra/hydra.yaml delete mode 100644 outputs/2025-09-03/16-25-46/.hydra/overrides.yaml delete mode 100644 outputs/2025-09-03/16-25-46/export_llm.log delete mode 100644 outputs/2025-09-03/16-29-28/.hydra/config.yaml delete mode 100644 outputs/2025-09-03/16-29-28/.hydra/hydra.yaml delete mode 100644 outputs/2025-09-03/16-29-28/.hydra/overrides.yaml delete mode 100644 outputs/2025-09-03/16-29-28/export_llm.log delete mode 100644 outputs/2025-09-03/16-30-46/.hydra/config.yaml delete mode 100644 outputs/2025-09-03/16-30-46/.hydra/hydra.yaml delete mode 100644 outputs/2025-09-03/16-30-46/.hydra/overrides.yaml delete mode 100644 outputs/2025-09-03/16-30-46/export_llm.log delete mode 100644 saved_llm_config.yaml diff --git a/.gitignore b/.gitignore index 2e9b9c948a2..295c352adbc 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,8 @@ tokenizer.json *kernel.cpp *wrapper_metadata.json *wrapper.cpp +*wrapper.json + aoti_debug_data* # Editor temporaries diff --git a/config.yaml b/config.yaml deleted file mode 100644 index 7fbd565cff5..00000000000 --- a/config.yaml +++ /dev/null @@ -1,5 +0,0 @@ -base: - model_class: llama3_2 - checkpoint: /home/gasoonjia//consolidated.00.pth - params: /home/gasoonjia/executorch/params.json - metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 7a60cb66be5..cb5595fb8b5 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -97,6 +97,7 @@ cleanup_temp_files() { rm -f *kernel.cpp rm -f *wrapper_metadata.json rm -f *wrapper.cpp + rm -f *wrapper.json rm -f aoti_intermediate_output.txt echo "Cleanup completed." diff --git a/load_saved_config_example.py b/load_saved_config_example.py deleted file mode 100644 index 95c2c9a07bd..00000000000 --- a/load_saved_config_example.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python3 -""" -Example script showing how to load a saved LLM config and use it. -""" - -import os -import sys - -# Add the executorch path to import modules -sys.path.append("/home/gasoonjia/executorch") - -from executorch.examples.models.llama.export_llama_lib import export_llama -from executorch.extension.llm.export.config.llm_config import LlmConfig -from executorch.extension.llm.export.export_llm import ( - load_config_from_file, - save_config_to_file, -) - - -def load_and_use_saved_config(): - """Load a previously saved config and use it for export.""" - - # Method 1: Load from a saved YAML file - try: - config_obj = load_config_from_file("used_config_llama3.yaml") - print("✓ Successfully loaded config from used_config_llama3.yaml") - - # Optional: Modify the loaded config - print("Original quantization mode:", config_obj.quantization.qmode) - config_obj.quantization.qmode = "8da4w" # Change quantization - config_obj.debug.verbose = True # Enable verbose logging - print("Modified quantization mode:", config_obj.quantization.qmode) - - # Use the config for export - print("Starting export with loaded config...") - output_file = export_llama(config_obj) - print(f"✓ Export completed! 
Output: {output_file}") - - except FileNotFoundError: - print("❌ Config file 'used_config_llama3.yaml' not found.") - print("First save a config by running the main export script.") - return False - - return True - - -def create_and_save_custom_config(): - """Create a custom config and save it.""" - - # Create a new config from scratch - custom_config = LlmConfig() - - # Configure the model - custom_config.base.model_class = "llama3" - custom_config.base.checkpoint = ( - "/path/to/your/checkpoint.pth" # Set your checkpoint path - ) - - # Configure model settings - custom_config.model.use_kv_cache = True - custom_config.model.use_sdpa_with_kv_cache = True - custom_config.model.dtype_override = "fp32" - - # Configure export settings - custom_config.export.max_seq_length = 2048 - custom_config.export.output_dir = "./outputs" - - # Configure backend - custom_config.backend.xnnpack.enabled = True - custom_config.backend.xnnpack.extended_ops = True - - # Configure quantization - custom_config.quantization.qmode = "8da4w" - - # Configure debug - custom_config.debug.verbose = True - - # Save the custom config - config_filename = "my_custom_llama_config.yaml" - save_config_to_file(custom_config, config_filename) - print(f"✓ Custom config saved to {config_filename}") - - # Load it back to verify - loaded_config = load_config_from_file(config_filename) - print("✓ Verified: Config loaded successfully") - - return loaded_config - - -def main(): - print("=== LLM Config Load/Save Examples ===\n") - - # Example 1: Try to load a previously saved config - print("1. Attempting to load saved config...") - success = load_and_use_saved_config() - - if not success: - print("\n2. Creating and saving a custom config...") - custom_config = create_and_save_custom_config() - - print("\n3. Using the custom config for export...") - try: - output_file = export_llama(custom_config) - print(f"✓ Export completed with custom config! Output: {output_file}") - except Exception as e: - print(f"❌ Export failed: {e}") - print("Make sure to set a valid checkpoint path in the config.") - - -if __name__ == "__main__": - main() diff --git a/outputs/2025-09-03/14-45-08/.hydra/config.yaml b/outputs/2025-09-03/14-45-08/.hydra/config.yaml deleted file mode 100644 index 34a34cf92f9..00000000000 --- a/outputs/2025-09-03/14-45-08/.hydra/config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . 
- output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false diff --git a/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml b/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml deleted file mode 100644 index c2e16273566..00000000000 --- a/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml +++ /dev/null @@ -1,154 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/14-45-08 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml b/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml deleted file mode 100644 index acc7258c572..00000000000 --- a/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json diff --git a/outputs/2025-09-03/14-45-08/export_llm.log b/outputs/2025-09-03/14-45-08/export_llm.log deleted file mode 100644 index 574ad77780e..00000000000 --- a/outputs/2025-09-03/14-45-08/export_llm.log +++ /dev/null @@ -1,40 +0,0 @@ -[2025-09-03 14:45:08,888][root][INFO] - Applying quantizers: [] -[2025-09-03 14:45:17,670][root][INFO] - Checkpoint dtype: torch.bfloat16 -[2025-09-03 14:45:17,672][root][INFO] - Model after source transforms: Transformer( - (tok_embeddings): Embedding(128256, 4096) - (layers): ModuleList( - (0-31): 32 x TransformerBlock( - (attention): AttentionMHA( - (wq): Linear(in_features=4096, out_features=4096, bias=False) - (wk): Linear(in_features=4096, out_features=1024, bias=False) - (wv): Linear(in_features=4096, out_features=1024, bias=False) - (wo): Linear(in_features=4096, out_features=4096, bias=False) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - ) - (feed_forward): FeedForward( - (w1): Linear(in_features=4096, out_features=14336, bias=False) - (w2): Linear(in_features=14336, out_features=4096, bias=False) - (w3): Linear(in_features=4096, out_features=14336, bias=False) - ) - 
(attention_norm): RMSNorm() - (ffn_norm): RMSNorm() - ) - ) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - (norm): RMSNorm() - (output): Linear(in_features=4096, out_features=128256, bias=False) -) -[2025-09-03 14:45:17,673][root][INFO] - Exporting with: -[2025-09-03 14:45:17,674][root][INFO] - inputs: (tensor([[1, 2, 3]]),) -[2025-09-03 14:45:17,674][root][INFO] - kwargs: None -[2025-09-03 14:45:17,674][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) -[2025-09-03 14:45:33,074][root][INFO] - Running canonical pass: RemoveRedundantTransposes -[2025-09-03 14:45:33,152][root][INFO] - Using pt2e [] to quantizing the model... -[2025-09-03 14:45:33,152][root][INFO] - No quantizer provided, passing... -[2025-09-03 14:46:55,091][root][INFO] - Lowering model using following partitioner(s): -[2025-09-03 14:47:47,454][root][INFO] - Required memory for activation in bytes: [0, 26074624] -[2025-09-03 14:48:03,642][root][INFO] - Saved exported program to ./llama3_1.pte diff --git a/outputs/2025-09-03/15-17-23/.hydra/config.yaml b/outputs/2025-09-03/15-17-23/.hydra/config.yaml deleted file mode 100644 index 74c7f49c21f..00000000000 --- a/outputs/2025-09-03/15-17-23/.hydra/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . 
- output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false -save_exported_program: true diff --git a/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml b/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml deleted file mode 100644 index d224649ae3a..00000000000 --- a/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml +++ /dev/null @@ -1,157 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - - ++save_exported_program=True - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/15-17-23 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml b/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml deleted file mode 100644 index fccd73d94f1..00000000000 --- a/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False -- ++save_exported_program=True diff --git a/outputs/2025-09-03/15-17-23/export_llm.log b/outputs/2025-09-03/15-17-23/export_llm.log deleted file mode 100644 index 9cdb4c31406..00000000000 --- a/outputs/2025-09-03/15-17-23/export_llm.log +++ /dev/null @@ -1,38 +0,0 @@ -[2025-09-03 15:17:23,719][root][INFO] - Applying quantizers: [] -[2025-09-03 15:17:25,710][root][INFO] - Checkpoint dtype: torch.bfloat16 -[2025-09-03 15:17:25,711][root][INFO] - Model after source transforms: Transformer( - (tok_embeddings): Embedding(128256, 4096) - (layers): ModuleList( - (0-31): 32 x TransformerBlock( - (attention): AttentionMHA( - (wq): Linear(in_features=4096, out_features=4096, bias=False) - (wk): Linear(in_features=4096, out_features=1024, bias=False) - (wv): Linear(in_features=4096, out_features=1024, bias=False) - (wo): Linear(in_features=4096, out_features=4096, bias=False) 
- (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - ) - (feed_forward): FeedForward( - (w1): Linear(in_features=4096, out_features=14336, bias=False) - (w2): Linear(in_features=14336, out_features=4096, bias=False) - (w3): Linear(in_features=4096, out_features=14336, bias=False) - ) - (attention_norm): RMSNorm() - (ffn_norm): RMSNorm() - ) - ) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - (norm): RMSNorm() - (output): Linear(in_features=4096, out_features=128256, bias=False) -) -[2025-09-03 15:17:25,712][root][INFO] - Exporting with: -[2025-09-03 15:17:25,712][root][INFO] - inputs: (tensor([[1, 2, 3]]),) -[2025-09-03 15:17:25,712][root][INFO] - kwargs: None -[2025-09-03 15:17:25,713][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) -[2025-09-03 15:17:39,308][root][INFO] - Running canonical pass: RemoveRedundantTransposes -[2025-09-03 15:17:39,376][root][INFO] - Using pt2e [] to quantizing the model... -[2025-09-03 15:17:39,377][root][INFO] - No quantizer provided, passing... -[2025-09-03 15:18:45,017][root][INFO] - Lowering model using following partitioner(s): diff --git a/outputs/2025-09-03/15-30-13/.hydra/config.yaml b/outputs/2025-09-03/15-30-13/.hydra/config.yaml deleted file mode 100644 index 74c7f49c21f..00000000000 --- a/outputs/2025-09-03/15-30-13/.hydra/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . 
- output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false -save_exported_program: true diff --git a/outputs/2025-09-03/15-30-13/.hydra/hydra.yaml b/outputs/2025-09-03/15-30-13/.hydra/hydra.yaml deleted file mode 100644 index e13edc3e222..00000000000 --- a/outputs/2025-09-03/15-30-13/.hydra/hydra.yaml +++ /dev/null @@ -1,157 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - - ++save_exported_program=True - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/15-30-13 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/15-30-13/.hydra/overrides.yaml b/outputs/2025-09-03/15-30-13/.hydra/overrides.yaml deleted file mode 100644 index fccd73d94f1..00000000000 --- a/outputs/2025-09-03/15-30-13/.hydra/overrides.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False -- ++save_exported_program=True diff --git a/outputs/2025-09-03/15-30-13/export_llm.log b/outputs/2025-09-03/15-30-13/export_llm.log deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/outputs/2025-09-03/16-25-46/.hydra/config.yaml b/outputs/2025-09-03/16-25-46/.hydra/config.yaml deleted file mode 100644 index 74c7f49c21f..00000000000 --- a/outputs/2025-09-03/16-25-46/.hydra/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - 
use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . - output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false -save_exported_program: true diff --git a/outputs/2025-09-03/16-25-46/.hydra/hydra.yaml b/outputs/2025-09-03/16-25-46/.hydra/hydra.yaml deleted file mode 100644 index f3b218f45ca..00000000000 --- a/outputs/2025-09-03/16-25-46/.hydra/hydra.yaml +++ /dev/null @@ -1,157 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - - ++save_exported_program=True - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/16-25-46 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/16-25-46/.hydra/overrides.yaml b/outputs/2025-09-03/16-25-46/.hydra/overrides.yaml deleted file mode 100644 index fccd73d94f1..00000000000 --- a/outputs/2025-09-03/16-25-46/.hydra/overrides.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False -- ++save_exported_program=True diff --git a/outputs/2025-09-03/16-25-46/export_llm.log b/outputs/2025-09-03/16-25-46/export_llm.log deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/outputs/2025-09-03/16-29-28/.hydra/config.yaml b/outputs/2025-09-03/16-29-28/.hydra/config.yaml deleted file mode 100644 index 74c7f49c21f..00000000000 --- a/outputs/2025-09-03/16-29-28/.hydra/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - 
use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . - output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false -save_exported_program: true diff --git a/outputs/2025-09-03/16-29-28/.hydra/hydra.yaml b/outputs/2025-09-03/16-29-28/.hydra/hydra.yaml deleted file mode 100644 index 8490cd4d2cd..00000000000 --- a/outputs/2025-09-03/16-29-28/.hydra/hydra.yaml +++ /dev/null @@ -1,157 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - - ++save_exported_program=True - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/16-29-28 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/16-29-28/.hydra/overrides.yaml b/outputs/2025-09-03/16-29-28/.hydra/overrides.yaml deleted file mode 100644 index fccd73d94f1..00000000000 --- a/outputs/2025-09-03/16-29-28/.hydra/overrides.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False -- ++save_exported_program=True diff --git a/outputs/2025-09-03/16-29-28/export_llm.log b/outputs/2025-09-03/16-29-28/export_llm.log deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/outputs/2025-09-03/16-30-46/.hydra/config.yaml b/outputs/2025-09-03/16-30-46/.hydra/config.yaml deleted file mode 100644 index 34a34cf92f9..00000000000 --- a/outputs/2025-09-03/16-30-46/.hydra/config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - 
use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . - output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false diff --git a/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml b/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml deleted file mode 100644 index 9960f35db88..00000000000 --- a/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml +++ /dev/null @@ -1,156 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/16-30-46 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml b/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml deleted file mode 100644 index 369364d85c9..00000000000 --- a/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml +++ /dev/null @@ -1,5 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False diff --git a/outputs/2025-09-03/16-30-46/export_llm.log b/outputs/2025-09-03/16-30-46/export_llm.log deleted file mode 100644 index ebb9f84e570..00000000000 --- a/outputs/2025-09-03/16-30-46/export_llm.log +++ /dev/null @@ -1,40 +0,0 @@ -[2025-09-03 16:30:46,353][root][INFO] - Applying quantizers: [] -[2025-09-03 16:30:52,013][root][INFO] - Checkpoint dtype: torch.bfloat16 -[2025-09-03 16:30:52,014][root][INFO] - Model after source transforms: Transformer( - (tok_embeddings): Embedding(128256, 4096) - (layers): ModuleList( - (0-31): 32 x TransformerBlock( - (attention): AttentionMHA( - (wq): Linear(in_features=4096, out_features=4096, bias=False) - (wk): Linear(in_features=4096, out_features=1024, bias=False) - (wv): Linear(in_features=4096, out_features=1024, bias=False) - (wo): Linear(in_features=4096, out_features=4096, bias=False) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - ) - (feed_forward): FeedForward( 
- (w1): Linear(in_features=4096, out_features=14336, bias=False) - (w2): Linear(in_features=14336, out_features=4096, bias=False) - (w3): Linear(in_features=4096, out_features=14336, bias=False) - ) - (attention_norm): RMSNorm() - (ffn_norm): RMSNorm() - ) - ) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - (norm): RMSNorm() - (output): Linear(in_features=4096, out_features=128256, bias=False) -) -[2025-09-03 16:30:52,015][root][INFO] - Exporting with: -[2025-09-03 16:30:52,016][root][INFO] - inputs: (tensor([[1, 2, 3]]),) -[2025-09-03 16:30:52,016][root][INFO] - kwargs: None -[2025-09-03 16:30:52,016][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) -[2025-09-03 16:31:06,978][root][INFO] - Running canonical pass: RemoveRedundantTransposes -[2025-09-03 16:31:07,056][root][INFO] - Using pt2e [] to quantizing the model... -[2025-09-03 16:31:07,056][root][INFO] - No quantizer provided, passing... -[2025-09-03 16:32:22,170][root][INFO] - Lowering model using following partitioner(s): -[2025-09-03 16:33:19,737][root][INFO] - Required memory for activation in bytes: [0, 26074624] -[2025-09-03 16:33:33,215][root][INFO] - Saved exported program to ./llama3_1.pte diff --git a/saved_llm_config.yaml b/saved_llm_config.yaml deleted file mode 100644 index 34a34cf92f9..00000000000 --- a/saved_llm_config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . 
- output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false

From 3f22996f7f3bb911ade7d0def9402ccf678351ca Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 4 Sep 2025 00:12:14 -0700
Subject: [PATCH 34/50] remove unnecessary data from GitHub

---
 ...akzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json

diff --git a/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json b/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json
deleted file mode 100644
index 8c1a9f6d812..00000000000
--- a/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json
+++ /dev/null
@@ -1 +0,0 @@
-{"nodes": [...]}  (single-line generated graph dump elided: dozens of near-identical "aten::slice_copy.Tensor" node entries slicing intermediate buffers buf5 through buf825 along dim 3 at offsets 0/32/64; the blob is truncated mid-entry in the source)
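The export_llm.log removed above records the export call: example inputs (tensor([[1, 2, 3]]),) with dynamic shapes ({1: Dim('token_dim', min=0, max=127)},), i.e. dimension 1 of the token tensor may vary up to max_seq_length - 1. For readers unfamiliar with that notation, here is a minimal, self-contained sketch of how such a dynamic dimension is declared with torch.export; TinyModel is a toy stand-in for illustration, not the llama3_1 model exported in the log.

import torch
from torch.export import Dim, export

# Toy stand-in for the exported model; the real log exported llama3_1.
class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(128, 16)
        self.out = torch.nn.Linear(16, 128, bias=False)

    def forward(self, tokens):  # tokens: int64 tensor of shape (1, seq_len)
        return self.out(self.emb(tokens))

# Mirrors the logged call: example input (tensor([[1, 2, 3]]),) and
# dynamic shapes ({1: Dim('token_dim', min=0, max=127)},) -- dim 1 of the
# first positional argument may range up to max_seq_length - 1 = 127.
token_dim = Dim("token_dim", min=0, max=127)
ep = export(
    TinyModel(),
    (torch.tensor([[1, 2, 3]]),),
    dynamic_shapes=({1: token_dim},),
)
print(ep)  # the ExportedProgram, with token_dim left symbolic in the graph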
"kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf19"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf39", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf38"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf40"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf41", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf38"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf42"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf45", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf44"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf46"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf47", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf44"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf48"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf50", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf51"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf72", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf71"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf73"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf74", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf71"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf75"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf78", "node": {"target": "aten::slice_copy.Tensor", 
"inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf77"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf79"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf80", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf77"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf81"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf83", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf84"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf104", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf103"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf105"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf106", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf103"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf107"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf110", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf109"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf111"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf112", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf109"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf113"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf115", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": 
"buf116"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf137", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf136"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf138"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf139", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf136"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf140"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf143", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf142"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf144"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf145", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf142"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf146"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf148", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf149"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf169", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf168"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf170"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf171", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf168"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf172"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf175", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf174"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 
1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf176"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf177", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf174"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf178"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf180", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf181"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf202", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf201"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf203"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf204", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf201"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf205"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf208", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf207"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf209"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf210", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf207"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf211"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf213", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf214"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf234", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": 
{"as_tensor": {"name": "buf233"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf235"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf236", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf233"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf237"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf240", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf239"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf241"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf242", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf239"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf243"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf245", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf246"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf267", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf266"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf268"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf269", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf266"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf270"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf273", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf272"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": 
"buf274"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf275", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf272"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf276"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf278", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf279"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf299", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf298"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf300"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf301", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf298"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf302"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf305", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf304"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf306"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf307", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf304"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf308"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf310", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf311"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf332", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf331"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", 
"arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf333"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf334", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf331"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf335"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf338", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf337"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf339"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf340", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf337"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf341"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf343", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf344"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf364", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf363"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf365"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf366", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf363"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf367"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf370", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf369"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf371"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf372", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": 
{"as_tensor": {"name": "buf369"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf373"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf375", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf376"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf397", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf396"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf398"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf399", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf396"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf400"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf403", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf402"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf404"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf405", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf402"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf406"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf408", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf409"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf429", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf428"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf430"}}], "metadata": {}, 
"is_hop_single_tensor_return": null}}, {"name": "buf431", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf428"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf432"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf435", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf434"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf436"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf437", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf434"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf438"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf440", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf441"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf462", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf461"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf463"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf464", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf461"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf465"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf468", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf467"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf469"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf470", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf467"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": 
{"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf471"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf473", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf474"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf494", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf493"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf495"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf496", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf493"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf497"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf500", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf499"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf501"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf502", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf499"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf503"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf505", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf506"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf527", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf526"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf528"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf529", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf526"}}, 
"kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf530"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf533", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf532"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf534"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf535", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf532"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf536"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf538", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf539"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf559", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf558"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf560"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf561", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf558"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf562"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf565", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf564"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf566"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf567", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf564"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf568"}}], "metadata": {}, "is_hop_single_tensor_return": 
null}}, {"name": "buf570", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf571"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf592", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf591"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf593"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf594", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf591"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf595"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf598", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf597"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf599"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf600", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf597"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf601"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf603", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf604"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf624", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf623"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf625"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf626", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf623"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": 
"step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf627"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf630", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf629"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf631"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf632", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf629"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf633"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf635", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf636"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf657", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf656"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf658"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf659", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf656"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf660"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf663", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf662"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf664"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf665", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf662"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf666"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf668", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": 
{"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf669"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf689", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf688"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf690"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf691", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf688"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf692"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf695", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf694"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf696"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf697", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf694"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf698"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf700", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf701"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf722", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf721"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf723"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf724", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf721"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf725"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf728", "node": 
{"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf727"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf729"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf730", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf727"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf731"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf733", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf734"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf754", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf753"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf755"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf756", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf753"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf757"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf760", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf759"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf761"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf762", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf759"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf763"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf765", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, 
"kind": 1}], "outputs": [{"as_tensor": {"name": "buf766"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf787", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf786"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf788"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf789", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf786"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf790"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf793", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf792"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf794"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf795", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf792"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf796"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf798", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf799"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf819", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf818"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf820"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf821", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf818"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf822"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf825", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf824"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, 
{"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf826"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf827", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf824"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf828"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf830", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf831"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf852", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf851"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf853"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf854", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf851"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf855"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf858", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf857"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf859"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf860", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf857"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf861"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf863", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf864"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf884", "node": {"target": 
"aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf883"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf885"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf886", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf883"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf887"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf890", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf889"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf891"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf892", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf889"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf893"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf895", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf896"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf917", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf916"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf918"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf919", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf916"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf920"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf923", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf922"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 
1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf924"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf925", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf922"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf926"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf928", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf929"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf949", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf948"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf950"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf951", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf948"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf952"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf955", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf954"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf956"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf957", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf954"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf958"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf960", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf961"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf982", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf981"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", 
"arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf983"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf984", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf981"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf985"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf988", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf987"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf989"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf990", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf987"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf991"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf993", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf994"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1014", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1013"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1015"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1016", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1013"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1017"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1020", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1019"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1021"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1022", "node": {"target": 
"aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1019"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1023"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1025", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1026"}}], "metadata": {}, "is_hop_single_tensor_return": null}}]} \ No newline at end of file From cc06edacd6be47ba917757f7d4d7e3825664c0a4 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 4 Sep 2025 16:30:55 -0700 Subject: [PATCH 35/50] add support fallback kernels check --- backends/aoti/aoti_backend.py | 54 ++++- backends/aoti/runtime/shims/utils.cpp | 326 +++++++++++++------------- backends/aoti/runtime/shims/utils.h | 4 +- requirements-dev.txt | 1 - requirements-examples.txt | 2 +- 5 files changed, 218 insertions(+), 169 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index f785da00783..d4d30773fb9 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -4,13 +4,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import contextlib import copy import os import shutil import typing from subprocess import check_call -from typing import final, List +from typing import Any, Dict, final, List, Optional, Set import torch from executorch.exir.backend.backend_details import ( @@ -19,6 +20,48 @@ PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec +from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu + + +# exist fallback operators in et namespace; +supported_fallback_kernels: Dict[str, Any] = {} + +# required fallback kernels but not supported +missing_fallback_kernels: Set[str] = set() + + +# context manager for non-fallback guarantee +# it will raise exception when generating fallback kernels during aoti compile +@contextlib.contextmanager +def raise_on_generate_fall_back_call(): + original_generate_c_shim_extern_kernel_call = ( + CppWrapperCpu.generate_c_shim_extern_kernel_call + ) + + def generate_supported_c_shim_extern_kernel_call( + self, + kernel: str, + args: list[str], + device: str, + *, + debug_args: Optional[list[str]] = None, + ): + if kernel in supported_fallback_kernels: + original_generate_c_shim_extern_kernel_call( + self, kernel, args, device, debug_args=debug_args + ) + else: + missing_fallback_kernels.add(kernel) + + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( + generate_supported_c_shim_extern_kernel_call + ) + try: + yield + finally: + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( + original_generate_c_shim_extern_kernel_call + ) @final @@ -50,7 +93,14 @@ def preprocess( "max_autotune_conv_backends": "TRITON", } - so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] + with raise_on_generate_fall_back_call(): + so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # 
type: ignore[arg-type] + if len(missing_fallback_kernels) > 0: + formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels)) + raise RuntimeError( + f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n" + "Please add them to the AOTI backend." + ) assert so_path == output_path, f"Expected {output_path} but got {so_path}" diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp index e81e141e7fd..a9dc5c84eb7 100644 --- a/backends/aoti/runtime/shims/utils.cpp +++ b/backends/aoti/runtime/shims/utils.cpp @@ -25,169 +25,169 @@ const char* const TENSOR_OUTPUT_FILENAME = extern "C" { -void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg) { - printf("Printing tensor handle: %p\n", self); - - if (!self) { - throw std::runtime_error("Tensor handle is null"); - } - - printf("Tensor handle is not null\n"); - - // Get dtype and check if it's float32 (dtype 6 in PyTorch) - int32_t dtype = 0; - if (aoti_torch_get_dtype(self, &dtype) != AOTI_TORCH_SUCCESS) { - throw std::runtime_error("Failed to get tensor dtype"); - } - - printf("Tensor dtype is: %d\n", dtype); - - if (dtype != 6) { // 6 is the dtype code for float32 in PyTorch - throw std::runtime_error( - "Tensor dtype is not float32. Expected dtype 6, got: " + - std::to_string(dtype)); - } - - printf("Tensor dtype is float32\n"); - - // Get data pointer - void* data_ptr = nullptr; - if (aoti_torch_get_data_ptr(self, &data_ptr) != AOTI_TORCH_SUCCESS || - !data_ptr) { - throw std::runtime_error("Failed to get tensor data pointer"); - } - - printf("Tensor data pointer is %p not null\n", data_ptr); - - // Get dimensions - int64_t dim = 0; - if (aoti_torch_get_dim(self, &dim) != AOTI_TORCH_SUCCESS) { - throw std::runtime_error("Failed to get tensor dimensions"); - } - - printf("Tensor dimensions are: %ld\n", dim); - - // Get sizes - int64_t* sizes = nullptr; - if (aoti_torch_get_sizes(self, &sizes) != AOTI_TORCH_SUCCESS || !sizes) { - throw std::runtime_error("Failed to get tensor sizes"); - } - - printf("Tensor sizes are: %ld\n", sizes); - - // Calculate total number of elements - int64_t total_elements = 1; - for (int i = 0; i < dim; i++) { - total_elements *= sizes[i]; - } - - printf("Total elements in tensor: %ld\n", total_elements); - - // Check device type to handle CUDA tensors properly - int32_t device_type = 0; - if (aoti_torch_get_device_type(self, &device_type) != AOTI_TORCH_SUCCESS) { - throw std::runtime_error("Failed to get tensor device type"); - } - - printf("Tensor device type: %d\n", device_type); - - AtenTensorHandle cpu_tensor = nullptr; - const float* float_data = nullptr; - bool need_cleanup = false; - - // Check if tensor is on CUDA (device_type 1 is CUDA) - if (device_type == 1) { - printf("Tensor is on CUDA, copying to CPU...\n"); - - // Get strides for creating CPU tensor - int64_t* strides = nullptr; - if (aoti_torch_get_strides(self, &strides) != AOTI_TORCH_SUCCESS || - !strides) { - throw std::runtime_error("Failed to get tensor strides"); - } - - // Create a CPU tensor with same shape and layout - if (aoti_torch_empty_strided( - dim, sizes, strides, dtype, 0, -1, &cpu_tensor) != - AOTI_TORCH_SUCCESS) { - throw std::runtime_error("Failed to create CPU tensor"); - } - - // Copy data from CUDA to CPU tensor - if (aoti_torch_copy_(cpu_tensor, self, 0) != AOTI_TORCH_SUCCESS) { - aoti_torch_delete_tensor_object(cpu_tensor); - throw std::runtime_error("Failed to copy tensor from CUDA to CPU"); - } - - // Get CPU data pointer - 
void* cpu_data_ptr = nullptr; - if (aoti_torch_get_data_ptr(cpu_tensor, &cpu_data_ptr) != - AOTI_TORCH_SUCCESS || - !cpu_data_ptr) { - aoti_torch_delete_tensor_object(cpu_tensor); - throw std::runtime_error("Failed to get CPU tensor data pointer"); - } - - float_data = static_cast(cpu_data_ptr); - need_cleanup = true; - printf("Successfully copied CUDA tensor to CPU\n"); - } else { - // Tensor is already on CPU, use original data pointer - printf("Tensor is on CPU, using original data pointer\n"); - float_data = static_cast(data_ptr); - } - - // Open file for writing (append mode to not overwrite previous outputs) - printf("Writing tensor to file: %s\n", internal::TENSOR_OUTPUT_FILENAME); - - std::ofstream output_file( - internal::TENSOR_OUTPUT_FILENAME, std::ios::out | std::ios::app); - if (!output_file.is_open()) { - if (need_cleanup) { - aoti_torch_delete_tensor_object(cpu_tensor); - } - throw std::runtime_error( - "Failed to open output file: " + - std::string(internal::TENSOR_OUTPUT_FILENAME)); - } - - printf("Successfully opened file for writing\n"); - - // Write message and tensor info to file - output_file << "=== " << msg << " ===" << std::endl; - output_file << "Device type: " << device_type << std::endl; - output_file << "Dimensions: " << dim << std::endl; - output_file << "Sizes: ["; - for (int i = 0; i < dim; i++) { - output_file << sizes[i]; - if (i < dim - 1) - output_file << ", "; - } - output_file << "]" << std::endl; - output_file << "Total elements: " << total_elements << std::endl; - output_file << "Data content:" << std::endl; - - // Write tensor data to file (now safe to access) - for (int64_t i = 0; i < total_elements; i++) { - output_file << float_data[i] << " "; - if (i < total_elements - 1) { - output_file << ", "; - // Add newline every 10 elements for readability - if ((i + 1) % 10 == 0) { - output_file << std::endl; - } - } - } - output_file << std::endl << std::endl; - - // Clean up CPU tensor if we created one - if (need_cleanup) { - aoti_torch_delete_tensor_object(cpu_tensor); - printf("Cleaned up temporary CPU tensor\n"); - } - - // File will be automatically closed when output_file goes out of scope -} +// void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg) { +// printf("Printing tensor handle: %p\n", self); + +// if (!self) { +// throw std::runtime_error("Tensor handle is null"); +// } + +// printf("Tensor handle is not null\n"); + +// // Get dtype and check if it's float32 (dtype 6 in PyTorch) +// int32_t dtype = 0; +// if (aoti_torch_get_dtype(self, &dtype) != AOTI_TORCH_SUCCESS) { +// throw std::runtime_error("Failed to get tensor dtype"); +// } + +// printf("Tensor dtype is: %d\n", dtype); + +// if (dtype != 6) { // 6 is the dtype code for float32 in PyTorch +// throw std::runtime_error( +// "Tensor dtype is not float32. 
Expected dtype 6, got: " + +// std::to_string(dtype)); +// } + +// printf("Tensor dtype is float32\n"); + +// // Get data pointer +// void* data_ptr = nullptr; +// if (aoti_torch_get_data_ptr(self, &data_ptr) != AOTI_TORCH_SUCCESS || +// !data_ptr) { +// throw std::runtime_error("Failed to get tensor data pointer"); +// } + +// printf("Tensor data pointer is %p not null\n", data_ptr); + +// // Get dimensions +// int64_t dim = 0; +// if (aoti_torch_get_dim(self, &dim) != AOTI_TORCH_SUCCESS) { +// throw std::runtime_error("Failed to get tensor dimensions"); +// } + +// printf("Tensor dimensions are: %ld\n", dim); + +// // Get sizes +// int64_t* sizes = nullptr; +// if (aoti_torch_get_sizes(self, &sizes) != AOTI_TORCH_SUCCESS || !sizes) { +// throw std::runtime_error("Failed to get tensor sizes"); +// } + +// printf("Tensor sizes are: %ld\n", sizes); + +// // Calculate total number of elements +// int64_t total_elements = 1; +// for (int i = 0; i < dim; i++) { +// total_elements *= sizes[i]; +// } + +// printf("Total elements in tensor: %ld\n", total_elements); + +// // Check device type to handle CUDA tensors properly +// int32_t device_type = 0; +// if (aoti_torch_get_device_type(self, &device_type) != AOTI_TORCH_SUCCESS) { +// throw std::runtime_error("Failed to get tensor device type"); +// } + +// printf("Tensor device type: %d\n", device_type); + +// AtenTensorHandle cpu_tensor = nullptr; +// const float* float_data = nullptr; +// bool need_cleanup = false; + +// // Check if tensor is on CUDA (device_type 1 is CUDA) +// if (device_type == 1) { +// printf("Tensor is on CUDA, copying to CPU...\n"); + +// // Get strides for creating CPU tensor +// int64_t* strides = nullptr; +// if (aoti_torch_get_strides(self, &strides) != AOTI_TORCH_SUCCESS || +// !strides) { +// throw std::runtime_error("Failed to get tensor strides"); +// } + +// // Create a CPU tensor with same shape and layout +// if (aoti_torch_empty_strided( +// dim, sizes, strides, dtype, 0, -1, &cpu_tensor) != +// AOTI_TORCH_SUCCESS) { +// throw std::runtime_error("Failed to create CPU tensor"); +// } + +// // Copy data from CUDA to CPU tensor +// if (aoti_torch_copy_(cpu_tensor, self, 0) != AOTI_TORCH_SUCCESS) { +// aoti_torch_delete_tensor_object(cpu_tensor); +// throw std::runtime_error("Failed to copy tensor from CUDA to CPU"); +// } + +// // Get CPU data pointer +// void* cpu_data_ptr = nullptr; +// if (aoti_torch_get_data_ptr(cpu_tensor, &cpu_data_ptr) != +// AOTI_TORCH_SUCCESS || +// !cpu_data_ptr) { +// aoti_torch_delete_tensor_object(cpu_tensor); +// throw std::runtime_error("Failed to get CPU tensor data pointer"); +// } + +// float_data = static_cast(cpu_data_ptr); +// need_cleanup = true; +// printf("Successfully copied CUDA tensor to CPU\n"); +// } else { +// // Tensor is already on CPU, use original data pointer +// printf("Tensor is on CPU, using original data pointer\n"); +// float_data = static_cast(data_ptr); +// } + +// // Open file for writing (append mode to not overwrite previous outputs) +// printf("Writing tensor to file: %s\n", internal::TENSOR_OUTPUT_FILENAME); + +// std::ofstream output_file( +// internal::TENSOR_OUTPUT_FILENAME, std::ios::out | std::ios::app); +// if (!output_file.is_open()) { +// if (need_cleanup) { +// aoti_torch_delete_tensor_object(cpu_tensor); +// } +// throw std::runtime_error( +// "Failed to open output file: " + +// std::string(internal::TENSOR_OUTPUT_FILENAME)); +// } + +// printf("Successfully opened file for writing\n"); + +// // Write message and tensor info to file +// 
output_file << "=== " << msg << " ===" << std::endl; +// output_file << "Device type: " << device_type << std::endl; +// output_file << "Dimensions: " << dim << std::endl; +// output_file << "Sizes: ["; +// for (int i = 0; i < dim; i++) { +// output_file << sizes[i]; +// if (i < dim - 1) +// output_file << ", "; +// } +// output_file << "]" << std::endl; +// output_file << "Total elements: " << total_elements << std::endl; +// output_file << "Data content:" << std::endl; + +// // Write tensor data to file (now safe to access) +// for (int64_t i = 0; i < total_elements; i++) { +// output_file << float_data[i] << " "; +// if (i < total_elements - 1) { +// output_file << ", "; +// // Add newline every 10 elements for readability +// if ((i + 1) % 10 == 0) { +// output_file << std::endl; +// } +// } +// } +// output_file << std::endl << std::endl; + +// // Clean up CPU tensor if we created one +// if (need_cleanup) { +// aoti_torch_delete_tensor_object(cpu_tensor); +// printf("Cleaned up temporary CPU tensor\n"); +// } + +// // File will be automatically closed when output_file goes out of scope +// } // Function to cleanup the tensor output file (to be called from // aoti_backend.cpp) diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h index c0c2a59be0a..06d2edce212 100644 --- a/backends/aoti/runtime/shims/utils.h +++ b/backends/aoti/runtime/shims/utils.h @@ -18,8 +18,8 @@ namespace aoti { extern "C" { -// Utility function for printing tensor information -void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg); +// // Utility function for printing tensor information +// void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg); // Cleanup function for tensor output file (called during backend destruction) void cleanup_aoti_tensor_output(); diff --git a/requirements-dev.txt b/requirements-dev.txt index 964bdecef76..8c8f518a5ea 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,4 +10,3 @@ certifi # Imported by resolve_buck.py. lintrunner==0.12.7 lintrunner-adapters==0.12.6 patchelf -transformers diff --git a/requirements-examples.txt b/requirements-examples.txt index 0923cf8fefc..26ac1ad9279 100644 --- a/requirements-examples.txt +++ b/requirements-examples.txt @@ -4,4 +4,4 @@ datasets == 3.6.0 # 4.0.0 deprecates trust_remote_code and load scripts. 
For now timm == 1.0.7 torchsr == 1.0.4 torchtune >= 0.6.1 -transformers == 4.53.1 +transformers == 4.52.4 From ef191c4c3cd177e55c78b599dfaba058da3c2449 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 4 Sep 2025 17:03:17 -0700 Subject: [PATCH 36/50] collect missing kernels while always generated fallback --- backends/aoti/aoti_backend.py | 18 ++-- backends/aoti/aoti_partitioner.py | 164 ------------------------------ export_aoti.py | 8 +- 3 files changed, 13 insertions(+), 177 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index d4d30773fb9..e6244ae9346 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -33,12 +33,12 @@ # context manager for non-fallback guarantee # it will raise exception when generating fallback kernels during aoti compile @contextlib.contextmanager -def raise_on_generate_fall_back_call(): +def collect_unsupported_fallback_kernels(): original_generate_c_shim_extern_kernel_call = ( CppWrapperCpu.generate_c_shim_extern_kernel_call ) - def generate_supported_c_shim_extern_kernel_call( + def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels( self, kernel: str, args: list[str], @@ -46,15 +46,15 @@ def generate_supported_c_shim_extern_kernel_call( *, debug_args: Optional[list[str]] = None, ): - if kernel in supported_fallback_kernels: - original_generate_c_shim_extern_kernel_call( - self, kernel, args, device, debug_args=debug_args - ) - else: + if kernel not in supported_fallback_kernels: missing_fallback_kernels.add(kernel) + original_generate_c_shim_extern_kernel_call( + self, kernel, args, device, debug_args=debug_args + ) + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( - generate_supported_c_shim_extern_kernel_call + generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels ) try: yield @@ -93,7 +93,7 @@ def preprocess( "max_autotune_conv_backends": "TRITON", } - with raise_on_generate_fall_back_call(): + with collect_unsupported_fallback_kernels(): so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] if len(missing_fallback_kernels) > 0: formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels)) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index 6dfe888fec8..6aeb63f959d 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -24,170 +24,6 @@ from torch.fx.passes.operator_support import OperatorSupportBase -# exist fallback operators in et namespace; should map to inductor_fallback_ops -supported_fallback_operators: Dict[str, Dict[str, List[str]]] = {} - -inductor_fallback_ops: Set[str] = { - "aten._adaptive_avg_pool2d_backward.default", - "aten._adaptive_avg_pool2d.default", - "aten._adaptive_avg_pool3d_backward.default", - "aten._adaptive_avg_pool3d.default", - "aten._addmm_activation.default", - "aten._cdist_backward.default", - "aten._cdist_forward.default", - "aten._cudnn_rnn.default", - "aten._dyn_quant_matmul_4bit.default", - "aten._dyn_quant_pack_4bit_weight.default", - "aten._efficient_attention_backward.default", - "aten._efficient_attention_forward.default", - "aten._efficientzerotensor.default", - "aten._embedding_bag_dense_backward.default", - "aten._embedding_bag_forward_only.default", - "aten._embedding_bag_per_sample_weights_backward.default", - "aten._embedding_bag.default", - "aten._fft_c2c.default", - "aten._fft_r2c.default", - "aten._flash_attention_backward.default", - 
"aten._flash_attention_forward.default", - "aten._fused_moving_avg_obs_fq_helper_functional.default", - "aten._fused_moving_avg_obs_fq_helper.default", - "aten._fused_rms_norm.default", - "aten._histogramdd_from_bin_cts.default", - "aten._int_mm.out", - "aten._pdist_backward.default", - "aten._pdist_forward.default", - "aten._scaled_dot_product_attention_math_for_mps.default", - "aten._scaled_dot_product_cudnn_attention_backward.default", - "aten._scaled_dot_product_cudnn_attention.default", - "aten._scaled_dot_product_efficient_attention_backward.default", - "aten._scaled_dot_product_efficient_attention.default", - "aten._scaled_dot_product_flash_attention_backward.default", - "aten._scaled_dot_product_flash_attention_for_cpu_backward.default", - "aten._scaled_dot_product_flash_attention_for_cpu.default", - "aten._scaled_dot_product_flash_attention.default", - "aten._scaled_dot_product_fused_attention_overrideable_backward.default", - "aten._scaled_dot_product_fused_attention_overrideable.default", - "aten._scaled_mm.default", - "aten._scaled_mm.out", - "aten._segment_reduce_backward.default", - "aten._thnn_fused_lstm_cell.default", - "aten._to_sparse.default", - "aten._trilinear.default", - "aten._weight_int4pack_mm.default", - "aten._weight_int8pack_mm.default", - "aten.abs.default", - "aten.adaptive_max_pool2d_backward.default", - "aten.adaptive_max_pool2d.default", - "aten.adaptive_max_pool3d_backward.default", - "aten.adaptive_max_pool3d.default", - "aten.add.Scalar", - "aten.add.Tensor", - "aten.addbmm.default", - "aten.addmm.out", - "aten.addmv.default", - "aten.angle.default", - "aten.avg_pool2d_backward.default", - "aten.avg_pool2d.default", - "aten.avg_pool3d_backward.default", - "aten.avg_pool3d.default", - "aten.baddbmm.out", - "aten.bernoulli_.float", - "aten.bernoulli_.Tensor", - "aten.bmm.out", - "aten.bucketize.Tensor", - "aten.cat.default", - "aten.cholesky_inverse.default", - "aten.cholesky_solve.default", - "aten.convolution_backward.default", - "aten.convolution.default", - "aten.cummax.default", - "aten.cummin.default", - "aten.cumprod.default", - "aten.cumsum.default", - "aten.exponential.default", - "aten.fill_.Scalar", - "aten.fractional_max_pool2d_backward.default", - "aten.fractional_max_pool2d.default", - "aten.fractional_max_pool3d_backward.default", - "aten.fractional_max_pool3d.default", - "aten.gcd.default", - "aten.geqrf.default", - "aten.grid_sampler_2d_backward.default", - "aten.hann_window.default", - "aten.histc.default", - "aten.histogram.bin_ct", - "aten.index_put.default", - "aten.index_reduce.default", - "aten.index.Tensor", - "aten.kthvalue.default", - "aten.logcumsumexp.default", - "aten.lu_unpack.default", - "aten.masked_scatter_backward.default", - "aten.masked_scatter.default", - "aten.masked_select.default", - "aten.max_pool2d_with_indices_backward.default", - "aten.max_pool2d_with_indices.default", - "aten.max_pool3d_with_indices_backward.default", - "aten.max_pool3d_with_indices.default", - "aten.max_unpool2d.default", - "aten.max_unpool3d.default", - "aten.median.default", - "aten.mm.out", - "aten.mode.default", - "aten.mul.Scalar", - "aten.mul.Tensor", - "aten.nanmedian.default", - "aten.narrow.default", - "aten.native_dropout.default", - "aten.nonzero.default", - "aten.normal_functional.default", - "aten.ormqr.default", - "aten.pad.default", - "aten.permute.default", - "aten.polar.default", - "aten.pow.Scalar", - "aten.pow.Tensor_Scalar", - "aten.pow.Tensor_Tensor", - "aten.rand.default", - "aten.rand.generator", - "aten.randint.default", 
- "aten.randint.generator", - "aten.randint.low_out", - "aten.randint.low", - "aten.randn.default", - "aten.randn.generator", - "aten.randperm.default", - "aten.repeat_interleave.Tensor", - "aten.replication_pad1d_backward.default", - "aten.replication_pad2d_backward.default", - "aten.reshape.default", - "aten.resize_.default", - "aten.resize_as_.default", - "aten.scatter_reduce.two_out", - "aten.scatter.src_out", - "aten.scatter.value_out", - "aten.searchsorted.Scalar", - "aten.searchsorted.Tensor", - "aten.segment_reduce.default", - "aten.set_.source_Tensor", - "aten.slice.Tensor", - "aten.soft_margin_loss_backward.default", - "aten.sort.default", - "aten.sort.stable", - "aten.squeeze.dim", - "aten.to_sparse.default", - "aten.topk.default", - "aten.triangular_solve.default", - "aten.uniform.default", - "aten.upsample_bicubic2d_backward.default", - "aten.upsample_linear1d_backward.default", - "aten.upsample_trilinear3d_backward.default", - "aten.view_as_complex.default", - "aten.view_as_real.default", - "aten.view.dtype", - "aten._weight_int4pack_mm_with_scales_and_zeros.default", -} - class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: diff --git a/export_aoti.py b/export_aoti.py index c1c24d212ef..d720086ac0f 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -167,15 +167,15 @@ def forward(self, x): class Llama31(torch.nn.Module): - def __init__(self, model_id="meta-llama/Meta-Llama-3.1-8B"): + def __init__(self, model_id="meta-llama/Meta-Llama-3.1-8B", use_cache=False): super(Llama31, self).__init__() # Load Llama 3.1 model from HF + self.use_cache = use_cache self.model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float32, device_map="cuda", - # trust_remote_code=True, - use_cache=False, # Turn off KV cache + use_cache=self.use_cache, # Turn off KV cache ) self.model.eval() @@ -185,7 +185,7 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, - use_cache=False, # Explicitly turn off KV cache + use_cache=self.use_cache, # Explicitly turn off KV cache ) return outputs.logits From 62fbd92df41c76dc2dea24a69238cc484d5cee67 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 9 Sep 2025 16:32:38 -0700 Subject: [PATCH 37/50] use ptd pipeline on .so file --- CMakeLists.txt | 6 + backends/aoti/aoti_backend.py | 15 +- backends/aoti/runtime/aoti_backend.cpp | 28 +++- .../executor_runner/executor_runner.cpp | 43 +++++- examples/portable/executor_runner/targets.bzl | 2 + export_and_run_aoti.sh | 6 +- export_aoti.py | 135 +++++++++++++++--- 7 files changed, 209 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f98246e1851..e3debc9fcf5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,6 +107,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_SKIP_BUILD_RPATH OFF) # Don't use the install-rpath during the build phase set(CMAKE_BUILD_WITH_INSTALL_RPATH ON) + # Automatically add all linked folders that are NOT in the build directory to # the rpath (per library?) 
#
@@ -1014,6 +1015,11 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
       extension_runner_util gflags executorch_backends
   )
+  # Add flat tensor extension if it's built
+  if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+    list(APPEND _executor_runner_libs extension_flat_tensor)
+  endif()
+
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   elseif(EXECUTORCH_BUILD_CADENCE)
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index e6244ae9346..986aa938888 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -7,13 +7,13 @@
 import contextlib
 import copy
 import os
-import shutil
 import typing
 from subprocess import check_call
 from typing import Any, Dict, final, List, Optional, Set

 import torch
+from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     ExportedProgram,
@@ -72,6 +72,7 @@ def preprocess(
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
         print("entering the lowerable parts in AotiBackend.preprocess....")
+        named_data_store = NamedDataStore()

         # print("here", edge_program.example_inputs)
         copy_edge_program = copy.deepcopy(edge_program)
@@ -88,6 +89,7 @@ def preprocess(
         options: dict[str, typing.Any] = {
             "aot_inductor.package_constants_in_so": True,
             "aot_inductor.output_path": output_path,
+            "aot_inductor.force_mmap_weights": False,
             "max_autotune": True,
             "max_autotune_gemm_backends": "TRITON",
             "max_autotune_conv_backends": "TRITON",
@@ -111,4 +113,13 @@ def preprocess(

         print("so_path", so_path)

-        return PreprocessResult(so_path.encode("utf-8"))
+        with open(so_path, "rb") as f:
+            so_data = f.read()
+
+        named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
+
+        return PreprocessResult(
+            processed_bytes=b"",
+            debug_handle_map={},
+            data_store_output=named_data_store.get_named_data_store_output(),
+        )
diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp
index 03c46c03bdd..24d935d579e 100644
--- a/backends/aoti/runtime/aoti_backend.cpp
+++ b/backends/aoti/runtime/aoti_backend.cpp
@@ -51,6 +51,7 @@ using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::FreeableBuffer;
 using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
@@ -69,15 +70,34 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
   // Once per loaded binary blob
   Result<DelegateHandle*> init(
       BackendInitContext& context,
-      FreeableBuffer* processed, // This will be the buffer from aoti_backend
+      FreeableBuffer* processed, // This will be an empty buffer
       ArrayRef<CompileSpec> compile_specs // This will be my empty list
   ) const override {
-    const char* so_path = static_cast<const char*>(processed->data());
+    // const char* so_path = static_cast<const char*>(processed->data());

-    printf("so path: %s\n", so_path);
+    // printf("so path: %s\n", so_path);
+
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+
+    std::string so_path = "/tmp/test.so";
+    std::string so_blob_key = "so_blob";
+
+    Result<FreeableBuffer> aoti_cuda_buffer =
+        named_data_map->get_data(aoti_cuda_blob_name.c_str());
+
+    // Create a temporary file
+    std::ofstream outfile(so_path.c_str(), std::ios::binary);
+
+    // Write the ELF buffer to the temporary file; size() is already a byte
+    // count, so it must not be scaled by sizeof(void*)
+    outfile.write(
+        (char*)aoti_cuda_buffer->data(),
+        aoti_cuda_buffer->size());
+
+    // Finish writing the file to disk
+    outfile.close();

     // Load the ELF using dlopen
-    void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
+    void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
     if (so_handle == nullptr) {
       std::cout << dlerror() << std::endl;
       return Error::AccessFailed;
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 4a4b659c748..37029d150b8 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -26,6 +26,7 @@
 #include
 #include
+#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include
 #include
 #include
@@ -50,6 +51,10 @@ DEFINE_string(
     model_path,
     "model.pte",
     "Model serialized in flatbuffer format.");
+DEFINE_string(
+    data_path,
+    "",
+    "Path to external tensor data file (.ptd format). Optional.");
 DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
 #ifdef ET_EVENT_TRACER_ENABLED
 DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
@@ -60,6 +65,7 @@ DEFINE_int32(
     "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

 using executorch::extension::FileDataLoader;
+using executorch::extension::FlatTensorDataMap;
 using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::EventTracer;
@@ -242,8 +248,43 @@ int main(int argc, char** argv) {
   // be used by a single thread at a time, but it can be reused.
   //
   EventTraceManager tracer;
+
+  // Handle optional external tensor data loading
+  std::unique_ptr<FileDataLoader> data_loader;
+  std::unique_ptr<FlatTensorDataMap> data_map;
+
+  if (!FLAGS_data_path.empty()) {
+    ET_LOG(
+        Info, "Loading external tensor data from %s", FLAGS_data_path.c_str());
+
+    // Create FileDataLoader for the PTD file
+    Result<FileDataLoader> data_loader_result =
+        FileDataLoader::from(FLAGS_data_path.c_str());
+    ET_CHECK_MSG(
+        data_loader_result.ok(),
+        "Failed to create FileDataLoader for data path %s: 0x%" PRIx32,
+        FLAGS_data_path.c_str(),
+        (uint32_t)data_loader_result.error());
+
+    data_loader =
+        std::make_unique<FileDataLoader>(std::move(data_loader_result.get()));
+
+    // Create FlatTensorDataMap from the loaded blob
+    Result<FlatTensorDataMap> data_map_result =
+        FlatTensorDataMap::load(data_loader.get());
+    ET_CHECK_MSG(
+        data_map_result.ok(),
+        "Failed to load FlatTensorDataMap from %s: 0x%" PRIx32,
+        FLAGS_data_path.c_str(),
+        (uint32_t)data_map_result.error());
+
+    data_map =
+        std::make_unique<FlatTensorDataMap>(std::move(data_map_result.get()));
+    ET_LOG(Info, "External tensor data loaded successfully");
+  }
+
   Result<Method> method = program->load_method(
-      method_name, &memory_manager, tracer.get_event_tracer());
+      method_name, &memory_manager, tracer.get_event_tracer(), data_map.get());
   ET_CHECK_MSG(
       method.ok(),
       "Loading of method %s failed with status 0x%" PRIx32,
diff --git a/examples/portable/executor_runner/targets.bzl b/examples/portable/executor_runner/targets.bzl
index 0af45d85075..d1304a84bcb 100644
--- a/examples/portable/executor_runner/targets.bzl
+++ b/examples/portable/executor_runner/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets():
             "//executorch/devtools/etdump:etdump_flatcc",
             "//executorch/extension/data_loader:file_data_loader",
             "//executorch/extension/evalue_util:print_evalue",
+            "//executorch/extension/flat_tensor:flat_tensor_data_map",
             "//executorch/extension/runner_util:inputs",
         ],
         external_deps = [
@@ -38,6 +39,7 @@ def define_common_targets():
             "//executorch/runtime/executor:program",
             "//executorch/extension/data_loader:file_data_loader",
"//executorch/extension/evalue_util:print_evalue", + "//executorch/extension/flat_tensor:flat_tensor_data_map", "//executorch/extension/runner_util:inputs", "//executorch/extension/threadpool:cpuinfo_utils", "//executorch/extension/threadpool:threadpool", diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index cb5595fb8b5..a971df35b13 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -141,6 +141,8 @@ build_runtime() { -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_LOG_LEVEL=Debug \ -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ .. else echo "Building with release configuration..." @@ -149,6 +151,8 @@ build_runtime() { -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ .. fi @@ -158,7 +162,7 @@ build_runtime() { run_inference() { echo "Running executor_runner with debug logging enabled..." - ./cmake-out/executor_runner --model_path aoti_model.pte + ./cmake-out/executor_runner --model_path aoti_model.pte --data_path aoti_cuda_blob.ptd } compare_outputs() { diff --git a/export_aoti.py b/export_aoti.py index d720086ac0f..8be26d0d258 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -29,10 +29,11 @@ from executorch.exir import to_edge, to_edge_transform_and_lower from torch import nn from torch.export import export +from torch.nn.attention import SDPBackend from torchvision import models from torchvision.models.mobilenetv2 import MobileNet_V2_Weights from torchvision.models.resnet import ResNet18_Weights -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, WhisperModel # for maintaing precision of 32-bit float as much as possible @@ -190,6 +191,74 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): return outputs.logits +class Whisper(torch.nn.Module): + def __init__(self, model_name="openai/whisper-tiny"): + super(Whisper, self).__init__() + # 1. 
Load pre-trained Whisper model (tiny version is lightweight) + self.model = WhisperModel.from_pretrained(model_name) + self.model.eval() + + def forward(self, input_features: torch.Tensor): + outputs = self.model.encoder(input_features=input_features) + + # Return both encoder and decoder hidden states for compatibility + return outputs.last_hidden_state + + +class MockConv1d(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + in_channels=80, + out_channels=384, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + + +class TransformerBlock(nn.Module): + def __init__(self, embed_dim=256, num_heads=8, ff_dim=1024, dropout=0.1): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + + # Multi-head self-attention + self.self_attn = nn.MultiheadAttention( + embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, batch_first=True + ) + + # Layer normalization layers + self.norm1 = nn.LayerNorm(embed_dim) + self.norm2 = nn.LayerNorm(embed_dim) + + # Feed-forward network + self.ffn = nn.Sequential( + nn.Linear(embed_dim, ff_dim), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(ff_dim, embed_dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + # Self-attention block with residual connection + attn_output, _ = self.self_attn(x, x, x) + x = self.norm1(x + attn_output) + + # Feed-forward block with residual connection + ff_output = self.ffn(x) + x = self.norm2(x + ff_output) + + return x + + # Model registry mapping model names to their configurations MODEL_REGISTRY: Dict[str, Dict[str, Any]] = { "mv2": { @@ -246,6 +315,24 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): "device": "cuda", "description": "Llama 3.1 model with KV cache disabled", }, + "whisper": { + "model_class": Whisper, + "input_shapes": [(1, 80, 3000)], + "device": "cuda", + "description": "OpenAI Whisper ASR model. now is encoder only", + }, + "conv1d": { + "model_class": MockConv1d, + "input_shapes": [(1, 80, 3000)], + "device": "cuda", + "description": "Conv1d layer with 80 input channels, 384 output channels", + }, + "transformer_block": { + "model_class": TransformerBlock, + "input_shapes": [(4, 32, 256)], # batch_size=4, seq_len=32, embed_dim=256 + "device": "cuda", + "description": "Single transformer block with multi-head attention and feed-forward network", + }, } @@ -253,7 +340,7 @@ def get_model_and_inputs( model_name: str, ) -> Tuple[torch.nn.Module, Tuple[torch.Tensor, ...]]: """Get model and example inputs based on model name.""" - + # if model_name not in MODEL_REGISTRY: available_models = ", ".join(MODEL_REGISTRY.keys()) raise ValueError( @@ -281,7 +368,9 @@ def get_model_and_inputs( return model, example_inputs -def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.pte"): +def export_model_to_et_aoti( + model, example_inputs, output_pte_path="aoti_model.pte", output_data_dir=None +): """Export model through the AOTI pipeline.""" all_one_input = tuple( torch.ones_like(example_input) for example_input in example_inputs @@ -309,23 +398,24 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p print(f"Starting export process...") - # 1. torch.export: Defines the program with the ATen operator set. 
print("Step 1: Converting to ATen dialect...") - with torch.no_grad(): - # from torch.export._trace import _export + with torch.nn.attention.sdpa_kernel( + [SDPBackend.MATH] # pyre-fixme[16] + ), torch.no_grad(): + # 1. torch.export: Defines the program with the ATen operator set. aten_dialect = export(model, example_inputs, strict=False) - # print(aten_dialect) - # exit(0) + # print(aten_dialect) + # exit(0) - # 2. to_edge: Make optimizations for Edge devices - # aoti part should be decomposed by the internal torch._inductor.aot_compile - # we should preserve the lowerable part and waiting for aoti backend handle that - # Q: maybe need to turn on fallback_random? + # 2. to_edge: Make optimizations for Edge devices + # aoti part should be decomposed by the internal torch._inductor.aot_compile + # we should preserve the lowerable part and waiting for aoti backend handle that + # Q: maybe need to turn on fallback_random? - edge_program = to_edge_transform_and_lower( - aten_dialect, partitioner=[AotiPartitioner([])] - ) + edge_program = to_edge_transform_and_lower( + aten_dialect, partitioner=[AotiPartitioner([])] + ) # edge_program = to_edge(aten_dialect) @@ -337,11 +427,20 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p print("To executorch done.") # 4. Save the compiled .pte program - print(f"Step 5: Saving to {output_filename}...") - with open(output_filename, "wb") as file: + if output_data_dir is None: + output_data_dir = os.getcwd() + + print(f"Step 5: Saving pte to {output_pte_path} and ptd to {output_data_dir}") + with open(output_pte_path, "wb") as file: file.write(executorch_program.buffer) - print(f"Export completed successfully! Output saved to {output_filename}") + print(f"size of Named Data: {len(executorch_program._tensor_data)}") + + executorch_program.write_tensor_data_to_file(output_data_dir) + + print( + f"Export completed successfully! 
PTE saved to {output_pte_path} and ptd saved to {output_data_dir}" + ) def export_model_to_pure_aoti(model, example_inputs): From 057f1fad13d3ab92294b7112ab65498cac3c377f Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 10 Sep 2025 00:22:42 -0700 Subject: [PATCH 38/50] use cpu model as input --- backends/aoti/aoti_backend.py | 20 ++++++++++++++++++++ backends/aoti/runtime/aoti_backend.cpp | 2 +- exir/program/_program.py | 2 +- export_aoti.py | 14 +------------- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index 986aa938888..a64bb9c5cc5 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -21,6 +21,7 @@ ) from executorch.exir.backend.compile_spec_schema import CompileSpec from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu +from torch.export.passes import move_to_device_pass # exist fallback operators in et namespace; @@ -71,14 +72,33 @@ def preprocess( edge_program: ExportedProgram, compile_specs: List[CompileSpec], ) -> PreprocessResult: + print("entering the lowerable parts in AotiBackend.preprocess....") named_data_store = NamedDataStore() # print("here", edge_program.example_inputs) copy_edge_program = copy.deepcopy(edge_program) + + # Move the edge_program from CPU to CUDA using move_to_device_pass + copy_edge_program = move_to_device_pass(copy_edge_program, "cuda") # graph_module = copy_edge_program.graph_module edge_program_module = copy_edge_program.module() args, kwargs = copy_edge_program.example_inputs + + # Deep copy args and move tensors to CUDA for aot_compile + def move_to_cuda(obj): + if isinstance(obj, torch.Tensor): + return obj.cuda() + elif isinstance(obj, (list, tuple)): + return type(obj)(move_to_cuda(item) for item in obj) + elif isinstance(obj, dict): + return {key: move_to_cuda(value) for key, value in obj.items()} + else: + return obj + + args = move_to_cuda(copy.deepcopy(args)) + kwargs = move_to_cuda(copy.deepcopy(kwargs)) + # print("args, kwargs", args, kwargs) print("len(args)", len(args)) print("args[0].shape", args[0].shape) diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 24d935d579e..6ccd099da0f 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -83,7 +83,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { std::string so_blob_key = "so_blob"; Result aoti_cuda_buffer = - named_data_map->get_data(aoti_cuda_blob_name.c_str()); + named_data_map->get_data(so_blob_key.c_str()); // Create a temporary file std::ofstream outfile(so_path.c_str(), std::ios::binary); diff --git a/exir/program/_program.py b/exir/program/_program.py index 760056e32bb..e3ada9301b7 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1707,7 +1707,7 @@ def exported_program_to_device(exported_program, device): execution_programs: Dict[str, ExportedProgram] = {} for name, program in self._edge_programs.items(): - program = exported_program_to_device(program, "cpu") + # program = exported_program_to_device(program, "cpu") if config.do_quant_fusion_and_const_prop: if program.graph_signature.backward_signature is not None: raise Exception( diff --git a/export_aoti.py b/export_aoti.py index 8be26d0d258..e644177568c 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -264,73 +264,61 @@ def forward(self, x): "mv2": { "model_class": MV2, "input_shapes": [(1, 3, 224, 224)], - "device": "cuda", "description": "MobileNetV2 model", 
},
     "resnet18": {
         "model_class": ResNet18,
         "input_shapes": [(1, 3, 224, 224)],
-        "device": "cuda",
         "description": "ResNet18 model",
     },
     "linear": {
         "model_class": Linear,
         "input_shapes": [(127, 7)],
-        "device": "cuda",
         "description": "Simple linear layer model",
     },
     "conv2d": {
         "model_class": SingleConv2d,
         "input_shapes": [(4, 3, 8, 8)],
-        "device": "cuda",
         "description": "Single Conv2d layer model",
     },
     "depthwise_conv": {
         "model_class": DepthwiseConv,
         "input_shapes": [(1, 32, 112, 112)],
-        "device": "cuda",
         "description": "Single Depthwise Conv2d layer model",
     },
     "add": {
         "model_class": Add,
         "input_shapes": [(10,), (10,)],
-        "device": "cuda",
         "description": "Simple tensor addition model",
     },
     "batchnorm": {
         "model_class": BatchNorm,
         "input_shapes": [(1, 16, 32, 32)],
-        "device": "cuda",
         "description": "Single BatchNorm2d layer model",
     },
     "single_resnet_block": {
         "model_class": SingleResNetBlock,
         "input_shapes": [(1, 64, 8, 8)],
-        "device": "cuda",
         "description": "Single ResNet block with skip connection",
     },
     "llama31": {
         "model_class": Llama31,
         "input_shapes": [(1, 32)],  # batch_size=1, sequence_length=128
-        "device": "cuda",
         "description": "Llama 3.1 model with KV cache disabled",
     },
     "whisper": {
         "model_class": Whisper,
         "input_shapes": [(1, 80, 3000)],
-        "device": "cuda",
         "description": "OpenAI Whisper ASR model (encoder only for now)",
     },
     "conv1d": {
         "model_class": MockConv1d,
         "input_shapes": [(1, 80, 3000)],
-        "device": "cuda",
         "description": "Conv1d layer with 80 input channels, 384 output channels",
     },
     "transformer_block": {
         "model_class": TransformerBlock,
         "input_shapes": [(4, 32, 256)],  # batch_size=4, seq_len=32, embed_dim=256
-        "device": "cuda",
         "description": "Single transformer block with multi-head attention and feed-forward network",
     },
 }
@@ -350,7 +338,7 @@ def get_model_and_inputs(
     model_config = MODEL_REGISTRY[model_name]
     model_class = model_config["model_class"]
     input_shapes = model_config["input_shapes"]
-    device = model_config["device"]
+    device = "cpu"
 
     # Create model instance
     model = model_class().to(device).eval()

From 034359affaf9ae69f97b0f986b86fa00e3205b40 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 10 Sep 2025 21:57:38 -0700
Subject: [PATCH 39/50] remove mis-introduced libtorch header

---
 backends/aoti/runtime/aoti_backend.cpp | 7 -------
 backends/aoti/runtime/shims/memory.cpp | 1 -
 backends/aoti/runtime/shims/memory.h   | 1 -
 backends/aoti/runtime/shims/types.h    | 1 -
 4 files changed, 10 deletions(-)

diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp
index 6ccd099da0f..efb96c2f363 100644
--- a/backends/aoti/runtime/aoti_backend.cpp
+++ b/backends/aoti/runtime/aoti_backend.cpp
@@ -31,9 +31,6 @@
 #include "shims/tensor_attribute.h"
 #include "shims/utils.h"
 
-// Include CUDA AOTI shims
-#include
-
 namespace executorch {
 namespace backends {
 namespace aoti {
@@ -73,10 +70,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
       FreeableBuffer* processed, // This will be a empty buffer
       ArrayRef compile_specs // This will be my empty list
       ) const override {
-    // const char* so_path = static_cast(processed->data());
-
-    // printf("so path: %s\n", so_path);
-
     const NamedDataMap* named_data_map = context.get_named_data_map();
 
     std::string so_path = "/tmp/test.so";
diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp
index cbf52932268..afe13fe8616 100644
--- a/backends/aoti/runtime/shims/memory.cpp
+++ b/backends/aoti/runtime/shims/memory.cpp
@@ -61,7 +61,6 @@ bool
is_tensor_channels_last( (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); } -// Version 2: For use with ExecutorTorch tensors (int32_t sizes) // Check if tensor is in contiguous memory format (NCHW for 4D tensors) bool is_tensor_contiguous( int64_t ndim, diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 57058397972..87639d9d8e4 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -8,7 +8,6 @@ #pragma once -#include #include #include #include diff --git a/backends/aoti/runtime/shims/types.h b/backends/aoti/runtime/shims/types.h index 312d05a4d33..27b4394d1b6 100644 --- a/backends/aoti/runtime/shims/types.h +++ b/backends/aoti/runtime/shims/types.h @@ -11,7 +11,6 @@ #include #include #include -#include #include namespace executorch { From bc559a6664726bb2af067499df770406e69bad0b Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 10 Sep 2025 22:14:09 -0700 Subject: [PATCH 40/50] remove unnecessary cuda stream functions --- backends/aoti/runtime/aoti_backend.cpp | 32 +--------------- backends/aoti/runtime/shims/memory.cpp | 43 --------------------- backends/aoti/runtime/shims/memory.h | 53 +++++++++++--------------- 3 files changed, 24 insertions(+), 104 deletions(-) diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index efb96c2f363..6160670042b 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -288,19 +287,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "AOTIBackend output generated"); - // Create a CUDA stream for this execution - cudaStream_t cuda_stream; - cudaError_t stream_err = cudaStreamCreate(&cuda_stream); - if (stream_err != cudaSuccess) { - ET_LOG( - Error, - "Failed to create CUDA stream: %s", - cudaGetErrorString(stream_err)); - return Error::Internal; - } - - ET_LOG(Debug, "Created CUDA stream: %p", cuda_stream); - // Run AOTI container with GPU tensors AOTIRuntimeError error = AOTInductorModelContainerRun( handle->container_handle, @@ -308,7 +294,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { n_inputs, gpu_outputs.data(), // Use GPU output tensors n_outputs, - cuda_stream, // Pass the actual CUDA stream! + nullptr, // Pass the actual CUDA stream! 
nullptr); // proxy_executor_handle can remain nullptr if (error != Error::Ok) { @@ -321,18 +307,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "AOTIBackend running done"); - // Synchronize the CUDA stream to ensure kernels complete - cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); - if (sync_err != cudaSuccess) { - ET_LOG( - Error, - "Failed to synchronize CUDA stream: %s", - cudaGetErrorString(sync_err)); - return Error::Internal; - } - - ET_LOG(Debug, "CUDA stream synchronized"); - // Copy GPU output results back to CPU output tensors for (int i = 0; i < n_outputs; i++) { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); @@ -356,10 +330,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { aoti_torch_delete_tensor_object(gpu_outputs[i]); } - // Destroy the CUDA stream - cudaStreamDestroy(cuda_stream); - ET_LOG(Debug, "CUDA stream destroyed and GPU tensors cleaned up"); - ET_LOG(Debug, "AOTIBackend execution completed successfully"); return Error::Ok; diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index afe13fe8616..2b03468ddb3 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -639,49 +639,6 @@ AOTITorchError aoti_torch_copy_( return Error::Ok; } -AOTITorchError aoti_torch_create_cuda_stream_guard( - void* stream, - int32_t device_index, - CUDAStreamGuardHandle* ret_guard) { - std::cout << "Entering stream guard for device " << device_index - << " with stream " << stream << std::endl; - - // Set device - cudaError_t err = cudaSetDevice(device_index); - if (err != cudaSuccess) { - std::cerr << "Failed to set device " << device_index << ": " - << cudaGetErrorString(err) << std::endl; - return Error::Internal; - } - - // Create minimal guard structure - CUDAStreamGuardOpaque* guard = new CUDAStreamGuardOpaque(); - guard->device_index = device_index; - guard->original_stream = static_cast(stream); - guard->sync_event = nullptr; - - std::cout << "Stream guard created successfully for stream " << stream - << std::endl; - - *ret_guard = guard; - return Error::Ok; -} - -AOTITorchError aoti_torch_delete_cuda_stream_guard( - CUDAStreamGuardHandle guard) { - std::cout << "Exiting stream guard" << std::endl; - - if (guard == nullptr) { - return Error::Ok; - } - - // Clean up the guard structure - delete guard; - - std::cout << "Stream guard cleanup completed" << std::endl; - return Error::Ok; -} - AOTITorchError aoti_torch__reinterpret_tensor( AOTITensorHandle self, int64_t ndim, diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 87639d9d8e4..e0a83109932 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -26,30 +26,30 @@ extern std::unordered_map is_tensor_own_memory; extern std::unordered_set> tensors; // Memory-related operations -// AOTITorchError aoti_torch_create_tensor_from_blob_v2( -// void* data, -// int64_t ndim, -// const int64_t* sizes_ptr, -// const int64_t* strides_ptr, -// int64_t storage_offset, -// int32_t dtype, -// int32_t device_type, -// int32_t device_index, -// AOTITensorHandle* ret_new_tensor, -// int32_t layout, -// const uint8_t* opaque_metadata, -// int64_t opaque_metadata_size); +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t 
device_index, + AOTITensorHandle* ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size); -// AOTITorchError aoti_torch_create_tensor_from_blob( -// void* data, -// int64_t ndim, -// const int64_t* sizes_ptr, -// const int64_t* strides_ptr, -// int64_t storage_offset, -// int32_t dtype, -// int32_t device_type, -// int32_t device_index, -// AOTITensorHandle* ret_new_tensor); +AOTITorchError aoti_torch_create_tensor_from_blob( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + AOTITensorHandle* ret_new_tensor); AOTITorchError aoti_torch_empty_strided( int64_t ndim, @@ -67,13 +67,6 @@ AOTITorchError aoti_torch_copy_( AOTITensorHandle src, int32_t non_blocking); -AOTITorchError aoti_torch_create_cuda_stream_guard( - void* stream, - int32_t device_index, - CUDAStreamGuardHandle* ret_guard); - -AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); - AOTITorchError aoti_torch__reinterpret_tensor( AOTITensorHandle self, int64_t ndim, From 490a2b294900c47db710ba566c1450ff034c862e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 11 Sep 2025 15:10:57 -0700 Subject: [PATCH 41/50] remove debug print in c++ --- backends/aoti/runtime/shims/memory.cpp | 205 +++++------------- backends/aoti/runtime/shims/memory.h | 12 +- .../aoti/runtime/shims/tensor_attribute.cpp | 27 +-- .../aoti/runtime/shims/tensor_attribute.h | 3 +- backends/aoti/runtime/shims/types.h | 8 - backends/aoti/runtime/shims/utils.cpp | 22 ++ backends/aoti/runtime/shims/utils.h | 6 + 7 files changed, 97 insertions(+), 186 deletions(-) diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 2b03468ddb3..bf5336e9867 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -18,6 +18,7 @@ #include #include #include "tensor_attribute.h" +#include "utils.h" namespace executorch { namespace backends { @@ -25,6 +26,20 @@ namespace aoti { namespace { // Internal namespace for utility functions +// Utility function to print array values in format [val1, val2, ...] +// For use with pointer-based arrays (e.g., int64_t* strides, int64_t* sizes) +template +void print_array_values( + const ValueType* values, + int64_t count, + const std::string& name = "values") { + std::cout << name << ": ["; + for (int i = 0; i < count; i++) { + std::cout << values[i] << (i < count - 1 ? 
", " : ""); + } + std::cout << "]" << std::endl; +} + // Version 1: For use with int64_t sizes (e.g., from blob creation functions) // Check if tensor is in contiguous memory format (NCHW for 4D tensors) // Contiguous format means strides decrease from left to right: @@ -61,7 +76,8 @@ bool is_tensor_channels_last( (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); } -// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) for +// int32_t sizes bool is_tensor_contiguous( int64_t ndim, const int32_t* sizes, @@ -114,29 +130,22 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( int32_t layout, const uint8_t* opaque_metadata, int64_t opaque_metadata_size) { - std::cout << "Creating tensor from data blob " << data << " - ndim: " << ndim - << ", dtype: " << dtype << ", device_type: " << device_type - << ", storage_offset: " << storage_offset << std::endl; - // Only float32 tensors are supported - if (dtype != 6) { // 6 = float32 - std::cout << "ERROR: Only float32 tensors are supported. Got dtype: " - << dtype << " (expected: 6 for float32)" << std::endl; - return Error::InvalidArgument; + AOTITorchError dtype_error = validate_dtype(dtype); + if (dtype_error != Error::Ok) { + return dtype_error; } // Storage offset must always be 0 - if (storage_offset != 0) { - std::cout << "ERROR: Storage offset must be 0. Got storage_offset: " - << storage_offset << std::endl; - return Error::InvalidArgument; + AOTITorchError storage_offset_error = validate_storage_offset(storage_offset); + if (storage_offset_error != Error::Ok) { + return storage_offset_error; } // Convert sizes to the format expected by ExecutorTorch std::vector sizes(ndim); for (int i = 0; i < ndim; i++) { sizes[i] = static_cast(sizes_ptr[i]); - std::cout << "Size[" << i << "] = " << sizes[i] << std::endl; } // check the tensor format @@ -168,28 +177,11 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( tensors.insert(tensor); *ret_new_tensor = tensor.get(); - is_tensor_own_memory[tensor.get()] = false; - std::cout << "Successfully created tensor from blob: " << tensor.get() - << " wrapping data at: " << adjusted_data << std::endl; - return Error::Ok; } -AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor) { - throw std::runtime_error("Should never create from blob"); -} - AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, @@ -205,14 +197,14 @@ AOTITorchError aoti_torch_empty_strided( numel *= sizes_ptr[i]; } - if (dtype != 6) { // throw if not float32 - throw std::runtime_error("Need to implement empty_strided for non-float32"); + AOTITorchError dtype_error = validate_dtype(dtype); + if (dtype_error != Error::Ok) { + return dtype_error; } int64_t nbytes = numel * 4; if (device_type == 1) { // cuda - std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl; cudaError_t err = cudaMalloc(&ptr, nbytes); if (err != cudaSuccess) { std::cout << "failed to allocate " << nbytes @@ -220,8 +212,8 @@ AOTITorchError aoti_torch_empty_strided( throw std::runtime_error("Failed to call cudaMalloc"); } } else if (device_type == 0) { // cpu - std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl; // Ensure 16-byte alignment for CPU memory to match CUDA requirements + // do we 
need to do this in cuda backend? int result = posix_memalign(&ptr, 16, nbytes); if (result != 0) { throw std::runtime_error("Failed to allocate aligned CPU memory"); @@ -233,8 +225,6 @@ AOTITorchError aoti_torch_empty_strided( throw std::runtime_error( "Need to implement empty_strided for non-CUDA non-CPU"); } - std::cout << "////Allocated " << nbytes << " bytes at " << ptr - << ", sizes_ptr " << sizes_ptr << std::endl; // ETensor sizes std::vector sizes(ndim); @@ -242,13 +232,6 @@ AOTITorchError aoti_torch_empty_strided( sizes[i] = sizes_ptr[i]; } - std::cout << "Sizes: "; - for (int i = 0; i < ndim; i++) { - std::cout << sizes[i] << ", "; - } - - std::cout << std::endl; - // ETensor strides std::vector strides(ndim); if (strides_ptr != nullptr) { @@ -263,7 +246,6 @@ AOTITorchError aoti_torch_empty_strided( strides[i] = strides[i + 1] * sizes_ptr[i + 1]; } } - std::cout << std::endl; // ETensor creation auto tensor = executorch::extension::from_blob(ptr, sizes, strides); @@ -273,22 +255,10 @@ AOTITorchError aoti_torch_empty_strided( *ret_new_tensor = tensor.get(); is_tensor_own_memory[tensor.get()] = true; - std::cout << "Finished. Created tensor " << tensor.get() << " with sizes " - << std::endl - << "sizes.data(): " << sizes.data() - << ", tensor->sizes().data(): " << tensor->sizes().data() - << std::endl; - std::cout << "Size[0] of tensor " << tensor.get() << " is " - << tensor->sizes()[0] << std::endl - << std::endl; - return Error::Ok; } AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { - std::cout << "Called aoti_torch_delete_tensor_object for tensor " << tensor - << std::endl; - // Check ownership before cleaning up metadata auto ownership_it = is_tensor_own_memory.find(tensor); bool owns_memory = (ownership_it != is_tensor_own_memory.end()) @@ -301,8 +271,7 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { is_tensor_own_memory.erase(tensor); if (!owns_memory) { - std::cout << "Tensor " << tensor << " does not own memory. Skipped \n\n" - << std::endl; + // Don't free memory since the tensor doesn't own it return Error::Ok; } @@ -320,26 +289,16 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { // et tensor does not own data; need to free them manually. if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) { // This is GPU memory - free with proper synchronization - std::cout << "Freeing GPU memory at " << data_ptr << std::endl; cudaDeviceSynchronize(); // Wait for all operations to complete BEFORE // freeing cudaFree(data_ptr); - std::cout << "GPU memory freed successfully" << std::endl; } else { // This is CPU memory - free immediately - std::cout << "Freeing CPU memory at " << data_ptr << std::endl; free(data_ptr); - std::cout << "CPU memory freed successfully" << std::endl; } - - std::cout << "Memory freed. Now erasing tensor " << tensor << std::endl; - // Remove from set (this will call the destructor if it's the last // reference) tensors.erase(it); - - std::cout << "Tensor erased. Now returning \n\n" << std::endl; - return Error::Ok; } } @@ -374,10 +333,14 @@ AOTITorchError aoti_torch_copy_( aoti_torch_get_dtype(self, &self_dtype); aoti_torch_get_dtype(src, &src_dtype); - if (self_dtype != 6 || src_dtype != 6) { // 6 = float32 - std::cout << "Error: Only float32 tensors supported. 
Got self.dtype=" - << self_dtype << ", src.dtype=" << src_dtype << std::endl; - return Error::InvalidArgument; + AOTITorchError self_dtype_error = validate_dtype(self_dtype); + if (self_dtype_error != Error::Ok) { + return self_dtype_error; + } + + AOTITorchError src_dtype_error = validate_dtype(src_dtype); + if (src_dtype_error != Error::Ok) { + return src_dtype_error; } // Get stride information for layout validation @@ -386,8 +349,10 @@ AOTITorchError aoti_torch_copy_( aoti_torch_get_strides(self, &self_strides); aoti_torch_get_strides(src, &src_strides); - auto self_sizes = self->sizes(); - auto src_sizes = src->sizes(); + int64_t* self_sizes; + int64_t* src_sizes; + aoti_torch_get_sizes(self, &self_sizes); + aoti_torch_get_sizes(src, &src_sizes); // Check if tensors have the same tensor schema (sizes, strides, dtype) bool same_schema = true; @@ -416,66 +381,46 @@ AOTITorchError aoti_torch_copy_( bool self_is_channels_last = false; bool src_is_channels_last = false; - if (same_schema) { - std::cout << "Same tensor schema detected - enabling naive copy" - << std::endl; - // For same schema, we don't need to check memory formats - just use direct - // copy - } else { + // For same schema, we don't need to check memory formats - just use direct + // copy + if (!same_schema) { // Different strides: check memory format and only support contiguous <-> // channels-last conversion - std::cout - << "Different tensor schemas - checking memory format compatibility" - << std::endl; // Check if contiguous (strides decrease from left to right) self_is_contiguous = - is_tensor_contiguous(self->dim(), self_sizes.data(), self_strides); + is_tensor_contiguous(self->dim(), self_sizes, self_strides); src_is_contiguous = - is_tensor_contiguous(src->dim(), src_sizes.data(), src_strides); + is_tensor_contiguous(src->dim(), src_sizes, src_strides); // Check if channels-last (4D: NHWC format) if (!self_is_contiguous) { self_is_channels_last = - is_tensor_channels_last(self->dim(), self_sizes.data(), self_strides); + is_tensor_channels_last(self->dim(), self_sizes, self_strides); } if (!src_is_contiguous) { src_is_channels_last = - is_tensor_channels_last(src->dim(), src_sizes.data(), src_strides); + is_tensor_channels_last(src->dim(), src_sizes, src_strides); } // Validate layout assumptions only when schemas differ if (!self_is_contiguous && !self_is_channels_last) { std::cout << "Error: self tensor must be contiguous or channels-last for stride conversion. " - << "Got strides: ["; - for (int i = 0; i < self->dim(); i++) { - std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << "self_sizes: ["; - for (int i = 0; i < self->dim(); i++) { - std::cout << self_sizes[i] << (i < self->dim() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; + << std::endl; + print_array_values(self_strides, self->dim(), "self strides"); + print_array_values(self_sizes, self->dim(), "self_sizes"); return Error::InvalidArgument; } if (!src_is_contiguous && !src_is_channels_last) { std::cout - << "Error: src tensor must be contiguous or channels-last for stride conversion. \n" - << "Got strides: ["; - for (int i = 0; i < src->dim(); i++) { - std::cout << src_strides[i] << (i < src->dim() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << "src_sizes: ["; - for (int i = 0; i < self->dim(); i++) { - std::cout << src_sizes[i] << (i < self->dim() - 1 ? 
", " : ""); - } - std::cout << "]" << std::endl; + << "Error: src tensor must be contiguous or channels-last for stride conversion." + << std::endl; + print_array_values(src_strides, src->dim(), "self strides"); + print_array_values(src_sizes, src->dim(), "src_sizes"); return Error::InvalidArgument; } } @@ -493,18 +438,9 @@ AOTITorchError aoti_torch_copy_( bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; - std::cout << "Copy layout: src=" - << (src_is_contiguous ? "contiguous" : "channels-last") << " (" - << (srcIsDevice ? "GPU" : "CPU") << ") -> " - << "dst=" << (self_is_contiguous ? "contiguous" : "channels-last") - << " (" << (dstIsDevice ? "GPU" : "CPU") << ")" << std::endl; - size_t total_bytes = src->nbytes(); if (same_schema) { - std::cout << "Same layout - doing direct copy of " << total_bytes - << " bytes" << std::endl; - // Simple copy since layouts match if (srcIsDevice && dstIsDevice) { err = cudaMemcpy( @@ -646,23 +582,10 @@ AOTITorchError aoti_torch__reinterpret_tensor( const int64_t* strides_ptr, int64_t storage_offset, AOTITensorHandle* ret_new_tensor) { - std::cout << "aoti_torch__reinterpret_tensor called with tensor " << self - << ", ndim: " << ndim << ", storage_offset: " << storage_offset - << std::endl; - - for (int i = 0; i < ndim; i++) { - std::cout << "sizes[" << i << "]: " << sizes_ptr[i] << std::endl; - } - for (int i = 0; i < ndim; i++) { - std::cout << "strides[" << i << "]: " << strides_ptr[i] << std::endl; - } - // Check if storage_offset is not 0 - return error if not - if (storage_offset != 0) { - std::cout - << "Error: aoti_torch__reinterpret_tensor does not support non-zero storage_offset: " - << storage_offset << std::endl; - return Error::InvalidArgument; + AOTITorchError storage_offset_error = validate_storage_offset(storage_offset); + if (storage_offset_error != Error::Ok) { + return storage_offset_error; } // Check if dimensions match @@ -680,13 +603,6 @@ AOTITorchError aoti_torch__reinterpret_tensor( return dtype_err; } - if (dtype != 6) { // 6 = float32 - std::cout - << "ERROR: Only float32 tensors are supported in reinterpret_tensor. 
Got dtype: " - << dtype << " (expected: 6 for float32)" << std::endl; - return Error::InvalidArgument; - } - int32_t device_type; AOTITorchError device_type_err = aoti_torch_get_device_type(self, &device_type); @@ -705,10 +621,6 @@ AOTITorchError aoti_torch__reinterpret_tensor( return device_index_err; } - std::cout << "Creating new tensor with dtype: " << dtype - << ", device_type: " << device_type - << ", device_index: " << device_index << std::endl; - // Create new tensor with the provided sizes and strides using // aoti_torch_empty_strided AOTITorchError create_err = aoti_torch_empty_strided( @@ -737,9 +649,6 @@ AOTITorchError aoti_torch__reinterpret_tensor( return copy_err; } - std::cout << "Successfully created reinterpreted tensor " << *ret_new_tensor - << " from source tensor " << self << std::endl; - return Error::Ok; } diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index e0a83109932..37c5a5796f5 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -40,17 +41,6 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( const uint8_t* opaque_metadata, int64_t opaque_metadata_size); -AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor); - AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index 8e0097cd8bd..955beebd0ed 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -8,6 +8,7 @@ #include "tensor_attribute.h" #include +#include "utils.h" namespace executorch { namespace backends { @@ -44,10 +45,10 @@ AOTITorchError aoti_torch_get_storage_offset( *ret_storage_offset = 0; // ASSERTION: Storage offset must always be 0 - if (*ret_storage_offset != 0) { - std::cout << "ERROR: Storage offset must be 0. Got storage_offset: " - << *ret_storage_offset << std::endl; - return Error::InvalidArgument; + AOTITorchError storage_offset_error = + validate_storage_offset(*ret_storage_offset); + if (storage_offset_error != Error::Ok) { + return storage_offset_error; } return Error::Ok; @@ -66,11 +67,7 @@ AOTITorchError aoti_torch_get_strides( it = tensor_to_strides.emplace(tensor, std::move(strides)).first; } *ret_strides = it->second.data(); - std::cout << "getting strides from tensor " << tensor << " with dim " - << tensor->dim() << std::endl; - for (int i = 0; i < tensor->dim(); i++) { - std::cout << "strides " << i << " = " << (*ret_strides)[i] << std::endl; - } + return Error::Ok; } @@ -80,10 +77,9 @@ AOTITorchError aoti_torch_get_dtype( *ret_dtype = static_cast(tensor->scalar_type()); // ASSERTION: Only float32 tensors are supported - if (*ret_dtype != 6) { // 6 = float32 - std::cout << "ERROR: Only float32 tensors are supported. 
Got dtype: " - << *ret_dtype << " (expected: 6 for float32)" << std::endl; - return Error::InvalidArgument; + AOTITorchError dtype_error = validate_dtype(*ret_dtype); + if (dtype_error != Error::Ok) { + return dtype_error; } return Error::Ok; @@ -102,11 +98,6 @@ AOTITorchError aoti_torch_get_sizes( it = tensor_to_sizes.emplace(tensor, std::move(sizes)).first; } *ret_sizes = it->second.data(); - std::cout << "getting sizes from tensor " << tensor << " with dim " - << tensor->dim() << std::endl; - for (int i = 0; i < tensor->dim(); i++) { - std::cout << "size " << i << " = " << (*ret_sizes)[i] << std::endl; - } return Error::Ok; } diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h index 387056a30fd..20ea3d487a0 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/runtime/shims/tensor_attribute.h @@ -8,9 +8,10 @@ #pragma once -#include "types.h" +#include #include #include +#include "types.h" namespace executorch { namespace backends { diff --git a/backends/aoti/runtime/shims/types.h b/backends/aoti/runtime/shims/types.h index 27b4394d1b6..1bcae2058ca 100644 --- a/backends/aoti/runtime/shims/types.h +++ b/backends/aoti/runtime/shims/types.h @@ -29,14 +29,6 @@ using AOTITensorHandle = Tensor*; using AOTIRuntimeError = Error; using AOTITorchError = Error; -// CUDA-specific types -struct CUDAStreamGuardOpaque { - cudaStream_t original_stream; - int device_index; - cudaEvent_t sync_event; -}; -using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*; - } // extern "C" } // namespace aoti diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp index a9dc5c84eb7..441cd719fa9 100644 --- a/backends/aoti/runtime/shims/utils.cpp +++ b/backends/aoti/runtime/shims/utils.cpp @@ -195,6 +195,28 @@ void cleanup_aoti_tensor_output() { // No cleanup needed since file is opened and closed on each call } +// Dtype validation utility function +AOTITorchError validate_dtype(int32_t dtype) { + // Only float32 tensors are supported (dtype 6) + if (dtype != 6) { + std::cout << "ERROR: Only float32 tensors are supported. Got dtype: " + << dtype << " (expected: 6 for float32)" << std::endl; + return Error::InvalidArgument; + } + return Error::Ok; +} + +// Storage offset validation utility function +AOTITorchError validate_storage_offset(int64_t storage_offset) { + // Storage offset must always be 0 + if (storage_offset != 0) { + std::cout << "ERROR: Storage offset must be 0. 
Got storage_offset: " + << storage_offset << std::endl; + return Error::InvalidArgument; + } + return Error::Ok; +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h index 06d2edce212..630bfa3d74c 100644 --- a/backends/aoti/runtime/shims/utils.h +++ b/backends/aoti/runtime/shims/utils.h @@ -24,6 +24,12 @@ extern "C" { // Cleanup function for tensor output file (called during backend destruction) void cleanup_aoti_tensor_output(); +// Dtype validation utility function +AOTITorchError validate_dtype(int32_t dtype); + +// Storage offset validation utility function +AOTITorchError validate_storage_offset(int64_t storage_offset); + } // extern "C" } // namespace aoti From 558d0c2ae06f29273bb9f43709a4e44251a46d5a Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 11 Sep 2025 17:12:54 -0700 Subject: [PATCH 42/50] remove debug print in c++ - 2 --- backends/aoti/runtime/shims/memory.cpp | 57 +------------------ .../aoti/runtime/shims/tensor_attribute.cpp | 18 +----- 2 files changed, 4 insertions(+), 71 deletions(-) diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index bf5336e9867..62f08ba8444 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -40,7 +40,6 @@ void print_array_values( std::cout << "]" << std::endl; } -// Version 1: For use with int64_t sizes (e.g., from blob creation functions) // Check if tensor is in contiguous memory format (NCHW for 4D tensors) // Contiguous format means strides decrease from left to right: // For NCHW: strides = [C*H*W, H*W, W, 1] @@ -76,39 +75,6 @@ bool is_tensor_channels_last( (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); } -// Check if tensor is in contiguous memory format (NCHW for 4D tensors) for -// int32_t sizes -bool is_tensor_contiguous( - int64_t ndim, - const int32_t* sizes, - const int64_t* strides) { - int64_t expected_stride = 1; - for (int i = ndim - 1; i >= 0; i--) { - if (strides[i] != expected_stride) { - return false; - } - expected_stride *= sizes[i]; - } - return true; -} - -// Check if tensor is in channels-last format (NHWC for 4D tensors) -bool is_tensor_channels_last( - int64_t ndim, - const int32_t* sizes, - const int64_t* strides) { - if (ndim != 4) { - return false; // Channels-last only defined for 4D tensors - } - - int64_t N = sizes[0], C = sizes[1], H = sizes[2], W = sizes[3]; - - // Check NHWC format: strides = [H*W*C, 1, W*C, C] - // Handle edge cases where dimensions might be 1 - return (strides[0] == H * W * C || N <= 1) && (strides[1] == 1 || C <= 1) && - (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); -} - } // anonymous namespace // Global storage for tensors and their metadata @@ -318,9 +284,6 @@ AOTITorchError aoti_torch_copy_( AOTITensorHandle self, AOTITensorHandle src, int32_t non_blocking) { - std::cout << "aoti_torch_copy_ called: self=" << self << ", src=" << src - << std::endl; - // assert same dim for now if (self->dim() != src->dim()) { std::cout << "Error: dimension mismatch. 
self.dim()=" << self->dim() @@ -357,24 +320,14 @@ AOTITorchError aoti_torch_copy_( // Check if tensors have the same tensor schema (sizes, strides, dtype) bool same_schema = true; - // Check sizes match + // Check schema match for (int i = 0; i < self->dim(); i++) { - if (self_sizes[i] != src_sizes[i]) { + if (self_sizes[i] != src_sizes[i] || self_strides[i] != src_strides[i]) { same_schema = false; break; } } - // Check strides match (only if sizes match) - if (same_schema) { - for (int i = 0; i < self->dim(); i++) { - if (self_strides[i] != src_strides[i]) { - same_schema = false; - break; - } - } - } - // Declare layout variables for both cases bool self_is_contiguous = true; bool src_is_contiguous = true; @@ -468,8 +421,6 @@ AOTITorchError aoti_torch_copy_( } } else { // Layout conversion needed (contiguous <-> channels-last) - std::cout << "Layout conversion needed - doing element-wise copy" - << std::endl; if (self->dim() != 4) { std::cout << "Error: Layout conversion only supported for 4D tensors" @@ -568,10 +519,6 @@ AOTITorchError aoti_torch_copy_( dst_first = static_cast(self->data_ptr())[0]; } - std::cout << "Copy verification: src[0]=" << src_first - << ", dst[0]=" << dst_first << std::endl; - std::cout << "aoti_torch_copy_ completed successfully" << std::endl; - return Error::Ok; } diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index 955beebd0ed..57a3805100f 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -44,13 +44,6 @@ AOTITorchError aoti_torch_get_storage_offset( // Storage offset is always 0 in ET *ret_storage_offset = 0; - // ASSERTION: Storage offset must always be 0 - AOTITorchError storage_offset_error = - validate_storage_offset(*ret_storage_offset); - if (storage_offset_error != Error::Ok) { - return storage_offset_error; - } - return Error::Ok; } @@ -110,10 +103,8 @@ AOTITorchError aoti_torch_get_storage_size( AOTITorchError aoti_torch_get_device_type( AOTITensorHandle tensor, int32_t* ret_device_type) { - // Let's assume all tensors AOTI using are on CUDA device + // All tensors in aoti-cuda delegate are on CUDA *ret_device_type = aoti_torch_device_type_cuda(); - std::cout << "getting device_type from tensor " << tensor << " = " - << *ret_device_type << std::endl; return Error::Ok; } @@ -122,15 +113,11 @@ AOTITorchError aoti_torch_get_device_index( int32_t* ret_device_index) { // Let's assume all tensors AOTI using are on CUDA:0 *ret_device_index = 0; - std::cout << "getting device_index from tensor " << tensor << " = " - << *ret_device_index << std::endl; return Error::Ok; } AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim) { - *ret_dim = tensor->dim(); - std::cout << "getting dim from tensor " << tensor << " = " << *ret_dim - << std::endl; + *ret_dim = static_cast(tensor->dim()); return Error::Ok; } @@ -152,7 +139,6 @@ aoti_torch_device_type_cuda() { } __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() { - // Let assume the dtype here is all we will support return 6; } From 5609a5d499cf4945dff083607f53832df1f89d04 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 11 Sep 2025 17:34:37 -0700 Subject: [PATCH 43/50] use et_log and et::error for err and its msg --- backends/aoti/runtime/shims/memory.cpp | 173 ++++++++++++++++--------- backends/aoti/runtime/shims/memory.h | 2 +- backends/aoti/runtime/shims/utils.cpp | 13 +- 3 files changed, 124 insertions(+), 64 deletions(-) 
diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 62f08ba8444..ebc6a0012a0 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -7,6 +7,7 @@ */ #include "memory.h" +#include #include #include #include @@ -26,18 +27,28 @@ namespace aoti { namespace { // Internal namespace for utility functions -// Utility function to print array values in format [val1, val2, ...] +// Utility function to log array values as error msg in format [val1, val2, ...] // For use with pointer-based arrays (e.g., int64_t* strides, int64_t* sizes) -template -void print_array_values( - const ValueType* values, +void et_error_log_array_values( + const int64_t* values, int64_t count, const std::string& name = "values") { - std::cout << name << ": ["; - for (int i = 0; i < count; i++) { - std::cout << values[i] << (i < count - 1 ? ", " : ""); + if (count <= 0) { + ET_LOG(Error, "%s: empty array", name.c_str()); + return; } - std::cout << "]" << std::endl; + + // Build array string representation + std::string array_str = "["; + for (int64_t i = 0; i < count; i++) { + array_str += std::to_string(values[i]); + if (i < count - 1) { + array_str += ", "; + } + } + array_str += "]"; + + ET_LOG(Error, "%s: %s", name.c_str(), array_str.c_str()); } // Check if tensor is in contiguous memory format (NCHW for 4D tensors) @@ -117,9 +128,9 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( // check the tensor format // Only support contiguous format for now if (!is_tensor_contiguous(ndim, sizes_ptr, strides_ptr)) { - std::cout - << "aoti_torch_create_tensor_from_blob_v2 failed since input stride is not in contiguous format. Return with Error" - << std::endl; + ET_LOG( + Error, + "aoti_torch_create_tensor_from_blob_v2 failed since input stride is not in contiguous format"); return Error::InvalidArgument; } @@ -135,7 +146,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( ); if (!tensor) { - std::cerr << "Failed to create tensor from blob" << std::endl; + ET_LOG(Error, "Failed to create tensor from blob"); return Error::InvalidArgument; } @@ -173,23 +184,31 @@ AOTITorchError aoti_torch_empty_strided( if (device_type == 1) { // cuda cudaError_t err = cudaMalloc(&ptr, nbytes); if (err != cudaSuccess) { - std::cout << "failed to allocate " << nbytes - << " error: " << cudaGetErrorString(err) << std::endl; - throw std::runtime_error("Failed to call cudaMalloc"); + ET_LOG( + Error, + "failed to allocate %ld bytes: %s", + nbytes, + cudaGetErrorString(err)); + return Error::MemoryAllocationFailed; } } else if (device_type == 0) { // cpu // Ensure 16-byte alignment for CPU memory to match CUDA requirements // do we need to do this in cuda backend? 
int result = posix_memalign(&ptr, 16, nbytes); if (result != 0) { - throw std::runtime_error("Failed to allocate aligned CPU memory"); + ET_LOG(Error, "Failed to allocate aligned CPU memory"); + return Error::MemoryAllocationFailed; } if (ptr == nullptr) { - throw std::runtime_error("Failed to call posix_memalign"); + ET_LOG(Error, "Failed to call posix_memalign"); + return Error::MemoryAllocationFailed; } } else { - throw std::runtime_error( - "Need to implement empty_strided for non-CUDA non-CPU"); + ET_LOG( + Error, + "Need to implement empty_strided for non-CUDA non-CPU device type %d", + device_type); + return Error::NotImplemented; } // ETensor sizes @@ -268,16 +287,16 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { return Error::Ok; } } - std::cout << "Error: Didn't find tensor " << tensor << std::endl; + ET_LOG(Error, "Didn't find tensor %p", tensor); return Error::InvalidArgument; } -void checkCudaError(cudaError_t err, const char* msg) { +AOTITorchError checkCudaError(cudaError_t err, const char* msg) { if (err != cudaSuccess) { - std::cerr << "Error: " << msg << " (" << cudaGetErrorString(err) << ")" - << std::endl; - exit(EXIT_FAILURE); + ET_LOG(Error, "%s (%s)", msg, cudaGetErrorString(err)); + return Error::Internal; } + return Error::Ok; } AOTITorchError aoti_torch_copy_( @@ -286,8 +305,11 @@ AOTITorchError aoti_torch_copy_( int32_t non_blocking) { // assert same dim for now if (self->dim() != src->dim()) { - std::cout << "Error: dimension mismatch. self.dim()=" << self->dim() - << ", src.dim()=" << src->dim() << std::endl; + ET_LOG( + Error, + "dimension mismatch. self.dim()=%d, src.dim()=%d", + self->dim(), + src->dim()); return Error::InvalidArgument; } @@ -360,20 +382,20 @@ AOTITorchError aoti_torch_copy_( // Validate layout assumptions only when schemas differ if (!self_is_contiguous && !self_is_channels_last) { - std::cout - << "Error: self tensor must be contiguous or channels-last for stride conversion. " - << std::endl; - print_array_values(self_strides, self->dim(), "self strides"); - print_array_values(self_sizes, self->dim(), "self_sizes"); + ET_LOG( + Error, + "self tensor must be contiguous or channels-last for stride conversion"); + et_error_log_array_values(self_strides, self->dim(), "self strides"); + et_error_log_array_values(self_sizes, self->dim(), "self_sizes"); return Error::InvalidArgument; } if (!src_is_contiguous && !src_is_channels_last) { - std::cout - << "Error: src tensor must be contiguous or channels-last for stride conversion." 
- << std::endl; - print_array_values(src_strides, src->dim(), "self strides"); - print_array_values(src_sizes, src->dim(), "src_sizes"); + ET_LOG( + Error, + "src tensor must be contiguous or channels-last for stride conversion"); + et_error_log_array_values(src_strides, src->dim(), "self strides"); + et_error_log_array_values(src_sizes, src->dim(), "src_sizes"); return Error::InvalidArgument; } } @@ -383,10 +405,18 @@ AOTITorchError aoti_torch_copy_( cudaError_t err; err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr()); - checkCudaError(err, "Failed to get source pointer attributes"); + AOTITorchError cuda_err = + checkCudaError(err, "Failed to get source pointer attributes"); + if (cuda_err != Error::Ok) { + return cuda_err; + } err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr()); - checkCudaError(err, "Failed to get destination pointer attributes"); + cuda_err = + checkCudaError(err, "Failed to get destination pointer attributes"); + if (cuda_err != Error::Ok) { + return cuda_err; + } bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; @@ -401,21 +431,30 @@ AOTITorchError aoti_torch_copy_( src->data_ptr(), total_bytes, cudaMemcpyDeviceToDevice); - checkCudaError(err, "Failed to copy from device to device"); + cuda_err = checkCudaError(err, "Failed to copy from device to device"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else if (srcIsDevice && !dstIsDevice) { err = cudaMemcpy( self->mutable_data_ptr(), src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost); - checkCudaError(err, "Failed to copy from device to host"); + cuda_err = checkCudaError(err, "Failed to copy from device to host"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else if (!srcIsDevice && dstIsDevice) { err = cudaMemcpy( self->mutable_data_ptr(), src->data_ptr(), total_bytes, cudaMemcpyHostToDevice); - checkCudaError(err, "Failed to copy from host to device"); + cuda_err = checkCudaError(err, "Failed to copy from host to device"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else { std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes); } @@ -423,8 +462,7 @@ AOTITorchError aoti_torch_copy_( // Layout conversion needed (contiguous <-> channels-last) if (self->dim() != 4) { - std::cout << "Error: Layout conversion only supported for 4D tensors" - << std::endl; + ET_LOG(Error, "Layout conversion only supported for 4D tensors"); return Error::NotImplemented; } @@ -439,7 +477,11 @@ AOTITorchError aoti_torch_copy_( src_host_data = new float[total_elements]; err = cudaMemcpy( src_host_data, src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost); - checkCudaError(err, "Failed to copy src to host"); + cuda_err = checkCudaError(err, "Failed to copy src to host"); + if (cuda_err != Error::Ok) { + delete[] src_host_data; + return cuda_err; + } need_free_src = true; } else { src_host_data = static_cast(src->data_ptr()); @@ -491,7 +533,15 @@ AOTITorchError aoti_torch_copy_( dst_host_data, total_bytes, cudaMemcpyHostToDevice); - checkCudaError(err, "Failed to copy result to device"); + cuda_err = checkCudaError(err, "Failed to copy result to device"); + if (cuda_err != Error::Ok) { + // Clean up temporary buffers before returning + if (need_free_src) + delete[] src_host_data; + if (need_free_dst) + delete[] dst_host_data; + return cuda_err; + } } // Clean up temporary buffers @@ -506,7 +556,10 @@ AOTITorchError aoti_torch_copy_( if (srcIsDevice) { err = cudaMemcpy( &src_first, 
src->data_ptr(), sizeof(float), cudaMemcpyDeviceToHost); - checkCudaError(err, "Failed to copy first src element"); + cuda_err = checkCudaError(err, "Failed to copy first src element"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else { src_first = static_cast(src->data_ptr())[0]; } @@ -514,7 +567,10 @@ AOTITorchError aoti_torch_copy_( if (dstIsDevice) { err = cudaMemcpy( &dst_first, self->data_ptr(), sizeof(float), cudaMemcpyDeviceToHost); - checkCudaError(err, "Failed to copy first dst element"); + cuda_err = checkCudaError(err, "Failed to copy first dst element"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else { dst_first = static_cast(self->data_ptr())[0]; } @@ -537,8 +593,11 @@ AOTITorchError aoti_torch__reinterpret_tensor( // Check if dimensions match if (self->dim() != ndim) { - std::cout << "Error: tensor dimension mismatch. self->dim(): " - << self->dim() << ", provided ndim: " << ndim << std::endl; + ET_LOG( + Error, + "tensor dimension mismatch. self->dim(): %d, provided ndim: %ld", + self->dim(), + ndim); return Error::InvalidArgument; } @@ -546,7 +605,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( int32_t dtype; AOTITorchError dtype_err = aoti_torch_get_dtype(self, &dtype); if (dtype_err != Error::Ok) { - std::cout << "Error: failed to get dtype from input tensor" << std::endl; + ET_LOG(Error, "failed to get dtype from input tensor"); return dtype_err; } @@ -554,8 +613,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( AOTITorchError device_type_err = aoti_torch_get_device_type(self, &device_type); if (device_type_err != Error::Ok) { - std::cout << "Error: failed to get device_type from input tensor" - << std::endl; + ET_LOG(Error, "failed to get device_type from input tensor"); return device_type_err; } @@ -563,8 +621,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( AOTITorchError device_index_err = aoti_torch_get_device_index(self, &device_index); if (device_index_err != Error::Ok) { - std::cout << "Error: failed to get device_index from input tensor" - << std::endl; + ET_LOG(Error, "failed to get device_index from input tensor"); return device_index_err; } @@ -580,16 +637,14 @@ AOTITorchError aoti_torch__reinterpret_tensor( ret_new_tensor); if (create_err != Error::Ok) { - std::cout << "Error: failed to create new tensor with empty_strided" - << std::endl; + ET_LOG(Error, "failed to create new tensor with empty_strided"); return create_err; } // Copy data from source tensor to new tensor AOTITorchError copy_err = aoti_torch_copy_(*ret_new_tensor, self, 0); if (copy_err != Error::Ok) { - std::cout << "Error: failed to copy data from source tensor to new tensor" - << std::endl; + ET_LOG(Error, "failed to copy data from source tensor to new tensor"); // Clean up the created tensor on failure aoti_torch_delete_tensor_object(*ret_new_tensor); *ret_new_tensor = nullptr; @@ -603,7 +658,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( void cleanup_memory() { is_tensor_own_memory.clear(); if (!tensors.empty()) { - std::cout << "Warning: tensors not empty" << std::endl; + ET_LOG(Error, "Warning: tensors not empty during cleanup"); } } diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 37c5a5796f5..8e8e2910b03 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -66,7 +66,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( AOTITensorHandle* ret_new_tensor); // Utility functions -void checkCudaError(cudaError_t err, const char* msg); +AOTITorchError 
checkCudaError(cudaError_t err, const char* msg);
 
 void cleanup_memory();
 
 } // extern "C"
diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp
index 441cd719fa9..d5399125b3b 100644
--- a/backends/aoti/runtime/shims/utils.cpp
+++ b/backends/aoti/runtime/shims/utils.cpp
@@ -7,6 +7,7 @@
  */
 
 #include "utils.h"
+#include
 #include
 #include
 #include
@@ -199,8 +200,10 @@ void cleanup_aoti_tensor_output() {
 AOTITorchError validate_dtype(int32_t dtype) {
   // Only float32 tensors are supported (dtype 6)
   if (dtype != 6) {
-    std::cout << "ERROR: Only float32 tensors are supported. Got dtype: "
-              << dtype << " (expected: 6 for float32)" << std::endl;
+    ET_LOG(
+        Error,
+        "Only float32 tensors are supported. Got dtype: %d (expected: 6 for float32)",
+        dtype);
     return Error::InvalidArgument;
   }
   return Error::Ok;
@@ -210,8 +213,10 @@ AOTITorchError validate_dtype(int32_t dtype) {
 AOTITorchError validate_storage_offset(int64_t storage_offset) {
   // Storage offset must always be 0
   if (storage_offset != 0) {
-    std::cout << "ERROR: Storage offset must be 0. Got storage_offset: "
-              << storage_offset << std::endl;
+    ET_LOG(
+        Error,
+        "Storage offset must be 0. Got storage_offset: %ld",
+        storage_offset);
     return Error::InvalidArgument;
   }
   return Error::Ok;

From 5d4c928249c8743a0bc125ba481ffac38c440464 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 16 Sep 2025 11:26:45 -0700
Subject: [PATCH 44/50] remove spaghetti code 1/n

---
 CMakeLists.txt                                |   1 -
 backends/aoti/aoti_backend.py                 |  47 ++--
 backends/aoti/runtime/aoti_backend.cpp        |   2 +
 backends/aoti/runtime/shims/memory.cpp        |  14 +-
 .../aoti/runtime/shims/tensor_attribute.cpp   |  43 +++-
 backends/aoti/runtime/shims/utils.cpp         | 234 +++++------------
 backends/aoti/runtime/shims/utils.h           |  27 +-
 exir/backend/backend_api.py                   |   1 -
 exir/emit/_emit_program.py                    |  10 -
 exir/program/_program.py                      |  10 -
 runtime/executor/method.cpp                   |   2 -
 11 files changed, 159 insertions(+), 232 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3debc9fcf5..ad3163a2297 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,6 @@
 # https://github.com/google/XNNPACK/commit/c690daa67f883e1b627aadf7684c06797e9a0684
 cmake_minimum_required(VERSION 3.29)
 project(executorch)
-# project(executorch LANGUAGES CXX CUDA)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index a64bb9c5cc5..a07f91eaee7 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -79,36 +79,34 @@ def preprocess(
     # print("here", edge_program.example_inputs)
     copy_edge_program = copy.deepcopy(edge_program)
 
-    # Move the edge_program from CPU to CUDA using move_to_device_pass
-    copy_edge_program = move_to_device_pass(copy_edge_program, "cuda")
-    # graph_module = copy_edge_program.graph_module
-    edge_program_module = copy_edge_program.module()
+    # Move the edge_program from CPU to CUDA for aoti compile
+    cuda_edge_program = move_to_device_pass(copy_edge_program, "cuda")
+
+    edge_program_module = cuda_edge_program.module()
     args, kwargs = copy_edge_program.example_inputs
 
-    # Deep copy args and move tensors to CUDA for aot_compile
-    def move_to_cuda(obj):
-        if isinstance(obj, torch.Tensor):
-            return obj.cuda()
-        elif isinstance(obj, (list, tuple)):
-            return type(obj)(move_to_cuda(item) for item in obj)
-        elif isinstance(obj, dict):
-            return {key: move_to_cuda(value) for key, value in obj.items()}
-        else:
-            return obj
+    # # Deep copy args and move tensors to CUDA for aot_compile
+    # def move_to_cuda(obj):
+    #     if isinstance(obj, torch.Tensor):
+    #         return obj.cuda()
+    #     elif isinstance(obj, (list, tuple)):
+    #         return type(obj)(move_to_cuda(item) for item in obj)
+    #     elif isinstance(obj, dict):
+    #         return {key: move_to_cuda(value) for key, value in obj.items()}
+    #     else:
+    #         return obj
 
-    args = move_to_cuda(copy.deepcopy(args))
-    kwargs =
+    # args = move_to_cuda(copy.deepcopy(args))
+    # kwargs = 
move_to_cuda(copy.deepcopy(kwargs)) - - # print("args, kwargs", args, kwargs) - print("len(args)", len(args)) - print("args[0].shape", args[0].shape) - print("len(kwargs)", len(kwargs)) + # # Deep copy args and move tensors to CUDA for aot_compile + # def move_to_cuda(obj): + # if isinstance(obj, torch.Tensor): + # return obj.cuda() + # elif isinstance(obj, (list, tuple)): + # return type(obj)(move_to_cuda(item) for item in obj) + # elif isinstance(obj, dict): + # return {key: move_to_cuda(value) for key, value in obj.items()} + # else: + # return obj + + # args = move_to_cuda(copy.deepcopy(args)) + # kwargs = move_to_cuda(copy.deepcopy(kwargs)) output_path = os.path.join(os.getcwd(), "aoti.so") options: dict[str, typing.Any] = { + "aot_inductor.embed_kernel_binary": True, + "aot_inductor.link_libtorch": False, "aot_inductor.package_constants_in_so": True, "aot_inductor.output_path": output_path, + "aot_inductor.debug_compile": True, "aot_inductor.force_mmap_weights": False, "max_autotune": True, "max_autotune_gemm_backends": "TRITON", @@ -126,11 +124,6 @@ def move_to_cuda(obj): assert so_path == output_path, f"Expected {output_path} but got {so_path}" - check_call( - f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {output_path}", - shell=True, - ) - print("so_path", so_path) with open(so_path, "rb") as f: diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 6160670042b..242ee24e1d9 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -71,6 +71,8 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ) const override { const NamedDataMap* named_data_map = context.get_named_data_map(); + // std::string so_path = "/home/gasoonjia/executorch/aoti.so"; + std::string so_path = "/tmp/test.so"; std::string so_blob_key = "so_blob"; diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index ebc6a0012a0..99990f4aeab 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -134,15 +134,12 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( return Error::InvalidArgument; } - // Since storage_offset is guaranteed to be 0, use data pointer directly - void* adjusted_data = data; - // Create ExecutorTorch tensor that wraps the existing memory // Note: We're NOT copying the data, just wrapping it auto tensor = executorch::extension::make_tensor_ptr( sizes, // tensor dimensions - adjusted_data, // existing memory (don't copy!) - executorch::aten::ScalarType::Float // only supported dtype + data, // existing memory (don't copy!) 
+ dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType ); if (!tensor) { @@ -179,7 +176,12 @@ AOTITorchError aoti_torch_empty_strided( return dtype_error; } - int64_t nbytes = numel * 4; + size_t element_size = dtype_to_element_size(dtype); + if (element_size == 0) { + ET_LOG(Error, "Invalid element size for dtype: %d", dtype); + return Error::InvalidArgument; + } + int64_t nbytes = numel * element_size; if (device_type == 1) { // cuda cudaError_t err = cudaMalloc(&ptr, nbytes); diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index 57a3805100f..8d26bbbbe30 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -138,9 +138,48 @@ aoti_torch_device_type_cuda() { return 1; } +// Dtype constants - these return the PyTorch dtype codes +// Currently only float32 is supported, but using robust enum-based approach __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() { - return 6; -} + return static_cast(SupportedDTypes::FLOAT32); +} + +// Future dtype support (commented out for now): +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bool() { +// return static_cast(SupportedDTypes::BOOL); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_uint8() { +// return static_cast(SupportedDTypes::UINT8); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int8() { +// return static_cast(SupportedDTypes::INT8); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int16() { +// return static_cast(SupportedDTypes::INT16); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int32() { +// return static_cast(SupportedDTypes::INT32); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int64() { +// return static_cast(SupportedDTypes::INT64); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float16() { +// return static_cast(SupportedDTypes::FLOAT16); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float64() { +// return static_cast(SupportedDTypes::FLOAT64); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bfloat16() { +// return static_cast(SupportedDTypes::BFLOAT16); +// } void cleanup_tensor_metadata() { tensor_to_sizes.clear(); diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp index d5399125b3b..8b1734082bd 100644 --- a/backends/aoti/runtime/shims/utils.cpp +++ b/backends/aoti/runtime/shims/utils.cpp @@ -26,187 +26,79 @@ const char* const TENSOR_OUTPUT_FILENAME = extern "C" { -// void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg) { -// printf("Printing tensor handle: %p\n", self); - -// if (!self) { -// throw std::runtime_error("Tensor handle is null"); -// } - -// printf("Tensor handle is not null\n"); - -// // Get dtype and check if it's float32 (dtype 6 in PyTorch) -// int32_t dtype = 0; -// if (aoti_torch_get_dtype(self, &dtype) != AOTI_TORCH_SUCCESS) { -// throw std::runtime_error("Failed to get tensor dtype"); -// } - -// printf("Tensor dtype is: %d\n", dtype); - -// if (dtype != 6) { // 6 is the dtype code for float32 in PyTorch -// throw std::runtime_error( -// "Tensor dtype is not float32. 
Expected dtype 6, got: " + -// std::to_string(dtype)); -// } - -// printf("Tensor dtype is float32\n"); - -// // Get data pointer -// void* data_ptr = nullptr; -// if (aoti_torch_get_data_ptr(self, &data_ptr) != AOTI_TORCH_SUCCESS || -// !data_ptr) { -// throw std::runtime_error("Failed to get tensor data pointer"); -// } - -// printf("Tensor data pointer is %p not null\n", data_ptr); - -// // Get dimensions -// int64_t dim = 0; -// if (aoti_torch_get_dim(self, &dim) != AOTI_TORCH_SUCCESS) { -// throw std::runtime_error("Failed to get tensor dimensions"); -// } - -// printf("Tensor dimensions are: %ld\n", dim); - -// // Get sizes -// int64_t* sizes = nullptr; -// if (aoti_torch_get_sizes(self, &sizes) != AOTI_TORCH_SUCCESS || !sizes) { -// throw std::runtime_error("Failed to get tensor sizes"); -// } - -// printf("Tensor sizes are: %ld\n", sizes); - -// // Calculate total number of elements -// int64_t total_elements = 1; -// for (int i = 0; i < dim; i++) { -// total_elements *= sizes[i]; -// } - -// printf("Total elements in tensor: %ld\n", total_elements); - -// // Check device type to handle CUDA tensors properly -// int32_t device_type = 0; -// if (aoti_torch_get_device_type(self, &device_type) != AOTI_TORCH_SUCCESS) { -// throw std::runtime_error("Failed to get tensor device type"); -// } - -// printf("Tensor device type: %d\n", device_type); - -// AtenTensorHandle cpu_tensor = nullptr; -// const float* float_data = nullptr; -// bool need_cleanup = false; - -// // Check if tensor is on CUDA (device_type 1 is CUDA) -// if (device_type == 1) { -// printf("Tensor is on CUDA, copying to CPU...\n"); - -// // Get strides for creating CPU tensor -// int64_t* strides = nullptr; -// if (aoti_torch_get_strides(self, &strides) != AOTI_TORCH_SUCCESS || -// !strides) { -// throw std::runtime_error("Failed to get tensor strides"); -// } - -// // Create a CPU tensor with same shape and layout -// if (aoti_torch_empty_strided( -// dim, sizes, strides, dtype, 0, -1, &cpu_tensor) != -// AOTI_TORCH_SUCCESS) { -// throw std::runtime_error("Failed to create CPU tensor"); -// } - -// // Copy data from CUDA to CPU tensor -// if (aoti_torch_copy_(cpu_tensor, self, 0) != AOTI_TORCH_SUCCESS) { -// aoti_torch_delete_tensor_object(cpu_tensor); -// throw std::runtime_error("Failed to copy tensor from CUDA to CPU"); -// } - -// // Get CPU data pointer -// void* cpu_data_ptr = nullptr; -// if (aoti_torch_get_data_ptr(cpu_tensor, &cpu_data_ptr) != -// AOTI_TORCH_SUCCESS || -// !cpu_data_ptr) { -// aoti_torch_delete_tensor_object(cpu_tensor); -// throw std::runtime_error("Failed to get CPU tensor data pointer"); -// } - -// float_data = static_cast(cpu_data_ptr); -// need_cleanup = true; -// printf("Successfully copied CUDA tensor to CPU\n"); -// } else { -// // Tensor is already on CPU, use original data pointer -// printf("Tensor is on CPU, using original data pointer\n"); -// float_data = static_cast(data_ptr); -// } - -// // Open file for writing (append mode to not overwrite previous outputs) -// printf("Writing tensor to file: %s\n", internal::TENSOR_OUTPUT_FILENAME); - -// std::ofstream output_file( -// internal::TENSOR_OUTPUT_FILENAME, std::ios::out | std::ios::app); -// if (!output_file.is_open()) { -// if (need_cleanup) { -// aoti_torch_delete_tensor_object(cpu_tensor); -// } -// throw std::runtime_error( -// "Failed to open output file: " + -// std::string(internal::TENSOR_OUTPUT_FILENAME)); -// } - -// printf("Successfully opened file for writing\n"); +// Function to cleanup the tensor output file (to be 
called from +// aoti_backend.cpp) +void cleanup_aoti_tensor_output() { + // No cleanup needed since file is opened and closed on each call +} -// // Write message and tensor info to file -// output_file << "=== " << msg << " ===" << std::endl; -// output_file << "Device type: " << device_type << std::endl; -// output_file << "Dimensions: " << dim << std::endl; -// output_file << "Sizes: ["; -// for (int i = 0; i < dim; i++) { -// output_file << sizes[i]; -// if (i < dim - 1) -// output_file << ", "; -// } -// output_file << "]" << std::endl; -// output_file << "Total elements: " << total_elements << std::endl; -// output_file << "Data content:" << std::endl; +// Helper function to check if a dtype is supported +bool is_dtype_supported_in_et_cuda(int32_t dtype) { + switch (dtype) { + case static_cast(SupportedDTypes::FLOAT32): + return true; + // case static_cast(SupportedDTypes::BOOL): + // case static_cast(SupportedDTypes::UINT8): + // case static_cast(SupportedDTypes::INT8): + // case static_cast(SupportedDTypes::INT16): + // case static_cast(SupportedDTypes::INT32): + // case static_cast(SupportedDTypes::INT64): + // case static_cast(SupportedDTypes::FLOAT16): + // case static_cast(SupportedDTypes::FLOAT64): + // case static_cast(SupportedDTypes::BFLOAT16): + // return true; + default: + return false; + } +} -// // Write tensor data to file (now safe to access) -// for (int64_t i = 0; i < total_elements; i++) { -// output_file << float_data[i] << " "; -// if (i < total_elements - 1) { -// output_file << ", "; -// // Add newline every 10 elements for readability -// if ((i + 1) % 10 == 0) { -// output_file << std::endl; -// } -// } -// } -// output_file << std::endl << std::endl; +// Map int32_t dtype to number of bytes per element (reusing ExecutorTorch's +// elementSize function) +size_t dtype_to_element_size(int32_t dtype) { + // First convert int32_t dtype to ExecutorTorch ScalarType, then use existing + // elementSize function + executorch::aten::ScalarType scalar_type = dtype_to_scalar_type(dtype); + if (scalar_type == executorch::aten::ScalarType::Undefined) { + ET_LOG(Error, "Unsupported dtype: %d for element size calculation", dtype); + return 0; // Return 0 to indicate error + } -// // Clean up CPU tensor if we created one -// if (need_cleanup) { -// aoti_torch_delete_tensor_object(cpu_tensor); -// printf("Cleaned up temporary CPU tensor\n"); -// } + // Reuse ExecutorTorch's existing elementSize function from scalar_type_util.h + return executorch::runtime::elementSize(scalar_type); +} -// // File will be automatically closed when output_file goes out of scope -// } +// Map int32_t dtype to ExecutorTorch ScalarType (robust version of hardcoded +// ScalarType::Float) +executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) { + // First check if the dtype is supported + if (!is_dtype_supported_in_et_cuda(dtype)) { + ET_LOG(Error, "Unsupported dtype: %d for ScalarType conversion", dtype); + return executorch::aten::ScalarType::Undefined; + } -// Function to cleanup the tensor output file (to be called from -// aoti_backend.cpp) -void cleanup_aoti_tensor_output() { - // No cleanup needed since file is opened and closed on each call + // If supported, use switch to convert + switch (dtype) { + case static_cast(SupportedDTypes::FLOAT32): + return executorch::aten::ScalarType::Float; + default: + ET_LOG( + Error, "Unexpected error in dtype conversion for dtype: %d", dtype); + return executorch::aten::ScalarType::Undefined; + } } // Dtype validation utility function 
AOTITorchError validate_dtype(int32_t dtype) { - // Only float32 tensors are supported (dtype 6) - if (dtype != 6) { - ET_LOG( - Error, - "Only float32 tensors are supported. Got dtype: %d (expected: 6 for float32)", - dtype); - return Error::InvalidArgument; + if (is_dtype_supported_in_et_cuda(dtype)) { + return Error::Ok; } - return Error::Ok; + + ET_LOG( + Error, + "Unsupported dtype: %d. Supported dtypes: %d (float32)", + dtype, + static_cast(SupportedDTypes::FLOAT32)); + return Error::InvalidArgument; } // Storage offset validation utility function diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h index 630bfa3d74c..a2af9e95e56 100644 --- a/backends/aoti/runtime/shims/utils.h +++ b/backends/aoti/runtime/shims/utils.h @@ -16,10 +16,33 @@ namespace executorch { namespace backends { namespace aoti { +// Enum for supported data types in et-cuda backend +enum class SupportedDTypes : int32_t { + FLOAT32 = 6, // PyTorch's float32 dtype code + + // BOOL = 11, // PyTorch's bool dtype code + // UINT8 = 1, // PyTorch's uint8 dtype code + // INT8 = 2, // PyTorch's int8 dtype code + // INT16 = 3, // PyTorch's int16 dtype code + // INT32 = 4, // PyTorch's int32 dtype code + // INT64 = 5, // PyTorch's int64 dtype code + // FLOAT16 = 7, // PyTorch's float16 dtype code + // FLOAT64 = 8, // PyTorch's float64 dtype code + // BFLOAT16 = 15 // PyTorch's bfloat16 dtype code +}; + extern "C" { -// // Utility function for printing tensor information -// void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg); +// Helper function to check if a dtype is supported +bool is_dtype_supported_in_et_cuda(int32_t dtype); + +// Map int32_t dtype to number of bytes per element (reusing ExecutorTorch's +// elementSize function) +size_t dtype_to_element_size(int32_t dtype); + +// Map int32_t dtype to ExecutorTorch ScalarType (robust version of hardcoded +// ScalarType::Float) +executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype); // Cleanup function for tensor output file (called during backend destruction) void cleanup_aoti_tensor_output(); diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 95c7c9caa6d..c93c41e223c 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -720,7 +720,6 @@ def to_backend( fake_edge_program = copy.deepcopy(edge_program) partitioner_result = partitioner_instance(fake_edge_program) tagged_exported_program = partitioner_result.tagged_exported_program - # Make sure tagged_exported_program has the same example_inputs as edge_program tagged_exported_program.example_inputs = edge_program.example_inputs method_to_tagged_exported_program[method_name] = tagged_exported_program diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index 61997e97687..3430ad7a920 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -156,13 +156,9 @@ def emit_program( instruction_id_to_num_outs_map = {} program_state = _ProgramState() - print( - "111111111111111111111111111111111111111111111111111111111111111111111111111111" - ) # emit each entry point in order according to name. 
for name, exported_program in sorted(methods.items()): - print(name) # create empty state emitter_state = _EmitterState( values=[], @@ -174,8 +170,6 @@ def emit_program( emit_mutable_buffer_names=emit_mutable_buffer_names, ) - print("222222222222222222222222222222222222222222222222222222222222222222222") - gm = _remove_non_user_outputs(exported_program) emitter = _TopLevelEmitter( @@ -184,8 +178,6 @@ def emit_program( emitter.run() - print("333333333333333333333333333333333333333333333333333333333333333333333") - plans.append(emitter.plan()) debug_handle_map[name] = emitter.debug_handle_map @@ -202,8 +194,6 @@ def emit_program( if prim_getters is not None: plans.extend(emitter._emit_prim_getters(prim_getters)) - print("333333333333333333333333333333333333333333333333333333333333333333333") - return EmitterOutput( debug_handle_map=debug_handle_map, method_to_delegate_debug_id_map=method_to_delegate_debug_id_map, diff --git a/exir/program/_program.py b/exir/program/_program.py index e3ada9301b7..c740bbcb7b3 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1698,16 +1698,8 @@ def to_executorch( # noqa (FLAKE8) C901 """ config = config if config else ExecutorchBackendConfig() - def exported_program_to_device(exported_program, device): - for _, param in exported_program.named_parameters(): - param.data = param.data.to(device) - for _, buffer in exported_program.named_buffers(): - buffer.data = buffer.data.to(device) - return exported_program - execution_programs: Dict[str, ExportedProgram] = {} for name, program in self._edge_programs.items(): - # program = exported_program_to_device(program, "cpu") if config.do_quant_fusion_and_const_prop: if program.graph_signature.backward_signature is not None: raise Exception( @@ -1834,7 +1826,6 @@ def __init__( backend_config = backend_config or ExecutorchBackendConfig() - print("start emitting..") # Emit methods self._emitter_output: EmitterOutput = emit_program( self._execution_programs, @@ -1842,7 +1833,6 @@ def __init__( self._config_methods, backend_config.emit_mutable_buffer_names, ) - print("done. start serializing..") # Serialize emitter output, ready to be written to a file. 
self._data_serializer = FlatTensorSerializer() diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 1c90f88df7c..65a47594c8d 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -1580,8 +1580,6 @@ Error Method::execute() { "chain %" ET_PRIsize_t " has no instructions field", step_state_.chain_idx); - ET_LOG(Debug, "Executing chain idx: %" ET_PRIsize_t, step_state_.chain_idx); - // Loop over instructions step_state_.instr_idx = 0; while (step_state_.instr_idx < chain.s_chain_->instructions()->size()) { From 3e2f2b7b72b14962fc56051eabe8f3332682462e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 17 Sep 2025 11:31:19 -0700 Subject: [PATCH 45/50] remove unnecessary export code --- backends/aoti/aoti_backend.py | 26 ++---------- backends/aoti/aoti_partitioner.py | 70 +++++++------------------------ 2 files changed, 17 insertions(+), 79 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index a07f91eaee7..58d94d4042f 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -73,31 +73,15 @@ def preprocess( compile_specs: List[CompileSpec], ) -> PreprocessResult: - print("entering the lowerable parts in AotiBackend.preprocess....") named_data_store = NamedDataStore() - # print("here", edge_program.example_inputs) - copy_edge_program = copy.deepcopy(edge_program) + # copy_edge_program = copy.deepcopy(edge_program) # Move the edge_program from CPU to CUDA for aoti compile - cuda_edge_program = move_to_device_pass(copy_edge_program, "cuda") + cuda_edge_program = move_to_device_pass(edge_program, "cuda") edge_program_module = cuda_edge_program.module() - args, kwargs = copy_edge_program.example_inputs - - # # Deep copy args and move tensors to CUDA for aot_compile - # def move_to_cuda(obj): - # if isinstance(obj, torch.Tensor): - # return obj.cuda() - # elif isinstance(obj, (list, tuple)): - # return type(obj)(move_to_cuda(item) for item in obj) - # elif isinstance(obj, dict): - # return {key: move_to_cuda(value) for key, value in obj.items()} - # else: - # return obj - - # args = move_to_cuda(copy.deepcopy(args)) - # kwargs = move_to_cuda(copy.deepcopy(kwargs)) + args, kwargs = cuda_edge_program.example_inputs output_path = os.path.join(os.getcwd(), "aoti.so") @@ -122,10 +106,6 @@ def preprocess( "Please add them to the AOTI backend." 
            )
-
-        assert so_path == output_path, f"Expected {output_path} but got {so_path}"
-
-        print("so_path", so_path)
-
        with open(so_path, "rb") as f:
            so_data = f.read()

diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py
index 6aeb63f959d..6b9089e5915 100644
--- a/backends/aoti/aoti_partitioner.py
+++ b/backends/aoti/aoti_partitioner.py
@@ -6,8 +6,7 @@
 
 # pyre-unsafe
 
-import operator
-from typing import Callable, cast, Dict, final, List, Optional, Set, Tuple
+from typing import Callable, Dict, final, List, Optional, Tuple
 
 import torch
 from executorch.backends.aoti.aoti_backend import AotiBackend  # usort: skip
@@ -18,65 +17,26 @@
     PartitionResult,
 )
 from executorch.exir.backend.utils import tag_constant_data
-from executorch.exir.dialects._ops import ops as exir_ops
 
 from torch.export.exported_program import ExportedProgram
-from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
-
-from torch.fx.passes.operator_support import OperatorSupportBase
-
-
-class AOTISupportedOperators(OperatorSupportBase):
-    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
-        # supported = node.op == "call_function" and (
-        #     node.target == operator.getitem
-        #     or str(node.target._op) not in inductor_fallback_ops
-        #     or str(node.target._op) in supported_fallback_operators
-        # )
-
-        supported = node.op == "call_function"
-
-        return supported
-
-    def is_node_supported_custom(self, node: torch.fx.Node) -> bool:
-        if node.target == exir_ops.edge.aten.mean.dim:
-            keep_dim = node.args[2] if len(node.args) > 2 else False
-            return cast(bool, keep_dim)
-        if node.target == exir_ops.edge.aten.var.correction:
-            keep_dim = node.kwargs.get("keepdim", False)
-            return cast(bool, keep_dim)
-        return True
 
 
 @final
 class AotiPartitioner(Partitioner):
     def __init__(self, compile_spec: List[CompileSpec]) -> None:
         self.delegation_spec = DelegationSpec(AotiBackend.__name__, compile_spec)
-        print(self.delegation_spec)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
-        # Run the CapabilityBasedPartitioner to return the largest possible
-        # subgraphs containing the nodes with the tags
-        # logger.info("AotiPartitioner::partition")
-        print("entering partitioner...")
-
-        partition_tags = {}
-
-        capability_partitioner = CapabilityBasedPartitioner(
-            exported_program.graph_module,
-            AOTISupportedOperators(),
-            allows_single_node_partition=True,
-        )
-        partition_list = capability_partitioner.propose_partitions()
-
-        assert len(partition_list) == 1, "Graph break is not supported yet"
-
-        print(f"graph breaks into {len(partition_list)} parts")
+        """
+        Fully delegate the graph to AOTInductor by tagging all nodes as a single partition.
+        """
 
-        for partition in partition_list:
-            for node in partition.nodes:
-                tag = f"tag{partition.id}"
-                node.meta["delegation_tag"] = tag
-                partition_tags[tag] = self.delegation_spec
+        partition_tags: Dict[str, DelegationSpec] = {}
+        for node in exported_program.graph.nodes:
+            if node.op != "call_function":
+                continue
+            tag = "tag0"
+            node.meta["delegation_tag"] = tag
+            partition_tags[tag] = self.delegation_spec
 
         tag_constant_data(exported_program)
 
@@ -89,15 +49,13 @@ def ops_to_not_decompose(
     ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
         """
         Return a list of operations that should not be decomposed and let the AOT compiler handle them.
+        Currently we skip decomposing all ops so the AOT compiler sees them unchanged.
         """
         do_not_decompose = set()
-        op_support = AOTISupportedOperators()
 
         for node in ep.graph.nodes:
-            if (
-                node.op == "call_function"
-                and isinstance(node.target, torch._ops.OpOverload)
-                and op_support.is_node_supported(None, node)
+            if node.op == "call_function" and isinstance(
+                node.target, torch._ops.OpOverload
             ):
                 do_not_decompose.add(node.target)
         return list(do_not_decompose), None

From 32c14b12f154614d69418c3a38a7d280365f3124 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 17 Sep 2025 14:20:22 -0700
Subject: [PATCH 46/50] set example input to only the very first partition

---
 backends/aoti/aoti_backend.py  |  2 --
 exir/backend/backend_api.py    | 10 ++++++++--
 exir/lowered_backend_module.py |  8 +++++++-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index 58d94d4042f..21fcd5d86f0 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -75,8 +75,6 @@ def preprocess(
         named_data_store = NamedDataStore()
 
-        # copy_edge_program = copy.deepcopy(edge_program)
-
         # Move the edge_program from CPU to CUDA for aoti compile
         cuda_edge_program = move_to_device_pass(edge_program, "cuda")
 
         edge_program_module = cuda_edge_program.module()

diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py
index c93c41e223c..d0225437c99 100644
--- a/exir/backend/backend_api.py
+++ b/exir/backend/backend_api.py
@@ -268,7 +268,9 @@ def _partition_and_lower_one_graph_module(
     """
     Partitioned and lowered the graph module based on the partition tag, this is to handle one graph module.
     """
-    for tag, delegation_spec in partition_result.partition_tags.items():
+    for idx, (tag, delegation_spec) in enumerate(
+        partition_result.partition_tags.items()
+    ):
         # Create partition with nodes containing this tag. There should only be
         # one contained submodule per tag
         node_list = _get_node_list_with_same_tag(
@@ -311,6 +313,7 @@ def _partition_and_lower_one_graph_module(
             tag,
             call_module_node,
             is_submodule,
+            idx == 0,
         )
 
         lowered_submodule = to_backend(
@@ -452,7 +455,9 @@ def _create_partitions_in_graph_module(
     is_submodule: bool,
 ) -> Dict[str, List[torch.fx.Node]]:
     backend_id_to_submodule_name = {}
-    for tag, delegation_spec in partition_result.partition_tags.items():
+    for idx, (tag, delegation_spec) in enumerate(
+        partition_result.partition_tags.items()
+    ):
         # Create partition with nodes containing this tag. There should only be
         # one contained submodule per tag
         node_list = _get_node_list_with_same_tag(
@@ -492,6 +497,7 @@ def _create_partitions_in_graph_module(
             tag,
             call_module_node,
             is_submodule,
+            idx == 0,
        )
         call_module_node.meta["backend_id"] = delegation_spec.backend_id
         call_module_node.meta["compile_spec"] = delegation_spec.compile_specs

diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py
index 2e889c6d81d..3c5ee5d36b0 100644
--- a/exir/lowered_backend_module.py
+++ b/exir/lowered_backend_module.py
@@ -682,6 +682,7 @@ def create_exported_program_from_submodule(
     tag: str,
     call_module_node: torch.fx.Node,
     is_submodule: bool,
+    is_first_partition: bool = False,
 ) -> Tuple[ExportedProgram, Dict[str, InputSpec], Dict[str, OutputSpec]]:
     """
     Creates an ExportedProgram from the given submodule using the parameters and buffers
@@ -720,6 +721,11 @@ def create_exported_program_from_submodule(
     in_spec = pytree.tree_flatten((tuple(subgraph_signature.user_inputs), {}))[1]
     out_spec = pytree.tree_flatten(subgraph_signature.user_outputs)[1]
 
+    # Only the example inputs of the first partition equal the example inputs of the owning program.
+    submodule_example_inputs = (
+        owning_program.example_inputs if is_first_partition else None
+    )
+
     return (
         ExportedProgram(
             root=submodule,
@@ -735,7 +741,7 @@ def create_exported_program_from_submodule(
                 ),
             )
         ],
-        example_inputs=owning_program.example_inputs,
+        example_inputs=submodule_example_inputs,
         constants=subgraph_constants,
         verifiers=[owning_program.verifier],
     ),

From 3b028297f9746eec69beb60b223d36f8de2817d4 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Sep 2025 01:39:48 -0700
Subject: [PATCH 47/50] refactor aoti-driven backends

---
 CMakeLists.txt                                |   9 +-
 backends/aoti/CMakeLists.txt                  |  48 +++----
 backends/aoti/README.md                       |   2 -
 backends/aoti/{runtime => }/TARGETS           |   0
 .../{runtime => }/aoti_model_container.cpp    |   0
 .../aoti/{runtime => }/aoti_model_container.h |   2 +-
 .../tensor_attribute.cpp => common_shims.cpp} |  81 ++++--------
 .../tensor_attribute.h => common_shims.h}     |  24 +++-
 backends/aoti/cuda/CMakeLists.txt             |  70 ++++++++++
 backends/aoti/cuda/TARGETS                    |   3 +
 backends/aoti/cuda/__init__.py                |   5 +
 .../{aoti_backend.py => cuda/cuda_backend.py} |   2 +-
 .../cuda_partitioner.py}                      |   6 +-
 .../runtime/cuda_backend.cpp}                 |  35 +++--
 .../aoti/{ => cuda}/runtime/shims/memory.cpp  |   7 +-
 .../aoti/{ => cuda}/runtime/shims/memory.h    |   2 +-
 .../cuda/runtime/shims/tensor_attribute.cpp   |  37 ++++++
 .../runtime/shims/tensor_attribute.h}         |  13 +-
 backends/aoti/cuda/runtime/utils.cpp          |  71 ++++++++++
 backends/aoti/cuda/runtime/utils.h            |  36 ++++++
 backends/aoti/cuda/targets.bzl                |  28 ++++
 backends/aoti/runtime/shims/utils.cpp         | 121 ------------------
 backends/aoti/runtime/shims/utils.h           |  60 ---------
 backends/aoti/{runtime => }/targets.bzl       |  16 +--
 backends/aoti/utils.cpp                       |  83 ++++++++++++
 backends/aoti/utils.h                         |  43 +++++++
 exir/emit/_emit_program.py                    |   1 -
 export_and_run_aoti.sh                        |   4 +-
 export_aoti.py                                |   4 +-
 tools/cmake/preset/default.cmake              |   5 +-
 30 files changed, 485 insertions(+), 333 deletions(-)
 delete mode 100644 backends/aoti/README.md
 rename backends/aoti/{runtime => }/TARGETS (100%)
 rename backends/aoti/{runtime => }/aoti_model_container.cpp (100%)
 rename backends/aoti/{runtime => }/aoti_model_container.h (98%)
 rename backends/aoti/{runtime/shims/tensor_attribute.cpp => common_shims.cpp} (62%)
 rename backends/aoti/{runtime/shims/tensor_attribute.h => common_shims.h} (74%)
 create mode 100644 backends/aoti/cuda/CMakeLists.txt
 create mode 100644
backends/aoti/cuda/TARGETS create mode 100644 backends/aoti/cuda/__init__.py rename backends/aoti/{aoti_backend.py => cuda/cuda_backend.py} (99%) rename backends/aoti/{aoti_partitioner.py => cuda/cuda_partitioner.py} (91%) rename backends/aoti/{runtime/aoti_backend.cpp => cuda/runtime/cuda_backend.cpp} (93%) rename backends/aoti/{ => cuda}/runtime/shims/memory.cpp (98%) rename backends/aoti/{ => cuda}/runtime/shims/memory.h (97%) create mode 100644 backends/aoti/cuda/runtime/shims/tensor_attribute.cpp rename backends/aoti/{runtime/shims/types.h => cuda/runtime/shims/tensor_attribute.h} (74%) create mode 100644 backends/aoti/cuda/runtime/utils.cpp create mode 100644 backends/aoti/cuda/runtime/utils.h create mode 100644 backends/aoti/cuda/targets.bzl delete mode 100644 backends/aoti/runtime/shims/utils.cpp delete mode 100644 backends/aoti/runtime/shims/utils.h rename backends/aoti/{runtime => }/targets.bzl (68%) create mode 100644 backends/aoti/utils.cpp create mode 100644 backends/aoti/utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ad3163a2297..21ec1ba8e7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,7 +50,6 @@ cmake_minimum_required(VERSION 3.29) project(executorch) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake) @@ -592,9 +591,13 @@ if(EXECUTORCH_BUILD_CORTEX_M) list(APPEND _executorch_backends coretex_m_backend) endif() -if(EXECUTORCH_BUILD_AOTI) +if(EXECUTORCH_BUILD_CUDA) + # Build common AOTI functionality (required for CUDA) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) - list(APPEND _executorch_backends aoti_backend) + # Build CUDA-specific AOTI functionality + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti/cuda) + # Add aoti_cuda to backends - it already depends on aoti_common + list(APPEND _executorch_backends aoti_cuda) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index ca26f30d73e..ab3ac80e57a 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -21,48 +21,34 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
endif() -find_package(CUDAToolkit REQUIRED) - # Use ExecutorTorch's standard way to find PyTorch libraries for AOTI include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) find_package_torch() -set(_aoti_sources - runtime/aoti_backend.cpp - runtime/aoti_model_container.cpp - runtime/shims/memory.cpp - runtime/shims/tensor_attribute.cpp - runtime/shims/utils.cpp) -add_library(aoti_backend STATIC ${_aoti_sources}) +# Common AOTI functionality (non-CUDA) +set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp utils.cpp) +add_library(aoti_common STATIC ${_aoti_common_sources}) target_include_directories( - aoti_backend - PUBLIC - ${CUDAToolkit_INCLUDE_DIRS} - $ - $ - # PyTorch AOTI headers from ExecutorTorch's torch detection - ${TORCH_INCLUDE_DIRS} + aoti_common + PUBLIC $ $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} ) -target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC) +target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC) # Ensure symbols are exported properly -target_link_options(aoti_backend PUBLIC -Wl,--export-dynamic) +target_link_options(aoti_common PUBLIC -Wl,--export-dynamic) -# Link against CUDA::cudart, PyTorch libraries and standard libraries +# Link against PyTorch libraries and standard libraries target_link_libraries( - aoti_backend - PUBLIC - extension_tensor - CUDA::cudart - ${CMAKE_DL_LIBS} - # Link PyTorch libraries for AOTI CUDA functions - ${TORCH_LIBRARIES} + aoti_common + PUBLIC extension_tensor ${CMAKE_DL_LIBS} + # Link PyTorch libraries for AOTI functions + ${TORCH_LIBRARIES} ) -# If you need other CUDA libraries, link them similarly: -# target_link_libraries(aoti_backend PUBLIC CUDA::cublas CUDA::cufft ...) -# If you have a custom function, keep it -executorch_target_link_options_shared_lib(aoti_backend) +executorch_target_link_options_shared_lib(aoti_common) + install( - TARGETS aoti_backend + TARGETS aoti_common EXPORT ExecuTorchTargets DESTINATION lib ) diff --git a/backends/aoti/README.md b/backends/aoti/README.md deleted file mode 100644 index 9df05c99e07..00000000000 --- a/backends/aoti/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Experimental AOTI backend -Proceed with caution. This is an experimental backend that is not yet ready for production use. 
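
Note on the dtype plumbing in the shim diffs that follow: the AOTI shim ABI identifies dtypes by integer PyTorch ScalarType codes, and the common shims hardcode float32 as code 6. Below is a minimal Python-side mirror of that policy, assuming torch's standard ScalarType numbering; the dict and function names are illustrative sketches, not part of this patch.

import torch

# Hypothetical mirror of the shim's dtype policy. The integer codes follow
# torch's ScalarType numbering, which is what crosses the C shim boundary.
SCALAR_TYPE_CODES = {
    torch.uint8: 0, torch.int8: 1, torch.int16: 2, torch.int32: 3,
    torch.int64: 4, torch.float16: 5, torch.float32: 6, torch.float64: 7,
    torch.bool: 11, torch.bfloat16: 15,
}

def validate_dtype(dtype: torch.dtype) -> None:
    # Mirrors validate_dtype() in the runtime shims: only float32 (code 6)
    # is accepted by the et-cuda backend for now.
    if SCALAR_TYPE_CODES.get(dtype) != 6:
        raise ValueError(f"unsupported dtype {dtype}; only float32 (code 6)")

validate_dtype(torch.float32)  # passes; anything else raises
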
diff --git a/backends/aoti/runtime/TARGETS b/backends/aoti/TARGETS similarity index 100% rename from backends/aoti/runtime/TARGETS rename to backends/aoti/TARGETS diff --git a/backends/aoti/runtime/aoti_model_container.cpp b/backends/aoti/aoti_model_container.cpp similarity index 100% rename from backends/aoti/runtime/aoti_model_container.cpp rename to backends/aoti/aoti_model_container.cpp diff --git a/backends/aoti/runtime/aoti_model_container.h b/backends/aoti/aoti_model_container.h similarity index 98% rename from backends/aoti/runtime/aoti_model_container.h rename to backends/aoti/aoti_model_container.h index 39a8a35c14f..d5cae26cd05 100644 --- a/backends/aoti/runtime/aoti_model_container.h +++ b/backends/aoti/aoti_model_container.h @@ -10,7 +10,7 @@ #include #include -#include "shims/memory.h" +#include "cuda/runtime/shims/memory.h" namespace executorch { namespace backends { diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/common_shims.cpp similarity index 62% rename from backends/aoti/runtime/shims/tensor_attribute.cpp rename to backends/aoti/common_shims.cpp index 8d26bbbbe30..fbc596ce8b0 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/common_shims.cpp @@ -6,20 +6,31 @@ * LICENSE file in the root directory of this source tree. */ -#include "tensor_attribute.h" +#include "common_shims.h" +#include +#include +#include +#include #include -#include "utils.h" +#include namespace executorch { namespace backends { namespace aoti { +namespace internal { +// Constants for file operations +const char* const TENSOR_OUTPUT_FILENAME = + "/home/gasoonjia/executorch/aoti_intermediate_output.txt"; +} // namespace internal + // Global storage for tensor metadata std::unordered_map> tensor_to_sizes; std::unordered_map> tensor_to_strides; extern "C" { +// Autograd mode functions int32_t aoti_torch_grad_mode_is_enabled() { // No autograd ever return false; @@ -31,6 +42,7 @@ void aoti_torch_grad_mode_set_enabled(bool enabled) { } } +// Tensor attribute operations AOTITorchError aoti_torch_get_data_ptr( AOTITensorHandle tensor, void** ret_data_ptr) { @@ -69,12 +81,6 @@ AOTITorchError aoti_torch_get_dtype( int32_t* ret_dtype) { *ret_dtype = static_cast(tensor->scalar_type()); - // ASSERTION: Only float32 tensors are supported - AOTITorchError dtype_error = validate_dtype(*ret_dtype); - if (dtype_error != Error::Ok) { - return dtype_error; - } - return Error::Ok; } @@ -100,13 +106,6 @@ AOTITorchError aoti_torch_get_storage_size( throw std::runtime_error("Cannot get storage size on ETensor"); } -AOTITorchError aoti_torch_get_device_type( - AOTITensorHandle tensor, - int32_t* ret_device_type) { - // All tensors in aoti-cuda delegate are on CUDA - *ret_device_type = aoti_torch_device_type_cuda(); - return Error::Ok; -} AOTITorchError aoti_torch_get_device_index( AOTITensorHandle tensor, @@ -121,6 +120,7 @@ AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim) { return Error::Ok; } +// Device and layout utility functions int32_t aoti_torch_device_type_cpu() { // Let's say cpu is 0 for ET as well return 0; @@ -132,60 +132,23 @@ __attribute__((__visibility__("default"))) int32_t aoti_torch_layout_strided() { return 0; } -__attribute__((__visibility__("default"))) int32_t -aoti_torch_device_type_cuda() { - // Let's say cuda is 1 for ET as well - return 1; -} - // Dtype constants - these return the PyTorch dtype codes // Currently only float32 is supported, but using robust enum-based approach 
__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() { - return static_cast(SupportedDTypes::FLOAT32); + return 6; // PyTorch's float32 dtype code } -// Future dtype support (commented out for now): -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bool() { -// return static_cast(SupportedDTypes::BOOL); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_uint8() { -// return static_cast(SupportedDTypes::UINT8); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int8() { -// return static_cast(SupportedDTypes::INT8); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int16() { -// return static_cast(SupportedDTypes::INT16); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int32() { -// return static_cast(SupportedDTypes::INT32); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int64() { -// return static_cast(SupportedDTypes::INT64); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float16() { -// return static_cast(SupportedDTypes::FLOAT16); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float64() { -// return static_cast(SupportedDTypes::FLOAT64); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bfloat16() { -// return static_cast(SupportedDTypes::BFLOAT16); -// } - +// Cleanup functions void cleanup_tensor_metadata() { tensor_to_sizes.clear(); tensor_to_strides.clear(); } +void cleanup_aoti_tensor_output() { + // Clean up any tensor output related resources + // For now this is a no-op, but can be extended if needed +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/common_shims.h similarity index 74% rename from backends/aoti/runtime/shims/tensor_attribute.h rename to backends/aoti/common_shims.h index 20ea3d487a0..260a7661c6b 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/common_shims.h @@ -9,16 +9,30 @@ #pragma once #include +#include +#include +#include +#include +#include #include #include -#include "types.h" namespace executorch { namespace backends { namespace aoti { +// Common using declarations for ExecutorTorch types +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + extern "C" { +// Common AOTI type aliases +// Note: AOTITensorHandle is aliased to Tensor* for ExecutorTorch compatibility +using AOTITensorHandle = Tensor*; +using AOTIRuntimeError = Error; +using AOTITorchError = Error; + // Global storage for tensor metadata extern std::unordered_map> tensor_to_sizes; extern std::unordered_map> tensor_to_strides; @@ -48,10 +62,6 @@ AOTITorchError aoti_torch_get_storage_size( AOTITensorHandle tensor, int64_t* ret_size); -AOTITorchError aoti_torch_get_device_type( - AOTITensorHandle tensor, - int32_t* ret_device_type); - AOTITorchError aoti_torch_get_device_index( AOTITensorHandle tensor, int32_t* ret_device_index); @@ -60,7 +70,6 @@ AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim); // Utility functions for device and layout information int32_t aoti_torch_device_type_cpu(); -int32_t aoti_torch_device_type_cuda(); int32_t aoti_torch_layout_strided(); int32_t aoti_torch_dtype_float32(); @@ -68,8 +77,9 @@ int32_t aoti_torch_dtype_float32(); int32_t aoti_torch_grad_mode_is_enabled(); void aoti_torch_grad_mode_set_enabled(bool 
enabled); -// Cleanup function for clearing global state +// Cleanup functions for clearing global state void cleanup_tensor_metadata(); +void cleanup_aoti_tensor_output(); } // extern "C" diff --git a/backends/aoti/cuda/CMakeLists.txt b/backends/aoti/cuda/CMakeLists.txt new file mode 100644 index 00000000000..971d92bd044 --- /dev/null +++ b/backends/aoti/cuda/CMakeLists.txt @@ -0,0 +1,70 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI CUDA backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) +endif() + +find_package(CUDAToolkit REQUIRED) + +# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +find_package_torch() + +# CUDA-specific AOTI functionality +set(_aoti_cuda_sources + runtime/cuda_backend.cpp + runtime/shims/memory.cpp + runtime/shims/tensor_attribute.cpp + runtime/utils.cpp) +add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +target_include_directories( + aoti_cuda + PUBLIC + ${CUDAToolkit_INCLUDE_DIRS} + $ + $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} +) +target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC) +# Ensure symbols are exported properly +target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic) + +# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries +target_link_libraries( + aoti_cuda + PUBLIC + aoti_common + CUDA::cudart + ${CMAKE_DL_LIBS} + # Link PyTorch libraries for AOTI CUDA functions + ${TORCH_LIBRARIES} +) +# If you need other CUDA libraries, link them similarly: +# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) +executorch_target_link_options_shared_lib(aoti_cuda) + + +install( + TARGETS aoti_cuda + EXPORT ExecuTorchTargets + DESTINATION lib +) diff --git a/backends/aoti/cuda/TARGETS b/backends/aoti/cuda/TARGETS new file mode 100644 index 00000000000..77871de4469 --- /dev/null +++ b/backends/aoti/cuda/TARGETS @@ -0,0 +1,3 @@ +load("targets.bzl", "define_common_targets") + +define_common_targets() diff --git a/backends/aoti/cuda/__init__.py b/backends/aoti/cuda/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/aoti/cuda/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
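
Before the renames below, here is a minimal sketch of how the relocated CUDA classes would be driven end to end, assuming the usual export / to_edge / to_backend flow; the toy module and the empty compile-spec list are placeholders, not part of this patch.

import torch
from executorch.backends.aoti.cuda.cuda_partitioner import CudaPartitioner
from executorch.exir import to_edge

class Mul(torch.nn.Module):
    def forward(self, x, y):
        return x * y

ep = torch.export.export(Mul(), (torch.randn(4), torch.randn(4)))
edge = to_edge(ep)
# CudaPartitioner tags every call_function node with "tag0", so the whole
# graph lands in a single partition delegated to CudaBackend.
edge = edge.to_backend(CudaPartitioner([]))
et_program = edge.to_executorch()
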
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/cuda/cuda_backend.py similarity index 99% rename from backends/aoti/aoti_backend.py rename to backends/aoti/cuda/cuda_backend.py index 21fcd5d86f0..99599de6b6c 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/cuda/cuda_backend.py @@ -66,7 +66,7 @@ def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels( @final -class AotiBackend(BackendDetails): +class CudaBackend(BackendDetails): @staticmethod def preprocess( edge_program: ExportedProgram, diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/cuda/cuda_partitioner.py similarity index 91% rename from backends/aoti/aoti_partitioner.py rename to backends/aoti/cuda/cuda_partitioner.py index 6b9089e5915..f48759afa80 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/cuda/cuda_partitioner.py @@ -9,7 +9,7 @@ from typing import Callable, Dict, final, List, Optional, Tuple import torch -from executorch.backends.aoti.aoti_backend import AotiBackend # usort: skip +from executorch.backends.aoti.cuda.cuda_backend import CudaBackend # usort: skip from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.backend.partitioner import ( DelegationSpec, @@ -21,9 +21,9 @@ @final -class AotiPartitioner(Partitioner): +class CudaPartitioner(Partitioner): def __init__(self, compile_spec: List[CompileSpec]) -> None: - self.delegation_spec = DelegationSpec(AotiBackend.__name__, compile_spec) + self.delegation_spec = DelegationSpec(CudaBackend.__name__, compile_spec) def partition(self, exported_program: ExportedProgram) -> PartitionResult: """ diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/cuda/runtime/cuda_backend.cpp similarity index 93% rename from backends/aoti/runtime/aoti_backend.cpp rename to backends/aoti/cuda/runtime/cuda_backend.cpp index 242ee24e1d9..b6d9bb7d75d 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/cuda/runtime/cuda_backend.cpp @@ -25,10 +25,8 @@ #include // Include our shim layer headers -#include "aoti_model_container.h" -#include "shims/memory.h" -#include "shims/tensor_attribute.h" -#include "shims/utils.h" +#include "../../aoti_model_container.h" +#include "../../common_shims.h" namespace executorch { namespace backends { @@ -52,11 +50,11 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -class AOTIBackend final : public ::executorch::runtime::BackendInterface { +class CudaBackend final : public ::executorch::runtime::BackendInterface { public: // Once in program - AOTIBackend() { - ET_LOG(Info, "AOTIBackend ctor"); + CudaBackend() { + ET_LOG(Info, "CudaBackend ctor"); } bool is_available() const override { @@ -172,11 +170,11 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { BackendExecutionContext& context, DelegateHandle* handle_, Span args) const override { - ET_LOG(Debug, "AOTIBackend execute"); + ET_LOG(Debug, "CudaBackend execute"); AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - ET_LOG(Debug, "AOTIBackend Handle generated"); + ET_LOG(Debug, "CudaBackend Handle generated"); size_t n_inputs; AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs); @@ -185,7 +183,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { AOTInductorModelContainerGetNumOutputs( handle->container_handle, &n_outputs); - ET_LOG(Debug, "AOTIBackend n_outputs %zd generated", n_outputs); + ET_LOG(Debug, "CudaBackend n_outputs %zd 
generated", n_outputs); if (n_inputs + n_outputs != args.size()) { ET_LOG( @@ -211,7 +209,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { std::vector gpu_outputs( n_outputs); // GPU tensors for kernel output - ET_LOG(Debug, "AOTIBackend input/output vectors generated"); + ET_LOG(Debug, "CudaBackend input/output vectors generated"); // Process input tensors: ExecutorTorch provides CPU tensors, create GPU // copies @@ -255,7 +253,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i); } - ET_LOG(Debug, "AOTIBackend GPU inputs generated"); + ET_LOG(Debug, "CudaBackend GPU inputs generated"); // Process output tensors: create GPU counterparts for ExecutorTorch CPU // tensors @@ -287,7 +285,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "Created GPU output tensor %d", i); } - ET_LOG(Debug, "AOTIBackend output generated"); + ET_LOG(Debug, "CudaBackend output generated"); // Run AOTI container with GPU tensors AOTIRuntimeError error = AOTInductorModelContainerRun( @@ -307,7 +305,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { return Error::Internal; } - ET_LOG(Debug, "AOTIBackend running done"); + ET_LOG(Debug, "CudaBackend running done"); // Copy GPU output results back to CPU output tensors for (int i = 0; i < n_outputs; i++) { @@ -332,7 +330,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { aoti_torch_delete_tensor_object(gpu_outputs[i]); } - ET_LOG(Debug, "AOTIBackend execution completed successfully"); + ET_LOG(Debug, "CudaBackend execution completed successfully"); return Error::Ok; } @@ -360,16 +358,15 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { free(handle); cleanup_memory(); cleanup_tensor_metadata(); - cleanup_aoti_tensor_output(); - ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); + ET_LOG(Debug, "CudaBackend handle %p destroy", handle_); } }; } // namespace aoti namespace { -auto cls = aoti::AOTIBackend(); -executorch::runtime::Backend backend{"AotiBackend", &cls}; +auto cls = aoti::CudaBackend(); +executorch::runtime::Backend backend{"CudaBackend", &cls}; static executorch::runtime::Error success_with_compiler = register_backend(backend); } // namespace diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/cuda/runtime/shims/memory.cpp similarity index 98% rename from backends/aoti/runtime/shims/memory.cpp rename to backends/aoti/cuda/runtime/shims/memory.cpp index 99990f4aeab..7ca83973b8c 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/cuda/runtime/shims/memory.cpp @@ -6,7 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "memory.h" +#include +#include +#include +#include #include #include #include @@ -18,8 +21,6 @@ #include #include #include -#include "tensor_attribute.h" -#include "utils.h" namespace executorch { namespace backends { diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/cuda/runtime/shims/memory.h similarity index 97% rename from backends/aoti/runtime/shims/memory.h rename to backends/aoti/cuda/runtime/shims/memory.h index 8e8e2910b03..41c03a1f552 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/cuda/runtime/shims/memory.h @@ -9,12 +9,12 @@ #pragma once #include +#include #include #include #include #include #include -#include "types.h" namespace executorch { namespace backends { diff --git a/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp b/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp new file mode 100644 index 00000000000..cb564f10129 --- /dev/null +++ b/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace aoti { + +extern "C" { + +// Device type functions for tensor attributes +AOTITorchError aoti_torch_get_device_type( + AOTITensorHandle tensor, + int32_t* ret_device_type) { + // All tensors in aoti-cuda delegate are on CUDA + *ret_device_type = aoti_torch_device_type_cuda(); + return Error::Ok; +} + +// Device type constants +__attribute__((__visibility__("default"))) int32_t +aoti_torch_device_type_cuda() { + // Let's say cuda is 1 for ET as well + return 1; +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/shims/types.h b/backends/aoti/cuda/runtime/shims/tensor_attribute.h similarity index 74% rename from backends/aoti/runtime/shims/types.h rename to backends/aoti/cuda/runtime/shims/tensor_attribute.h index 1bcae2058ca..d8866c19f24 100644 --- a/backends/aoti/runtime/shims/types.h +++ b/backends/aoti/cuda/runtime/shims/tensor_attribute.h @@ -8,7 +8,6 @@ #pragma once -#include #include #include #include @@ -24,13 +23,19 @@ using executorch::runtime::etensor::Tensor; extern "C" { // Common AOTI type aliases -// Note: AOTITensorHandle is aliased to Tensor* for ExecutorTorch compatibility using AOTITensorHandle = Tensor*; -using AOTIRuntimeError = Error; using AOTITorchError = Error; +// Device type functions for tensor attributes +AOTITorchError aoti_torch_get_device_type( + AOTITensorHandle tensor, + int32_t* ret_device_type); + +// Device type constants +int32_t aoti_torch_device_type_cuda(); + } // extern "C" } // namespace aoti } // namespace backends -} // namespace executorch +} // namespace executorch \ No newline at end of file diff --git a/backends/aoti/cuda/runtime/utils.cpp b/backends/aoti/cuda/runtime/utils.cpp new file mode 100644 index 00000000000..aee585f3a2e --- /dev/null +++ b/backends/aoti/cuda/runtime/utils.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "utils.h" +#include + +namespace executorch { +namespace backends { +namespace aoti { + +// Enum for supported data types in et-cuda backend +enum class SupportedDTypes : int32_t { + FLOAT32 = 6, // PyTorch's float32 dtype code + + // BOOL = 11, // PyTorch's bool dtype code + // UINT8 = 1, // PyTorch's uint8 dtype code + // INT8 = 2, // PyTorch's int8 dtype code + // INT16 = 3, // PyTorch's int16 dtype code + // INT32 = 4, // PyTorch's int32 dtype code + // INT64 = 5, // PyTorch's int64 dtype code + // FLOAT16 = 7, // PyTorch's float16 dtype code + // FLOAT64 = 8, // PyTorch's float64 dtype code + // BFLOAT16 = 15 // PyTorch's bfloat16 dtype code +}; + +extern "C" { + +// Helper function to check if a dtype is supported in ET CUDA backend +bool is_dtype_supported_in_et_cuda(int32_t dtype) { + switch (dtype) { + case static_cast(SupportedDTypes::FLOAT32): + return true; + // case static_cast(SupportedDTypes::BOOL): + // case static_cast(SupportedDTypes::UINT8): + // case static_cast(SupportedDTypes::INT8): + // case static_cast(SupportedDTypes::INT16): + // case static_cast(SupportedDTypes::INT32): + // case static_cast(SupportedDTypes::INT64): + // case static_cast(SupportedDTypes::FLOAT16): + // case static_cast(SupportedDTypes::FLOAT64): + // case static_cast(SupportedDTypes::BFLOAT16): + // return true; + default: + return false; + } +} + +// Dtype validation utility function +AOTITorchError validate_dtype(int32_t dtype) { + if (is_dtype_supported_in_et_cuda(dtype)) { + return Error::Ok; + } + + ET_LOG( + Error, + "Unsupported dtype: %d. Supported dtypes: %d (float32)", + dtype, + static_cast(SupportedDTypes::FLOAT32)); + return Error::InvalidArgument; +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/cuda/runtime/utils.h b/backends/aoti/cuda/runtime/utils.h new file mode 100644 index 00000000000..c941917577c --- /dev/null +++ b/backends/aoti/cuda/runtime/utils.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// Common using declarations for ExecuTorch types
+using executorch::runtime::Error;
+
+extern "C" {
+
+// Common AOTI type aliases
+using AOTITorchError = Error;
+
+// Helper function to check if a dtype is supported in ET CUDA backend
+bool is_dtype_supported_in_et_cuda(int32_t dtype);
+
+// Dtype validation utility function
+AOTITorchError validate_dtype(int32_t dtype);
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/cuda/targets.bzl b/backends/aoti/cuda/targets.bzl
new file mode 100644
index 00000000000..be692cbb5a2
--- /dev/null
+++ b/backends/aoti/cuda/targets.bzl
@@ -0,0 +1,28 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    # CUDA-specific AOTI functionality
+    runtime.cxx_library(
+        name = "aoti_cuda",
+        srcs = [
+            "runtime/cuda_backend.cpp",
+            "runtime/shims/memory.cpp",
+            "runtime/shims/tensor_attribute.cpp",
+            "runtime/utils.cpp",
+        ],
+        headers = [
+            "runtime/shims/memory.h",
+            "runtime/shims/tensor_attribute.h",
+            "runtime/utils.h",
+        ],
+        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+        link_whole = True,
+        supports_python_dlopen = True,
+        # Constructor needed for backend registration.
+        compiler_flags = ["-Wno-global-constructors"],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        deps = [
+            "//executorch/backends/aoti:aoti_common",
+            "//caffe2/torch/csrc/inductor:aoti_torch_cuda",
+        ],
+    )
diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp
deleted file mode 100644
index 8b1734082bd..00000000000
--- a/backends/aoti/runtime/shims/utils.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include "utils.h"
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace executorch {
-namespace backends {
-namespace aoti {
-
-namespace internal {
-// Constants for file operations
-const char* const TENSOR_OUTPUT_FILENAME =
-    "/home/gasoonjia/executorch/aoti_intermediate_output.txt";
-} // namespace internal
-
-extern "C" {
-
-// Function to cleanup the tensor output file (to be called from
-// aoti_backend.cpp)
-void cleanup_aoti_tensor_output() {
-  // No cleanup needed since file is opened and closed on each call
-}
-
-// Helper function to check if a dtype is supported
-bool is_dtype_supported_in_et_cuda(int32_t dtype) {
-  switch (dtype) {
-    case static_cast<int32_t>(SupportedDTypes::FLOAT32):
-      return true;
-    // case static_cast<int32_t>(SupportedDTypes::BOOL):
-    // case static_cast<int32_t>(SupportedDTypes::UINT8):
-    // case static_cast<int32_t>(SupportedDTypes::INT8):
-    // case static_cast<int32_t>(SupportedDTypes::INT16):
-    // case static_cast<int32_t>(SupportedDTypes::INT32):
-    // case static_cast<int32_t>(SupportedDTypes::INT64):
-    // case static_cast<int32_t>(SupportedDTypes::FLOAT16):
-    // case static_cast<int32_t>(SupportedDTypes::FLOAT64):
-    // case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
-    //   return true;
-    default:
-      return false;
-  }
-}
-
-// Map int32_t dtype to number of bytes per element (reusing ExecutorTorch's
-// elementSize function)
-size_t dtype_to_element_size(int32_t dtype) {
-  // First convert int32_t dtype to ExecutorTorch ScalarType, then use existing
-  // elementSize function
-  executorch::aten::ScalarType scalar_type = dtype_to_scalar_type(dtype);
-  if (scalar_type == executorch::aten::ScalarType::Undefined) {
-    ET_LOG(Error, "Unsupported dtype: %d for element size calculation", dtype);
-    return 0; // Return 0 to indicate error
-  }
-
-  // Reuse ExecutorTorch's existing elementSize function from scalar_type_util.h
-  return executorch::runtime::elementSize(scalar_type);
-}
-
-// Map int32_t dtype to ExecutorTorch ScalarType (robust version of hardcoded
-// ScalarType::Float)
-executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
-  // First check if the dtype is supported
-  if (!is_dtype_supported_in_et_cuda(dtype)) {
-    ET_LOG(Error, "Unsupported dtype: %d for ScalarType conversion", dtype);
-    return executorch::aten::ScalarType::Undefined;
-  }
-
-  // If supported, use switch to convert
-  switch (dtype) {
-    case static_cast<int32_t>(SupportedDTypes::FLOAT32):
-      return executorch::aten::ScalarType::Float;
-    default:
-      ET_LOG(
-          Error, "Unexpected error in dtype conversion for dtype: %d", dtype);
-      return executorch::aten::ScalarType::Undefined;
-  }
-}
-
-// Dtype validation utility function
-AOTITorchError validate_dtype(int32_t dtype) {
-  if (is_dtype_supported_in_et_cuda(dtype)) {
-    return Error::Ok;
-  }
-
-  ET_LOG(
-      Error,
-      "Unsupported dtype: %d. Supported dtypes: %d (float32)",
-      dtype,
-      static_cast<int32_t>(SupportedDTypes::FLOAT32));
-  return Error::InvalidArgument;
-}
-
-// Storage offset validation utility function
-AOTITorchError validate_storage_offset(int64_t storage_offset) {
-  // Storage offset must always be 0
-  if (storage_offset != 0) {
-    ET_LOG(
-        Error,
-        "Storage offset must be 0. Got storage_offset: %ld",
-        storage_offset);
-    return Error::InvalidArgument;
-  }
-  return Error::Ok;
-}
-
-} // extern "C"
-
-} // namespace aoti
-} // namespace backends
-} // namespace executorch
diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h
deleted file mode 100644
index a2af9e95e56..00000000000
--- a/backends/aoti/runtime/shims/utils.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include
-#include
-#include "types.h"
-
-namespace executorch {
-namespace backends {
-namespace aoti {
-
-// Enum for supported data types in et-cuda backend
-enum class SupportedDTypes : int32_t {
-  FLOAT32 = 6, // PyTorch's float32 dtype code
-
-  // BOOL = 11, // PyTorch's bool dtype code
-  // UINT8 = 1, // PyTorch's uint8 dtype code
-  // INT8 = 2, // PyTorch's int8 dtype code
-  // INT16 = 3, // PyTorch's int16 dtype code
-  // INT32 = 4, // PyTorch's int32 dtype code
-  // INT64 = 5, // PyTorch's int64 dtype code
-  // FLOAT16 = 7, // PyTorch's float16 dtype code
-  // FLOAT64 = 8, // PyTorch's float64 dtype code
-  // BFLOAT16 = 15 // PyTorch's bfloat16 dtype code
-};
-
-extern "C" {
-
-// Helper function to check if a dtype is supported
-bool is_dtype_supported_in_et_cuda(int32_t dtype);
-
-// Map int32_t dtype to number of bytes per element (reusing ExecutorTorch's
-// elementSize function)
-size_t dtype_to_element_size(int32_t dtype);
-
-// Map int32_t dtype to ExecutorTorch ScalarType (robust version of hardcoded
-// ScalarType::Float)
-executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype);
-
-// Cleanup function for tensor output file (called during backend destruction)
-void cleanup_aoti_tensor_output();
-
-// Dtype validation utility function
-AOTITorchError validate_dtype(int32_t dtype);
-
-// Storage offset validation utility function
-AOTITorchError validate_storage_offset(int64_t storage_offset);
-
-} // extern "C"
-
-} // namespace aoti
-} // namespace backends
-} // namespace executorch
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/targets.bzl
similarity index 68%
rename from backends/aoti/runtime/targets.bzl
rename to backends/aoti/targets.bzl
index d57a187366f..bd46550d81e 100644
--- a/backends/aoti/runtime/targets.bzl
+++ b/backends/aoti/targets.bzl
@@ -1,21 +1,18 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 def define_common_targets():
+    # Common AOTI functionality (non-CUDA)
     runtime.cxx_library(
-        name = "aoti_backend",
+        name = "aoti_common",
         srcs = [
-            "aoti_backend.cpp",
             "aoti_model_container.cpp",
-            "shims/memory.cpp",
-            "shims/tensor_attribute.cpp",
-            "shims/utils.cpp",
+            "common_shims.cpp",
+            "utils.cpp",
         ],
        headers = [
            "aoti_model_container.h",
-            "shims/memory.h",
-            "shims/tensor_attribute.h",
-            "shims/types.h",
-            "shims/utils.h",
+            "common_shims.h",
+            "utils.h",
        ],
        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
        link_whole = True,
@@ -27,6 +24,5 @@
             "//executorch/runtime/backend:interface",
             "//executorch/runtime/core:core",
             "//caffe2/torch/csrc/inductor:aoti_torch",
-            "//caffe2/torch/csrc/inductor:aoti_torch_cuda",
         ],
     )
diff --git a/backends/aoti/utils.cpp b/backends/aoti/utils.cpp
new file mode 100644
index 00000000000..95b4f0c4b4f
--- /dev/null
+++ b/backends/aoti/utils.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "utils.h"
+#include
+#include
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+extern "C" {
+
+// Map int32_t dtype to number of bytes per element (reusing ExecuTorch's
+// elementSize function)
+size_t dtype_to_element_size(int32_t dtype) {
+  // First convert the int32_t dtype to an ExecuTorch ScalarType, then use the
+  // existing elementSize function
+  executorch::aten::ScalarType scalar_type = dtype_to_scalar_type(dtype);
+  if (scalar_type == executorch::aten::ScalarType::Undefined) {
+    ET_LOG(Error, "Unsupported dtype: %d for element size calculation", dtype);
+    return 0; // Return 0 to indicate error
+  }
+
+  // Reuse ExecuTorch's existing elementSize function from scalar_type_util.h
+  return executorch::runtime::elementSize(scalar_type);
+}
+
+// Map int32_t dtype to ExecuTorch ScalarType (robust version of hardcoded
+// ScalarType::Float)
+executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
+  // Convert based on known PyTorch dtype codes (without CUDA-specific dependency)
+  switch (dtype) {
+    case 6: // PyTorch's float32 dtype code
+      return executorch::aten::ScalarType::Float;
+    // Future support for additional dtypes can be added here
+    // case 11: // PyTorch's bool dtype code
+    //   return executorch::aten::ScalarType::Bool;
+    // case 1: // PyTorch's uint8 dtype code
+    //   return executorch::aten::ScalarType::Byte;
+    // case 2: // PyTorch's int8 dtype code
+    //   return executorch::aten::ScalarType::Char;
+    // case 3: // PyTorch's int16 dtype code
+    //   return executorch::aten::ScalarType::Short;
+    // case 4: // PyTorch's int32 dtype code
+    //   return executorch::aten::ScalarType::Int;
+    // case 5: // PyTorch's int64 dtype code
+    //   return executorch::aten::ScalarType::Long;
+    // case 7: // PyTorch's float16 dtype code
+    //   return executorch::aten::ScalarType::Half;
+    // case 8: // PyTorch's float64 dtype code
+    //   return executorch::aten::ScalarType::Double;
+    // case 15: // PyTorch's bfloat16 dtype code
+    //   return executorch::aten::ScalarType::BFloat16;
+    default:
+      ET_LOG(Error, "Unsupported dtype: %d for ScalarType conversion", dtype);
+      return executorch::aten::ScalarType::Undefined;
+  }
+}
+
+// Storage offset validation utility function
+AOTITorchError validate_storage_offset(int64_t storage_offset) {
+  // Storage offset must always be 0
+  if (storage_offset != 0) {
+    ET_LOG(
+        Error,
+        "Storage offset must be 0. Got storage_offset: %ld",
+        storage_offset);
+    return Error::InvalidArgument;
+  }
+  return Error::Ok;
+}
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
\ No newline at end of file
diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
new file mode 100644
index 00000000000..3fb710a24d8
--- /dev/null
+++ b/backends/aoti/utils.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// Common using declarations for ExecuTorch types
+using executorch::runtime::Error;
+
+extern "C" {
+
+// Common AOTI type aliases
+using AOTITorchError = Error;
+
+// Map int32_t dtype to number of bytes per element (reusing ExecuTorch's
+// elementSize function)
+size_t dtype_to_element_size(int32_t dtype);
+
+// Map int32_t dtype to ExecuTorch ScalarType (robust version of hardcoded
+// ScalarType::Float)
+executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype);
+
+// Storage offset validation utility function
+AOTITorchError validate_storage_offset(int64_t storage_offset);
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
\ No newline at end of file
diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py
index 3430ad7a920..eb84d508c2c 100644
--- a/exir/emit/_emit_program.py
+++ b/exir/emit/_emit_program.py
@@ -156,7 +156,6 @@ def emit_program(
     instruction_id_to_num_outs_map = {}
     program_state = _ProgramState()
-
     # emit each entry point in order according to name.
     for name, exported_program in sorted(methods.items()):
         # create empty state
diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh
index a971df35b13..dd4aeef1017 100644
--- a/export_and_run_aoti.sh
+++ b/export_and_run_aoti.sh
@@ -136,7 +136,7 @@ build_runtime() {
 
     if [[ "$DEBUG_MODE" == true ]]; then
         echo "Building with debug configuration..."
-        cmake -DEXECUTORCH_BUILD_AOTI=ON \
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
             -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
             -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
             -DEXECUTORCH_LOG_LEVEL=Debug \
@@ -146,7 +146,7 @@ build_runtime() {
             ..
     else
         echo "Building with release configuration..."
-        cmake -DEXECUTORCH_BUILD_AOTI=ON \
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
             -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
             -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
             -DEXECUTORCH_LOG_LEVEL=Info \
diff --git a/export_aoti.py b/export_aoti.py
index e644177568c..0fda74a04f7 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -23,7 +23,7 @@ from typing import Any, Dict, Tuple
 
 import torch
-from executorch.backends.aoti.aoti_partitioner import AotiPartitioner
+from executorch.backends.aoti.cuda.cuda_partitioner import CudaPartitioner
 
 # from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
 from executorch.exir import to_edge, to_edge_transform_and_lower
@@ -402,7 +402,7 @@ def export_model_to_et_aoti(
 
     # Q: maybe need to turn on fallback_random?
     edge_program = to_edge_transform_and_lower(
-        aten_dialect, partitioner=[AotiPartitioner([])]
+        aten_dialect, partitioner=[CudaPartitioner([])]
     )
 
     # edge_program = to_edge(aten_dialect)
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 6911aea3e9b..fb993f7d5f0 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -161,10 +161,9 @@ define_overridable_option(
 )
 
 define_overridable_option(
-  EXECUTORCH_BUILD_AOTI "Build the AOTI backend" BOOL OFF
+  EXECUTORCH_BUILD_CUDA "Build the AOTI CUDA backend" BOOL OFF
 )
 
-
 if(EXECUTORCH_BUILD_ARM_BAREMETAL)
   set(_default_executorch_build_pthreadpool OFF)
   set(_default_executorch_build_cpuinfo OFF)
@@ -323,7 +322,7 @@ check_required_options_on(
 )
 
 check_required_options_on(
-  IF_ON EXECUTORCH_BUILD_AOTI REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR
+  IF_ON EXECUTORCH_BUILD_CUDA REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR
 )
 
 check_conflicting_options_on(

From 2fe871ce5e1810f5ad36062170bb083cd808e656 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Sep 2025 10:51:44 -0700
Subject: [PATCH 48/50] code refactor to backend/cuda and backend/aoti

---
 CMakeLists.txt | 2 +-
 backends/aoti/aoti_model_container.h | 2 +-
 backends/aoti/common_shims.cpp | 1 -
 backends/aoti/utils.cpp | 3 ++-
 backends/aoti/utils.h | 2 +-
 backends/{aoti => }/cuda/CMakeLists.txt | 2 +-
 backends/{aoti => }/cuda/TARGETS | 0
 backends/{aoti => }/cuda/__init__.py | 0
 backends/{aoti => }/cuda/cuda_backend.py | 0
 backends/{aoti => }/cuda/cuda_partitioner.py | 0
 backends/{aoti => }/cuda/runtime/cuda_backend.cpp | 4 ++--
 backends/{aoti => }/cuda/runtime/shims/memory.cpp | 6 +++---
 backends/{aoti => }/cuda/runtime/shims/memory.h | 0
 backends/{aoti => }/cuda/runtime/shims/tensor_attribute.cpp | 2 +-
 backends/{aoti => }/cuda/runtime/shims/tensor_attribute.h | 0
 backends/{aoti => }/cuda/runtime/utils.cpp | 0
 backends/{aoti => }/cuda/runtime/utils.h | 0
 backends/{aoti => }/cuda/targets.bzl | 0
 18 files changed, 12 insertions(+), 12 deletions(-)
 rename backends/{aoti => }/cuda/CMakeLists.txt (96%)
 rename backends/{aoti => }/cuda/TARGETS (100%)
 rename backends/{aoti => }/cuda/__init__.py (100%)
 rename backends/{aoti => }/cuda/cuda_backend.py (100%)
 rename backends/{aoti => }/cuda/cuda_partitioner.py (100%)
 rename backends/{aoti => }/cuda/runtime/cuda_backend.cpp (99%)
 rename backends/{aoti => }/cuda/runtime/shims/memory.cpp (99%)
 rename backends/{aoti => }/cuda/runtime/shims/memory.h (100%)
 rename backends/{aoti => }/cuda/runtime/shims/tensor_attribute.cpp (91%)
 rename backends/{aoti => }/cuda/runtime/shims/tensor_attribute.h (100%)
 rename backends/{aoti => }/cuda/runtime/utils.cpp (100%)
 rename backends/{aoti => }/cuda/runtime/utils.h (100%)
 rename backends/{aoti => }/cuda/targets.bzl (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21ec1ba8e7e..586f1b1128f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -595,7 +595,7 @@ if(EXECUTORCH_BUILD_CUDA)
   # Build common AOTI functionality (required for CUDA)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
   # Build CUDA-specific AOTI functionality
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti/cuda)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
   # Add aoti_cuda to backends - it already depends on aoti_common
   list(APPEND _executorch_backends aoti_cuda)
 endif()
diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h
index d5cae26cd05..e8bc253d9c0 100644
--- a/backends/aoti/aoti_model_container.h
+++ b/backends/aoti/aoti_model_container.h
@@ -8,9 +8,9 @@
 
 #pragma once
 
+#include
 #include
 #include
-#include "cuda/runtime/shims/memory.h"
 
 namespace executorch {
 namespace backends {
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index fbc596ce8b0..97a0478ba52 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -106,7 +106,6 @@ AOTITorchError aoti_torch_get_storage_size(
   throw std::runtime_error("Cannot get storage size on ETensor");
 }
 
-
 AOTITorchError aoti_torch_get_device_index(
     AOTITensorHandle tensor,
     int32_t* ret_device_index) {
diff --git a/backends/aoti/utils.cpp b/backends/aoti/utils.cpp
index 95b4f0c4b4f..68c28eed265 100644
--- a/backends/aoti/utils.cpp
+++ b/backends/aoti/utils.cpp
@@ -34,7 +34,8 @@ size_t dtype_to_element_size(int32_t dtype) {
 // Map int32_t dtype to ExecuTorch ScalarType (robust version of hardcoded
 // ScalarType::Float)
 executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
-  // Convert based on known PyTorch dtype codes (without CUDA-specific dependency)
+  // Convert based on known PyTorch dtype codes (without CUDA-specific
+  // dependency)
   switch (dtype) {
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
index 3fb710a24d8..828f15ee1a4 100644
--- a/backends/aoti/utils.h
+++ b/backends/aoti/utils.h
@@ -10,8 +10,8 @@
 
 #include
 #include
-#include
 #include
+#include
 
 namespace executorch {
 namespace backends {
diff --git a/backends/aoti/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
similarity index 96%
rename from backends/aoti/cuda/CMakeLists.txt
rename to backends/cuda/CMakeLists.txt
index 971d92bd044..ef6a4ddb8bd 100644
--- a/backends/aoti/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -19,7 +19,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Source root directory for executorch.
 if(NOT EXECUTORCH_ROOT)
-  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 endif()
 
 find_package(CUDAToolkit REQUIRED)
diff --git a/backends/aoti/cuda/TARGETS b/backends/cuda/TARGETS
similarity index 100%
rename from backends/aoti/cuda/TARGETS
rename to backends/cuda/TARGETS
diff --git a/backends/aoti/cuda/__init__.py b/backends/cuda/__init__.py
similarity index 100%
rename from backends/aoti/cuda/__init__.py
rename to backends/cuda/__init__.py
diff --git a/backends/aoti/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
similarity index 100%
rename from backends/aoti/cuda/cuda_backend.py
rename to backends/cuda/cuda_backend.py
diff --git a/backends/aoti/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
similarity index 100%
rename from backends/aoti/cuda/cuda_partitioner.py
rename to backends/cuda/cuda_partitioner.py
diff --git a/backends/aoti/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
similarity index 99%
rename from backends/aoti/cuda/runtime/cuda_backend.cpp
rename to backends/cuda/runtime/cuda_backend.cpp
index b6d9bb7d75d..6cd20537e80 100644
--- a/backends/aoti/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -25,8 +25,8 @@
 #include
 
 // Include our shim layer headers
-#include "../../aoti_model_container.h"
-#include "../../common_shims.h"
+#include
+#include
 
 namespace executorch {
 namespace backends {
diff --git a/backends/aoti/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
similarity index 99%
rename from backends/aoti/cuda/runtime/shims/memory.cpp
rename to backends/cuda/runtime/shims/memory.cpp
index 7ca83973b8c..4518b359646 100644
--- a/backends/aoti/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -6,9 +6,9 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include
-#include
-#include
+#include
+#include
+#include
 #include
 #include
 #include
diff --git a/backends/aoti/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h
similarity index 100%
rename from backends/aoti/cuda/runtime/shims/memory.h
rename to backends/cuda/runtime/shims/memory.h
diff --git a/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp b/backends/cuda/runtime/shims/tensor_attribute.cpp
similarity index 91%
rename from backends/aoti/cuda/runtime/shims/tensor_attribute.cpp
rename to backends/cuda/runtime/shims/tensor_attribute.cpp
index cb564f10129..789c16d7555 100644
--- a/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp
+++ b/backends/cuda/runtime/shims/tensor_attribute.cpp
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include
+#include
 
 namespace executorch {
 namespace backends {
diff --git a/backends/aoti/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
similarity index 100%
rename from backends/aoti/cuda/runtime/shims/tensor_attribute.h
rename to backends/cuda/runtime/shims/tensor_attribute.h
diff --git a/backends/aoti/cuda/runtime/utils.cpp b/backends/cuda/runtime/utils.cpp
similarity index 100%
rename from backends/aoti/cuda/runtime/utils.cpp
rename to backends/cuda/runtime/utils.cpp
diff --git a/backends/aoti/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h
similarity index 100%
rename from backends/aoti/cuda/runtime/utils.h
rename to backends/cuda/runtime/utils.h
diff --git a/backends/aoti/cuda/targets.bzl b/backends/cuda/targets.bzl
similarity index 100%
rename from backends/aoti/cuda/targets.bzl
rename to backends/cuda/targets.bzl

From 7542caec63bf7ada9d21933e611069ca45de6323 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Sep 2025 12:15:01 -0700
Subject: [PATCH 49/50] solve cuda backend dependency issue

---
 backends/cuda/cuda_partitioner.py | 2 +-
 export_aoti.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index f48759afa80..227d13ba093 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -9,7 +9,7 @@ from typing import Callable, Dict, final, List, Optional, Tuple
 
 import torch
-from executorch.backends.aoti.cuda.cuda_backend import CudaBackend  # usort: skip
+from executorch.backends.cuda.cuda_backend import CudaBackend  # usort: skip
 
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import (
     DelegationSpec,
diff --git a/export_aoti.py b/export_aoti.py
index 0fda74a04f7..d0bf916f387 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -23,7 +23,7 @@ from typing import Any, Dict, Tuple
 
 import torch
-from executorch.backends.aoti.cuda.cuda_partitioner import CudaPartitioner
+from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 
 # from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
 from executorch.exir import to_edge, to_edge_transform_and_lower

From f93d194d52dc2ae443e1f3a586304c0e19fc4d31 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Sep 2025 15:51:21 -0700
Subject: [PATCH 50/50] add cuda export ci

---
 .ci/scripts/test-cuda-export-aoti.sh | 105 +++++++++++
 .ci/scripts/test_cuda_export_aoti.py | 228 ++++++++++++++++++++++++
 .github/workflows/test-backend-cuda.yml | 68 +++++++
 3 files changed, 401 insertions(+)
 create mode 100755 .ci/scripts/test-cuda-export-aoti.sh
 create mode 100755 .ci/scripts/test_cuda_export_aoti.py
 create mode 100644 .github/workflows/test-backend-cuda.yml

diff --git a/.ci/scripts/test-cuda-export-aoti.sh b/.ci/scripts/test-cuda-export-aoti.sh
new file mode 100755
index 00000000000..6ea701b8f4b
--- /dev/null
+++ b/.ci/scripts/test-cuda-export-aoti.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+CUDA_VERSION=${1:-"12.6"}
+
+echo "=== Testing ExecuTorch CUDA AOTI Export ${CUDA_VERSION} ==="
+
+# Function to test CUDA AOTI export functionality
+test_cuda_aoti_export() {
+    local cuda_version=$1
+
+    echo "Testing CUDA AOTI export with CUDA ${cuda_version} support..."
+
+    # Check available resources before starting
+    echo "=== System Information ==="
+    echo "Available memory: $(free -h | grep Mem | awk '{print $2}')"
+    echo "Available disk space: $(df -h . | tail -1 | awk '{print $4}')"
+    echo "CPU cores: $(nproc)"
+    echo "CUDA version check:"
+    nvcc --version || echo "nvcc not found"
+    nvidia-smi || echo "nvidia-smi not found"
+
+    # Set up environment for CUDA builds
+    export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
+
+    echo "=== Installing ExecuTorch with CUDA support ==="
+    # Install ExecuTorch with CUDA support, with timeout and error handling
+    timeout 5400 ./install_executorch.sh || {
+        local exit_code=$?
+        echo "ERROR: install_executorch.sh failed with exit code: $exit_code"
+        if [ $exit_code -eq 124 ]; then
+            echo "ERROR: Installation timed out after 90 minutes"
+        fi
+        exit $exit_code
+    }
+
+    echo "SUCCESS: ExecuTorch CUDA installation completed"
+
+    # Verify the installation
+    echo "=== Verifying ExecuTorch CUDA Installation ==="
+
+    # Test that ExecuTorch was built successfully
+    python -c "
+import executorch
+print('SUCCESS: ExecuTorch imported successfully')
+"
+
+    # Test CUDA availability and show details
+    python -c "
+try:
+    import torch
+    print('INFO: PyTorch version:', torch.__version__)
+    print('INFO: CUDA available:', torch.cuda.is_available())
+
+    if torch.cuda.is_available():
+        print('SUCCESS: CUDA is available for ExecuTorch')
+        print('INFO: CUDA version:', torch.version.cuda)
+        print('INFO: GPU device count:', torch.cuda.device_count())
+        print('INFO: Current GPU device:', torch.cuda.current_device())
+        print('INFO: GPU device name:', torch.cuda.get_device_name())
+
+        # Test basic CUDA tensor operation
+        device = torch.device('cuda')
+        x = torch.randn(10, 10).to(device)
+        y = torch.randn(10, 10).to(device)
+        z = torch.mm(x, y)
+        print('SUCCESS: CUDA tensor operation completed on device:', z.device)
+        print('INFO: Result tensor shape:', z.shape)
+
+        print('SUCCESS: ExecuTorch CUDA integration verified')
+    else:
+        print('WARNING: CUDA not detected, but ExecuTorch built successfully')
+        exit(1)
+except Exception as e:
+    print('ERROR: ExecuTorch CUDA test failed:', e)
+    exit(1)
+"
+
+    echo "=== Running CUDA AOTI Export Tests ==="
+    # Run the CUDA AOTI export tests using the Python script
+    python .ci/scripts/test_cuda_export_aoti.py \
+        --models linear conv2d add resnet18 \
+        --export-mode export_aoti_only \
+        --timeout 600 \
+        --cleanup
+
+    echo "SUCCESS: ExecuTorch CUDA AOTI export ${cuda_version} tests completed successfully"
+}
+
+# Main execution
+echo "Current working directory: $(pwd)"
+echo "Directory contents:"
+ls -la
+
+# Run the CUDA AOTI export test
+test_cuda_aoti_export "${CUDA_VERSION}"
diff --git a/.ci/scripts/test_cuda_export_aoti.py b/.ci/scripts/test_cuda_export_aoti.py
new file mode 100755
index 00000000000..3748dc5fe33
--- /dev/null
+++ b/.ci/scripts/test_cuda_export_aoti.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Test script for CUDA AOTI export functionality.
+This script tests basic CUDA export functionality for a subset of models:
+linear, conv2d, add, and resnet18.
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+from typing import List, Optional
+
+
+def run_command(
+    cmd: List[str], cwd: Optional[str] = None, timeout: int = 300
+) -> subprocess.CompletedProcess:
+    """Run a command with proper error handling and timeout."""
+    print(f"Running command: {' '.join(cmd)}")
+    if cwd:
+        print(f"Working directory: {cwd}")
+
+    try:
+        result = subprocess.run(
+            cmd,
+            cwd=cwd,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+            check=False,  # We'll handle the return code ourselves
+        )
+
+        if result.stdout:
+            print("STDOUT:")
+            print(result.stdout)
+        if result.stderr:
+            print("STDERR:")
+            print(result.stderr)
+
+        return result
+    except subprocess.TimeoutExpired as e:
+        print(f"ERROR: Command timed out after {timeout} seconds")
+        raise e
+    except Exception as e:
+        print(f"ERROR: Failed to run command: {e}")
+        raise e
+
+
+def test_cuda_export(
+    model_name: str, export_mode: str = "export_aoti_only", timeout: int = 300
+) -> bool:
+    """Test CUDA export for a specific model."""
+    print(f"\n{'='*60}")
+    print(f"Testing CUDA export for model: {model_name}")
+    print(f"Export mode: {export_mode}")
+    print(f"{'='*60}")
+
+    try:
+        # Run the export using export_aoti.py
+        cmd = ["python", "export_aoti.py", model_name]
+        if export_mode == "export_aoti_only":
+            cmd.append("--aoti_only")
+
+        result = run_command(cmd, timeout=timeout)
+
+        if result.returncode == 0:
+            print(f"SUCCESS: {model_name} export completed successfully")
+            return True
+        else:
+            print(
+                f"ERROR: {model_name} export failed with return code {result.returncode}"
+            )
+            return False
+
+    except subprocess.TimeoutExpired:
+        print(f"ERROR: {model_name} export timed out after {timeout} seconds")
+        return False
+    except Exception as e:
+        print(f"ERROR: {model_name} export failed with exception: {e}")
+        return False
+
+
+def cleanup_temp_files():
+    """Clean up temporary files generated during export."""
+    print("Cleaning up temporary files...")
+
+    # List of file patterns to clean up
+    cleanup_patterns = [
+        "*.cubin",
+        "*.pte",
+        "*.so",
+        "*kernel_metadata.json",
+        "*kernel.cpp",
+        "*wrapper_metadata.json",
+        "*wrapper.cpp",
+        "*wrapper.json",
+        "aoti_intermediate_output.txt",
+    ]
+
+    # Remove files matching patterns
+    for pattern in cleanup_patterns:
+        try:
+            import glob
+
+            files = glob.glob(pattern)
+            for file in files:
+                if os.path.isfile(file):
+                    os.remove(file)
+                    print(f"Removed file: {file}")
+        except Exception as e:
+            print(f"Warning: Failed to remove {pattern}: {e}")
+
+    # Remove temporary directories created by wrappers
+    try:
+        import glob
+
+        for wrapper_file in glob.glob("*wrapper.cpp"):
+            basename = wrapper_file.replace("wrapper.cpp", "")
+            if os.path.isdir(basename):
+                import shutil
+
+                shutil.rmtree(basename)
+                print(f"Removed directory: {basename}")
+    except Exception as e:
+        print(f"Warning: Failed to remove wrapper directories: {e}")
+
+    print("Cleanup completed.")
+
+
+def main():
+    """Main function to test CUDA export for specified models."""
+    parser = argparse.ArgumentParser(
+        description="Test CUDA AOTI export functionality",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        default=["linear", "conv2d", "add", "resnet18"],
+        help="List of models to test (default: linear, conv2d, add, resnet18)",
+    )
+
+    parser.add_argument(
+        "--export-mode",
+        choices=["export_aoti_only", "full"],
+        default="export_aoti_only",
+        help="Export mode: export_aoti_only (AOTI only) or full (full pipeline)",
+    )
+
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=300,
+        help="Timeout for each model export in seconds (default: 300)",
+    )
+
+    parser.add_argument(
+        "--cleanup",
+        action="store_true",
+        default=True,
+        help="Clean up temporary files after testing (default: True)",
+    )
+
+    args = parser.parse_args()
+
+    print("CUDA AOTI Export Test")
+    print("=" * 60)
+    print(f"Models to test: {args.models}")
+    print(f"Export mode: {args.export_mode}")
+    print(f"Timeout per model: {args.timeout} seconds")
+    print(f"Cleanup enabled: {args.cleanup}")
+    print("=" * 60)
+
+    # Check if we're in the correct directory (should have export_aoti.py)
+    if not os.path.exists("export_aoti.py"):
+        print("ERROR: export_aoti.py not found in current directory")
+        print("Please run this script from the executorch root directory")
+        sys.exit(1)
+
+    # Test each model
+    successful_models = []
+    failed_models = []
+
+    for model in args.models:
+        # Clean up before each test
+        if args.cleanup:
+            cleanup_temp_files()
+
+        success = test_cuda_export(model, args.export_mode, args.timeout)
+
+        if success:
+            successful_models.append(model)
+        else:
+            failed_models.append(model)
+
+    # Final cleanup
+    if args.cleanup:
+        cleanup_temp_files()
+
+    # Print summary
+    print("\n" + "=" * 60)
+    print("CUDA AOTI Export Test Summary")
+    print("=" * 60)
+    print(f"Total models tested: {len(args.models)}")
+    print(f"Successful exports: {len(successful_models)}")
+    print(f"Failed exports: {len(failed_models)}")
+
+    if successful_models:
+        print(f"Successful models: {', '.join(successful_models)}")
+
+    if failed_models:
+        print(f"Failed models: {', '.join(failed_models)}")
+        print("\nERROR: One or more model exports failed!")
+        sys.exit(1)
+    else:
+        print("\nSUCCESS: All model exports completed successfully!")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/test-backend-cuda.yml b/.github/workflows/test-backend-cuda.yml
new file mode 100644
index 00000000000..bc8063b5b73
--- /dev/null
+++ b/.github/workflows/test-backend-cuda.yml
@@ -0,0 +1,68 @@
+# Test ExecuTorch CUDA AOTI export functionality.
+# This workflow tests whether ExecuTorch can successfully export models using CUDA AOTI
+# across different CUDA versions (12.6, 12.8, 12.9) for a subset of models:
+# linear, conv2d, add, and resnet18.
+#
+# The test focuses on export-only functionality and verifies that no errors are raised
+# during the AOTI export process.
+
+name: Test CUDA AOTI Export
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: false
+
+jobs:
+  test-cuda-aoti-export:
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda-version: ["12.6", "12.8", "12.9"]
+
+    name: test-executorch-cuda-aoti-export-${{ matrix.cuda-version }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      timeout: 120
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda-version }}
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        if [ -n "$CONDA_ENV" ]; then
+          conda activate "${CONDA_ENV}"
+        fi
+
+        # Test ExecuTorch CUDA AOTI export - ExecuTorch will automatically detect the CUDA
+        # version and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test-cuda-export-aoti.sh "${{ matrix.cuda-version }}"
+
+  # This job will fail if any of the CUDA AOTI export tests fail
+  check-all-cuda-aoti-exports:
+    needs: test-cuda-aoti-export
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Check if all CUDA AOTI export tests succeeded
+        run: |
+          if [[ "${{ needs.test-cuda-aoti-export.result }}" != "success" ]]; then
+            echo "ERROR: One or more ExecuTorch CUDA AOTI export tests failed!"
+            echo "CUDA AOTI export test results: ${{ needs.test-cuda-aoti-export.result }}"
+            exit 1
+          else
+            echo "SUCCESS: All ExecuTorch CUDA AOTI export tests (12.6, 12.8, 12.9) completed successfully!"
+          fi
\ No newline at end of file
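
For reference, the end-to-end lowering flow that this series enables mirrors export_aoti.py above. A minimal sketch follows; the TinyLinear module, input shapes, and output filename are illustrative placeholders rather than part of this series, and it assumes an ExecuTorch build installed with CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" on a CUDA-capable machine:

import torch
from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
from executorch.exir import to_edge_transform_and_lower


class TinyLinear(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = torch.nn.Linear(16, 8)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc(x)


model = TinyLinear().eval()
example_inputs = (torch.randn(2, 16),)

# torch.export captures the ATen-dialect graph; the partitioner then tags
# CUDA-supported subgraphs for delegation to the CudaBackend added above.
aten_dialect = torch.export.export(model, example_inputs)
edge_program = to_edge_transform_and_lower(
    aten_dialect, partitioner=[CudaPartitioner([])]
)
executorch_program = edge_program.to_executorch()

# Serialize the lowered program to a .pte file for the ExecuTorch runtime.
with open("tiny_linear_cuda.pte", "wb") as f:
    f.write(executorch_program.buffer)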
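The CI step in test-backend-cuda.yml can also be reproduced locally. A hypothetical sketch, assuming the executorch repo root as the working directory and an already-built CUDA install (the model list and timeout here are arbitrary; CMAKE_ARGS only matters if the script re-runs the install step):

import os
import subprocess

# Mirror the environment that test-cuda-export-aoti.sh sets for CUDA builds.
env = dict(os.environ)
env["CMAKE_ARGS"] = "-DEXECUTORCH_BUILD_CUDA=ON"

# Invoke the export-only test runner added in this series with a reduced
# model set; it exits non-zero if any model fails to export.
subprocess.run(
    [
        "python",
        ".ci/scripts/test_cuda_export_aoti.py",
        "--models", "linear", "add",
        "--export-mode", "export_aoti_only",
        "--timeout", "600",
    ],
    env=env,
    check=True,
)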