From 5f1c6d79468e0c834f9925b3b6554406e0ea7ef5 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 15:47:44 -0700 Subject: [PATCH 01/50] rebase to latest main --- install_requirements.py | 161 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 153 insertions(+), 8 deletions(-) diff --git a/install_requirements.py b/install_requirements.py index cbae175e276..844ada0c7da 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -59,8 +59,16 @@ def python_is_compatible(): # The pip repository that hosts nightly torch packages. -TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu" +# This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled. +TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly" +# Supported CUDA versions - modify this to add/remove supported versions +# Format: tuple of (major, minor) version numbers +SUPPORTED_CUDA_VERSIONS = [ + (12, 6), + (12, 8), + (12, 9), +] # Since ExecuTorch often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. @@ -71,7 +79,137 @@ def python_is_compatible(): # # NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt # by picking the hash from the same date in https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ -NIGHTLY_VERSION = "dev20250906" +# +# NOTE: If you're changing, make the corresponding supported CUDA versions in +# SUPPORTED_CUDA_VERSIONS above if needed. +NIGHTLY_VERSION = "dev20250915" + + +def _check_cuda_enabled(): + """Check if CUDA delegate is enabled via CMAKE_ARGS environment variable.""" + cmake_args = os.environ.get("CMAKE_ARGS", "") + return "-DEXECUTORCH_BUILD_CUDA=ON" in cmake_args + + +def _cuda_version_to_pytorch_suffix(major, minor): + """ + Generate PyTorch CUDA wheel suffix from CUDA version numbers. + + Args: + major: CUDA major version (e.g., 12) + minor: CUDA minor version (e.g., 6) + + Returns: + PyTorch wheel suffix string (e.g., "cu126") + """ + return f"cu{major}{minor}" + + +def _get_cuda_version(): + """ + Get the CUDA version installed on the system using nvcc command. + Returns a tuple (major, minor). + + Raises: + RuntimeError: if nvcc is not found or version cannot be parsed + """ + try: + # Get CUDA version from nvcc (CUDA compiler) + nvcc_result = subprocess.run( + ["nvcc", "--version"], capture_output=True, text=True, check=True + ) + # Parse nvcc output for CUDA version + # Output contains line like "Cuda compilation tools, release 12.6, V12.6.68" + match = re.search(r"release (\d+)\.(\d+)", nvcc_result.stdout) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + + # Check if the detected version is supported + if (major, minor) not in SUPPORTED_CUDA_VERSIONS: + available_versions = ", ".join( + [f"{maj}.{min}" for maj, min in SUPPORTED_CUDA_VERSIONS] + ) + raise RuntimeError( + f"Detected CUDA version {major}.{minor} is not supported. " + f"Only the following CUDA versions are supported: {available_versions}. " + f"Please install a supported CUDA version or try on CPU-only delegates." + ) + + return (major, minor) + else: + raise RuntimeError( + "CUDA delegate is enabled but could not parse CUDA version from nvcc output. " + "Please ensure CUDA is properly installed or try on CPU-only delegates." + ) + except FileNotFoundError: + raise RuntimeError( + "CUDA delegate is enabled but nvcc (CUDA compiler) is not found in PATH. " + "Please install CUDA toolkit or try on CPU-only delegates." 
+ ) + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"CUDA delegate is enabled but nvcc command failed with error: {e}. " + "Please ensure CUDA is properly installed or try on CPU-only delegates." + ) + + +def _get_pytorch_cuda_url(cuda_version): + """ + Get the appropriate PyTorch CUDA URL for the given CUDA version. + + Args: + cuda_version: tuple of (major, minor) version numbers + + Returns: + URL string for PyTorch CUDA packages + """ + major, minor = cuda_version + # Generate CUDA suffix (version validation is already done in _get_cuda_version) + cuda_suffix = _cuda_version_to_pytorch_suffix(major, minor) + + return f"{TORCH_NIGHTLY_URL_BASE}/{cuda_suffix}" + + +# url for the PyTorch ExecuTorch depending on, which will be set by _determine_torch_url(). +# please do not directly rely on it, but use _determine_torch_url() instead. +_torch_url = None + + +def _determine_torch_url(): + """ + Determine the appropriate PyTorch installation URL based on CUDA availability and CMAKE_ARGS. + Uses caching to avoid redundant CUDA detection and print statements. + + Returns: + URL string for PyTorch packages + """ + global _torch_url + + # Return cached URL if already determined + if _torch_url is not None: + return _torch_url + + # Check if CUDA delegate is enabled + if not _check_cuda_enabled(): + print("CUDA delegate not enabled, using CPU-only PyTorch") + _torch_url = f"{TORCH_NIGHTLY_URL_BASE}/cpu" + return _torch_url + + print("CUDA delegate enabled, detecting CUDA version...") + + # Get CUDA version + cuda_version = _get_cuda_version() + + major, minor = cuda_version + print(f"Detected CUDA version: {major}.{minor}") + + # Get appropriate PyTorch CUDA URL + torch_url = _get_pytorch_cuda_url(cuda_version) + print(f"Using PyTorch URL: {torch_url}") + + # Cache the result + _torch_url = torch_url + return torch_url def install_requirements(use_pytorch_nightly): @@ -84,12 +222,16 @@ def install_requirements(use_pytorch_nightly): ) sys.exit(1) + # Determine the appropriate PyTorch URL based on CUDA delegate status + torch_url = _determine_torch_url() + # pip packages needed by exir. TORCH_PACKAGE = [ # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note # that we don't need to set any version number there because they have already # been installed on CI before this step, so pip won't reinstall them - f"torch==2.9.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch", + f"torch==2.10.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch", + f"torchao==0.14.0{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchao", ] # Install the requirements for core ExecuTorch package. @@ -105,13 +247,13 @@ def install_requirements(use_pytorch_nightly): "requirements-dev.txt", *TORCH_PACKAGE, "--extra-index-url", - TORCH_NIGHTLY_URL, + torch_url, ], check=True, ) LOCAL_REQUIREMENTS = [ - "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. + # "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. ] + ( [ "extension/llm/tokenizers", # TODO(larryliu0820): Setup a pypi package for this. 
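
A quick way to eyeball the URL mapping introduced above (illustrative snippet, not part of the patch; it mirrors what _cuda_version_to_pytorch_suffix and _get_pytorch_cuda_url compute for the supported versions):

    for major, minor in [(12, 6), (12, 8), (12, 9)]:
        # e.g. (12, 6) -> https://download.pytorch.org/whl/nightly/cu126
        print(f"{major}.{minor} -> https://download.pytorch.org/whl/nightly/cu{major}{minor}")
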
@@ -147,10 +289,13 @@ def install_requirements(use_pytorch_nightly): def install_optional_example_requirements(use_pytorch_nightly): + # Determine the appropriate PyTorch URL based on CUDA delegate status + torch_url = _determine_torch_url() + print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ ( - f"torchvision==0.24.0.{NIGHTLY_VERSION}" + f"torchvision==0.25.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchvision" ), @@ -165,7 +310,7 @@ def install_optional_example_requirements(use_pytorch_nightly): "install", *DOMAIN_LIBRARIES, "--extra-index-url", - TORCH_NIGHTLY_URL, + torch_url, ], check=True, ) @@ -180,7 +325,7 @@ def install_optional_example_requirements(use_pytorch_nightly): "-r", "requirements-examples.txt", "--extra-index-url", - TORCH_NIGHTLY_URL, + torch_url, "--upgrade-strategy", "only-if-needed", ], From 94d400140b7c74f095ae7ff61dc79e5871c763c2 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 16:41:16 -0700 Subject: [PATCH 02/50] add github ci for gpu pt install check --- .github/workflows/test-cuda-builds.yml | 68 ++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 .github/workflows/test-cuda-builds.yml diff --git a/.github/workflows/test-cuda-builds.yml b/.github/workflows/test-cuda-builds.yml new file mode 100644 index 00000000000..eef3287a920 --- /dev/null +++ b/.github/workflows/test-cuda-builds.yml @@ -0,0 +1,68 @@ +# Test ExecutorTorch CUDA Build Compatibility +# This workflow tests whether ExecutorTorch can be successfully built with CUDA support +# across different CUDA versions (12.6, 12.8, 12.9) using the command: +# CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" ./install_executorch.sh +# +# Note: ExecutorTorch automatically detects the system CUDA version using nvcc and +# installs the appropriate PyTorch wheel. No manual CUDA/PyTorch installation needed. 
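+#
+# To reproduce a single matrix entry locally (an illustrative sketch, not part
+# of the CI contract; assumes an NVIDIA GPU and CUDA toolkit are present — the
+# script exports CMAKE_ARGS itself):
+#
+#   PYTHON_EXECUTABLE=python bash .ci/scripts/test-cuda-build.sh "12.6"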
+ +name: Test CUDA Builds + +on: + pull_request: + push: + branches: + - main + - release/* + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + test-cuda-builds: + strategy: + fail-fast: false + matrix: + cuda-version: ["12.6", "12.8", "12.9"] + + name: test-executorch-cuda-build-${{ matrix.cuda-version }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + timeout: 90 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: ${{ matrix.cuda-version }} + submodules: recursive + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + # The generic Linux job chooses to use base env, not the one setup by the image + CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") + if [ -n "$CONDA_ENV" ]; then + conda activate "${CONDA_ENV}" + fi + + # Test ExecutorTorch CUDA build - ExecutorTorch will automatically detect CUDA version + # and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" + PYTHON_EXECUTABLE=python bash .ci/scripts/test-cuda-build.sh "${{ matrix.cuda-version }}" + + # This job will fail if any of the CUDA versions fail + check-all-cuda-builds: + needs: test-cuda-builds + runs-on: ubuntu-latest + if: always() + steps: + - name: Check if all CUDA builds succeeded + run: | + if [[ "${{ needs.test-cuda-builds.result }}" != "success" ]]; then + echo "ERROR: One or more ExecutorTorch CUDA builds failed!" + echo "CUDA build results: ${{ needs.test-cuda-builds.result }}" + exit 1 + else + echo "SUCCESS: All ExecutorTorch CUDA builds (12.6, 12.8, 12.9) completed successfully!" + fi From a0332ffb10743e563019dacc6bf77fa9e475a486 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 16:41:39 -0700 Subject: [PATCH 03/50] add github ci for gpu pt install check --- .ci/scripts/test-cuda-build.sh | 84 ++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100755 .ci/scripts/test-cuda-build.sh diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh new file mode 100755 index 00000000000..8a9fedc4d7a --- /dev/null +++ b/.ci/scripts/test-cuda-build.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +set -exu + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" + +CUDA_VERSION=${1:-"12.6"} + +echo "=== Testing ExecutorTorch CUDA ${CUDA_VERSION} Build ===" + +# Function to build and test ExecutorTorch with CUDA support +test_executorch_cuda_build() { + local cuda_version=$1 + + echo "Building ExecutorTorch with CUDA ${cuda_version} support..." + echo "ExecutorTorch will automatically detect CUDA and install appropriate PyTorch wheel" + + # Set CMAKE_ARGS to enable CUDA build - ExecutorTorch will handle PyTorch installation automatically + export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" + + # Install ExecutorTorch with CUDA support - this will automatically: + # 1. Detect CUDA version using nvcc + # 2. Install appropriate PyTorch wheel for the detected CUDA version + # 3. 
Build ExecutorTorch with CUDA support + ./install_executorch.sh + + echo "SUCCESS: ExecutorTorch CUDA build completed" + + # Verify the installation + echo "=== Verifying ExecutorTorch CUDA Installation ===" + + # Test that ExecutorTorch was built successfully + python -c " +import executorch +print('SUCCESS: ExecutorTorch imported successfully') +" + + # Test CUDA availability and show details + python -c " +try: + import torch + print('INFO: PyTorch version:', torch.__version__) + print('INFO: CUDA available:', torch.cuda.is_available()) + + if torch.cuda.is_available(): + print('SUCCESS: CUDA is available for ExecutorTorch') + print('INFO: CUDA version:', torch.version.cuda) + print('INFO: GPU device count:', torch.cuda.device_count()) + print('INFO: Current GPU device:', torch.cuda.current_device()) + print('INFO: GPU device name:', torch.cuda.get_device_name()) + + # Test basic CUDA tensor operation + device = torch.device('cuda') + x = torch.randn(10, 10).to(device) + y = torch.randn(10, 10).to(device) + z = torch.mm(x, y) + print('SUCCESS: CUDA tensor operation completed on device:', z.device) + print('INFO: Result tensor shape:', z.shape) + + print('SUCCESS: ExecutorTorch CUDA integration verified') + else: + print('WARNING: CUDA not detected, but ExecutorTorch built successfully') + exit(1) +except Exception as e: + print('ERROR: ExecutorTorch CUDA test failed:', e) + exit(1) +" + + echo "SUCCESS: ExecutorTorch CUDA ${cuda_version} build and verification completed successfully" +} + +# Main execution +echo "Current working directory: $(pwd)" +echo "Directory contents:" +ls -la + +# Run the CUDA build test +test_executorch_cuda_build "${CUDA_VERSION}" From 433c239b9639963b37604e3d410a9c0965c281a4 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 16:58:32 -0700 Subject: [PATCH 04/50] recover torchao --- .ci/docker/ci_commit_pins/pytorch.txt | 2 +- install_requirements.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 8c9330d6f2c..e3a53c8bcb5 100644 --- a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -4d4abec80f03cd8fdefe1d9cb3a60d3690cd777e +53a2908a10f414a2f85caa06703a26a40e873869 diff --git a/install_requirements.py b/install_requirements.py index 844ada0c7da..32303f80842 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -231,7 +231,6 @@ def install_requirements(use_pytorch_nightly): # that we don't need to set any version number there because they have already # been installed on CI before this step, so pip won't reinstall them f"torch==2.10.0.{NIGHTLY_VERSION}" if use_pytorch_nightly else "torch", - f"torchao==0.14.0{NIGHTLY_VERSION}" if use_pytorch_nightly else "torchao", ] # Install the requirements for core ExecuTorch package. @@ -253,7 +252,7 @@ def install_requirements(use_pytorch_nightly): ) LOCAL_REQUIREMENTS = [ - # "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. + "third-party/ao", # We need the latest kernels for fast iteration, so not relying on pypi. ] + ( [ "extension/llm/tokenizers", # TODO(larryliu0820): Setup a pypi package for this. 
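
The detection added so far hinges on one contract: `nvcc --version` prints a "release <major>.<minor>" token. A standalone sanity check of that parse (illustrative sketch only, reusing the sample line already quoted in _get_cuda_version's comments):

    import re

    # Sample line from the nvcc output quoted in _get_cuda_version's comment.
    nvcc_stdout = "Cuda compilation tools, release 12.6, V12.6.68"
    match = re.search(r"release (\d+)\.(\d+)", nvcc_stdout)
    assert match is not None
    major, minor = int(match.group(1)), int(match.group(2))
    print((major, minor), f"cu{major}{minor}")  # -> (12, 6) cu126
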
From db7bef766e48b461c333438c1a95b1b79e103657 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 16 Sep 2025 21:38:34 -0700 Subject: [PATCH 05/50] solve lint issue --- .ci/scripts/test-cuda-build.sh | 24 +++++++++++++++++++----- .github/workflows/test-cuda-builds.yml | 2 +- install_requirements.py | 4 ++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/.ci/scripts/test-cuda-build.sh b/.ci/scripts/test-cuda-build.sh index 8a9fedc4d7a..a9f8e7ec14f 100755 --- a/.ci/scripts/test-cuda-build.sh +++ b/.ci/scripts/test-cuda-build.sh @@ -21,14 +21,28 @@ test_executorch_cuda_build() { echo "Building ExecutorTorch with CUDA ${cuda_version} support..." echo "ExecutorTorch will automatically detect CUDA and install appropriate PyTorch wheel" + # Check available resources before starting + echo "=== System Information ===" + echo "Available memory: $(free -h | grep Mem | awk '{print $2}')" + echo "Available disk space: $(df -h . | tail -1 | awk '{print $4}')" + echo "CPU cores: $(nproc)" + echo "CUDA version check:" + nvcc --version || echo "nvcc not found" + nvidia-smi || echo "nvidia-smi not found" + # Set CMAKE_ARGS to enable CUDA build - ExecutorTorch will handle PyTorch installation automatically export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" - # Install ExecutorTorch with CUDA support - this will automatically: - # 1. Detect CUDA version using nvcc - # 2. Install appropriate PyTorch wheel for the detected CUDA version - # 3. Build ExecutorTorch with CUDA support - ./install_executorch.sh + echo "=== Starting ExecutorTorch Installation ===" + # Install ExecutorTorch with CUDA support with timeout and error handling + timeout 5400 ./install_executorch.sh || { + local exit_code=$? + echo "ERROR: install_executorch.sh failed with exit code: $exit_code" + if [ $exit_code -eq 124 ]; then + echo "ERROR: Installation timed out after 90 minutes" + fi + exit $exit_code + } echo "SUCCESS: ExecutorTorch CUDA build completed" diff --git a/.github/workflows/test-cuda-builds.yml b/.github/workflows/test-cuda-builds.yml index eef3287a920..eff26e72c67 100644 --- a/.github/workflows/test-cuda-builds.yml +++ b/.github/workflows/test-cuda-builds.yml @@ -17,7 +17,7 @@ on: concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} - cancel-in-progress: true + cancel-in-progress: false jobs: test-cuda-builds: diff --git a/install_requirements.py b/install_requirements.py index 32303f80842..e5a7c29c482 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -172,7 +172,7 @@ def _get_pytorch_cuda_url(cuda_version): # url for the PyTorch ExecuTorch depending on, which will be set by _determine_torch_url(). # please do not directly rely on it, but use _determine_torch_url() instead. 
-_torch_url = None +_torch_url = "" def _determine_torch_url(): @@ -186,7 +186,7 @@ def _determine_torch_url(): global _torch_url # Return cached URL if already determined - if _torch_url is not None: + if _torch_url: return _torch_url # Check if CUDA delegate is enabled From 3d324c7c1799b87509b8ccc5af40825152f73bfa Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 17 Sep 2025 12:45:25 -0700 Subject: [PATCH 06/50] create install_utils.py for better structure --- install_requirements.py | 190 +------------------------------------ install_utils.py | 201 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+), 187 deletions(-) create mode 100644 install_utils.py diff --git a/install_requirements.py b/install_requirements.py index e5a7c29c482..409ed083970 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -7,56 +7,10 @@ import argparse import os -import platform -import re import subprocess import sys - -def python_is_compatible(): - # Scrape the version range from pyproject.toml, which should be in the current directory. - version_specifier = None - with open("pyproject.toml", "r") as file: - for line in file: - if line.startswith("requires-python"): - match = re.search(r'"([^"]*)"', line) - if match: - version_specifier = match.group(1) - break - - if not version_specifier: - print( - "WARNING: Skipping python version check: version range not found", - file=sys.stderr, - ) - return False - - # Install the packaging module if necessary. - try: - import packaging - except ImportError: - subprocess.run( - [sys.executable, "-m", "pip", "install", "packaging"], check=True - ) - # Compare the current python version to the range in version_specifier. Exits - # with status 1 if the version is not compatible, or with status 0 if the - # version is compatible or the logic itself fails. - try: - import packaging.specifiers - import packaging.version - - python_version = packaging.version.parse(platform.python_version()) - version_range = packaging.specifiers.SpecifierSet(version_specifier) - if python_version not in version_range: - print( - f'ERROR: ExecuTorch does not support python version {python_version}: must satisfy "{version_specifier}"', - file=sys.stderr, - ) - return False - except Exception as e: - print(f"WARNING: Skipping python version check: {e}", file=sys.stderr) - return True - +from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible # The pip repository that hosts nightly torch packages. # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled. @@ -85,133 +39,6 @@ def python_is_compatible(): NIGHTLY_VERSION = "dev20250915" -def _check_cuda_enabled(): - """Check if CUDA delegate is enabled via CMAKE_ARGS environment variable.""" - cmake_args = os.environ.get("CMAKE_ARGS", "") - return "-DEXECUTORCH_BUILD_CUDA=ON" in cmake_args - - -def _cuda_version_to_pytorch_suffix(major, minor): - """ - Generate PyTorch CUDA wheel suffix from CUDA version numbers. - - Args: - major: CUDA major version (e.g., 12) - minor: CUDA minor version (e.g., 6) - - Returns: - PyTorch wheel suffix string (e.g., "cu126") - """ - return f"cu{major}{minor}" - - -def _get_cuda_version(): - """ - Get the CUDA version installed on the system using nvcc command. - Returns a tuple (major, minor). 
- - Raises: - RuntimeError: if nvcc is not found or version cannot be parsed - """ - try: - # Get CUDA version from nvcc (CUDA compiler) - nvcc_result = subprocess.run( - ["nvcc", "--version"], capture_output=True, text=True, check=True - ) - # Parse nvcc output for CUDA version - # Output contains line like "Cuda compilation tools, release 12.6, V12.6.68" - match = re.search(r"release (\d+)\.(\d+)", nvcc_result.stdout) - if match: - major, minor = int(match.group(1)), int(match.group(2)) - - # Check if the detected version is supported - if (major, minor) not in SUPPORTED_CUDA_VERSIONS: - available_versions = ", ".join( - [f"{maj}.{min}" for maj, min in SUPPORTED_CUDA_VERSIONS] - ) - raise RuntimeError( - f"Detected CUDA version {major}.{minor} is not supported. " - f"Only the following CUDA versions are supported: {available_versions}. " - f"Please install a supported CUDA version or try on CPU-only delegates." - ) - - return (major, minor) - else: - raise RuntimeError( - "CUDA delegate is enabled but could not parse CUDA version from nvcc output. " - "Please ensure CUDA is properly installed or try on CPU-only delegates." - ) - except FileNotFoundError: - raise RuntimeError( - "CUDA delegate is enabled but nvcc (CUDA compiler) is not found in PATH. " - "Please install CUDA toolkit or try on CPU-only delegates." - ) - except subprocess.CalledProcessError as e: - raise RuntimeError( - f"CUDA delegate is enabled but nvcc command failed with error: {e}. " - "Please ensure CUDA is properly installed or try on CPU-only delegates." - ) - - -def _get_pytorch_cuda_url(cuda_version): - """ - Get the appropriate PyTorch CUDA URL for the given CUDA version. - - Args: - cuda_version: tuple of (major, minor) version numbers - - Returns: - URL string for PyTorch CUDA packages - """ - major, minor = cuda_version - # Generate CUDA suffix (version validation is already done in _get_cuda_version) - cuda_suffix = _cuda_version_to_pytorch_suffix(major, minor) - - return f"{TORCH_NIGHTLY_URL_BASE}/{cuda_suffix}" - - -# url for the PyTorch ExecuTorch depending on, which will be set by _determine_torch_url(). -# please do not directly rely on it, but use _determine_torch_url() instead. -_torch_url = "" - - -def _determine_torch_url(): - """ - Determine the appropriate PyTorch installation URL based on CUDA availability and CMAKE_ARGS. - Uses caching to avoid redundant CUDA detection and print statements. - - Returns: - URL string for PyTorch packages - """ - global _torch_url - - # Return cached URL if already determined - if _torch_url: - return _torch_url - - # Check if CUDA delegate is enabled - if not _check_cuda_enabled(): - print("CUDA delegate not enabled, using CPU-only PyTorch") - _torch_url = f"{TORCH_NIGHTLY_URL_BASE}/cpu" - return _torch_url - - print("CUDA delegate enabled, detecting CUDA version...") - - # Get CUDA version - cuda_version = _get_cuda_version() - - major, minor = cuda_version - print(f"Detected CUDA version: {major}.{minor}") - - # Get appropriate PyTorch CUDA URL - torch_url = _get_pytorch_cuda_url(cuda_version) - print(f"Using PyTorch URL: {torch_url}") - - # Cache the result - _torch_url = torch_url - return torch_url - - def install_requirements(use_pytorch_nightly): # Skip pip install on Intel macOS if using nightly. 
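     # (PyTorch stopped publishing macOS x86_64 wheels at 2.3.0, so a nightly
     # pin cannot resolve there; see is_intel_mac_os() in install_utils.py.)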
if use_pytorch_nightly and is_intel_mac_os(): @@ -223,7 +50,7 @@ def install_requirements(use_pytorch_nightly): sys.exit(1) # Determine the appropriate PyTorch URL based on CUDA delegate status - torch_url = _determine_torch_url() + torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE, SUPPORTED_CUDA_VERSIONS) # pip packages needed by exir. TORCH_PACKAGE = [ @@ -289,7 +116,7 @@ def install_requirements(use_pytorch_nightly): def install_optional_example_requirements(use_pytorch_nightly): # Determine the appropriate PyTorch URL based on CUDA delegate status - torch_url = _determine_torch_url() + torch_url = determine_torch_url(TORCH_NIGHTLY_URL_BASE, SUPPORTED_CUDA_VERSIONS) print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ @@ -332,17 +159,6 @@ def install_optional_example_requirements(use_pytorch_nightly): ) -# Prebuilt binaries for Intel-based macOS are no longer available on PyPI; users must compile from source. -# PyTorch stopped building macOS x86_64 binaries since version 2.3.0 (January 2024). -def is_intel_mac_os(): - # Returns True if running on Intel macOS. - return platform.system().lower() == "darwin" and platform.machine().lower() in ( - "x86", - "x86_64", - "i386", - ) - - def main(args): parser = argparse.ArgumentParser() parser.add_argument( diff --git a/install_utils.py b/install_utils.py new file mode 100644 index 00000000000..19da1b2193b --- /dev/null +++ b/install_utils.py @@ -0,0 +1,201 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# Copyright 2024-25 Arm Limited and/or its affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import os +import platform +import re +import subprocess + + +def _is_cuda_enabled(): + """Check if CUDA delegate is enabled via CMAKE_ARGS environment variable.""" + cmake_args = os.environ.get("CMAKE_ARGS", "") + return "-DEXECUTORCH_BUILD_CUDA=ON" in cmake_args + + +def _cuda_version_to_pytorch_suffix(major, minor): + """ + Generate PyTorch CUDA wheel suffix from CUDA version numbers. + + Args: + major: CUDA major version (e.g., 12) + minor: CUDA minor version (e.g., 6) + + Returns: + PyTorch wheel suffix string (e.g., "cu126") + """ + return f"cu{major}{minor}" + + +def _get_cuda_version(supported_cuda_versions): + """ + Get the CUDA version installed on the system using nvcc command. + Returns a tuple (major, minor). + + Args: + supported_cuda_versions: List of supported CUDA versions as tuples + + Raises: + RuntimeError: if nvcc is not found or version cannot be parsed + """ + try: + # Get CUDA version from nvcc (CUDA compiler) + nvcc_result = subprocess.run( + ["nvcc", "--version"], capture_output=True, text=True, check=True + ) + # Parse nvcc output for CUDA version + # Output contains line like "Cuda compilation tools, release 12.6, V12.6.68" + match = re.search(r"release (\d+)\.(\d+)", nvcc_result.stdout) + if match: + major, minor = int(match.group(1)), int(match.group(2)) + + # Check if the detected version is supported + if (major, minor) not in supported_cuda_versions: + available_versions = ", ".join( + [f"{maj}.{min}" for maj, min in supported_cuda_versions] + ) + raise RuntimeError( + f"Detected CUDA version {major}.{minor} is not supported. " + f"Only the following CUDA versions are supported: {available_versions}. " + f"Please install a supported CUDA version or try on CPU-only delegates." 
+ ) + + return (major, minor) + else: + raise RuntimeError( + "CUDA delegate is enabled but could not parse CUDA version from nvcc output. " + "Please ensure CUDA is properly installed or try on CPU-only delegates." + ) + except FileNotFoundError: + raise RuntimeError( + "CUDA delegate is enabled but nvcc (CUDA compiler) is not found in PATH. " + "Please install CUDA toolkit or try on CPU-only delegates." + ) + except subprocess.CalledProcessError as e: + raise RuntimeError( + f"CUDA delegate is enabled but nvcc command failed with error: {e}. " + "Please ensure CUDA is properly installed or try on CPU-only delegates." + ) + + +def _get_pytorch_cuda_url(cuda_version, torch_nightly_url_base): + """ + Get the appropriate PyTorch CUDA URL for the given CUDA version. + + Args: + cuda_version: tuple of (major, minor) version numbers + torch_nightly_url_base: Base URL for PyTorch nightly packages + + Returns: + URL string for PyTorch CUDA packages + """ + major, minor = cuda_version + # Generate CUDA suffix (version validation is already done in _get_cuda_version) + cuda_suffix = _cuda_version_to_pytorch_suffix(major, minor) + + return f"{torch_nightly_url_base}/{cuda_suffix}" + + +# Global variable for caching torch URL +_torch_url_cache = "" + + +def determine_torch_url(torch_nightly_url_base, supported_cuda_versions): + """ + Determine the appropriate PyTorch installation URL based on CUDA availability and CMAKE_ARGS. + Uses caching to avoid redundant CUDA detection and print statements. + + Args: + torch_nightly_url_base: Base URL for PyTorch nightly packages + supported_cuda_versions: List of supported CUDA versions as tuples + + Returns: + URL string for PyTorch packages + """ + global _torch_url_cache + + # Return cached URL if already determined + if _torch_url_cache: + return _torch_url_cache + + # Check if CUDA delegate is enabled + if not _is_cuda_enabled(): + print("CUDA delegate not enabled, using CPU-only PyTorch") + _torch_url_cache = f"{torch_nightly_url_base}/cpu" + return _torch_url_cache + + print("CUDA delegate enabled, detecting CUDA version...") + + # Get CUDA version + cuda_version = _get_cuda_version(supported_cuda_versions) + + major, minor = cuda_version + print(f"Detected CUDA version: {major}.{minor}") + + # Get appropriate PyTorch CUDA URL + torch_url = _get_pytorch_cuda_url(cuda_version, torch_nightly_url_base) + print(f"Using PyTorch URL: {torch_url}") + + # Cache the result + _torch_url_cache = torch_url + return torch_url + + +# Prebuilt binaries for Intel-based macOS are no longer available on PyPI; users must compile from source. +# PyTorch stopped building macOS x86_64 binaries since version 2.3.0 (January 2024). +def is_intel_mac_os(): + # Returns True if running on Intel macOS. + return platform.system().lower() == "darwin" and platform.machine().lower() in ( + "x86", + "x86_64", + "i386", + ) + + +def python_is_compatible(): + # Scrape the version range from pyproject.toml, which should be in the current directory. + version_specifier = None + with open("pyproject.toml", "r") as file: + for line in file: + if line.startswith("requires-python"): + match = re.search(r'"([^"]*)"', line) + if match: + version_specifier = match.group(1) + break + + if not version_specifier: + print( + "WARNING: Skipping python version check: version range not found", + file=sys.stderr, + ) + return False + + # Install the packaging module if necessary. 
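+    # ("packaging" is not in the standard library, so bootstrap it with pip
+    # before running the SpecifierSet check below.)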
+ try: + import packaging + except ImportError: + subprocess.run( + [sys.executable, "-m", "pip", "install", "packaging"], check=True + ) + # Compare the current python version to the range in version_specifier. Exits + # with status 1 if the version is not compatible, or with status 0 if the + # version is compatible or the logic itself fails. + try: + import packaging.specifiers + import packaging.version + + python_version = packaging.version.parse(platform.python_version()) + version_range = packaging.specifiers.SpecifierSet(version_specifier) + if python_version not in version_range: + print( + f'ERROR: ExecuTorch does not support python version {python_version}: must satisfy "{version_specifier}"', + file=sys.stderr, + ) + return False + except Exception as e: + print(f"WARNING: Skipping python version check: {e}", file=sys.stderr) + return True From a6e1918a77c1b53f36e9ebd110f31b66d9f8cb1c Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Mon, 4 Aug 2025 22:53:41 -0700 Subject: [PATCH 07/50] Add skeleton code --- CMakeLists.txt | 6 + backends/aoti/CMakeLists.txt | 47 +++ backends/aoti/README.md | 2 + backends/aoti/aoti_backend.py | 43 ++ backends/aoti/aoti_partitioner.py | 74 ++++ backends/aoti/runtime/AotiBackend.cpp | 570 ++++++++++++++++++++++++++ backends/aoti/runtime/TARGETS | 3 + backends/aoti/runtime/targets.bzl | 18 + install_requirements.py | 1 - tools/cmake/executorch-config.cmake | 1 + tools/cmake/preset/default.cmake | 9 + 11 files changed, 773 insertions(+), 1 deletion(-) create mode 100644 backends/aoti/CMakeLists.txt create mode 100644 backends/aoti/README.md create mode 100644 backends/aoti/aoti_backend.py create mode 100644 backends/aoti/aoti_partitioner.py create mode 100644 backends/aoti/runtime/AotiBackend.cpp create mode 100644 backends/aoti/runtime/TARGETS create mode 100644 backends/aoti/runtime/targets.bzl diff --git a/CMakeLists.txt b/CMakeLists.txt index fc427d517a9..4497fa133c0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -587,6 +587,12 @@ endif() if(EXECUTORCH_BUILD_CORTEX_M) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m) + list(APPEND _executorch_backends coretex_m_backend) +endif() + +if(EXECUTORCH_BUILD_AOTI) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) + list(APPEND _executorch_backends aoti_backend) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt new file mode 100644 index 00000000000..12886bc0cac --- /dev/null +++ b/backends/aoti/CMakeLists.txt @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) +endif() + +include(${EXECUTORCH_ROOT}/build/Utils.cmake) + +find_package(CUDA) + +set(_common_include_directories ${EXECUTORCH_ROOT}/..) 
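+
+# A typical configure invocation for this backend might look like the sketch
+# below; EXECUTORCH_BUILD_AOTI and its EXECUTORCH_BUILD_EXTENSION_TENSOR
+# requirement are declared in tools/cmake/preset/default.cmake.
+#
+#   cmake -DEXECUTORCH_BUILD_AOTI=ON -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON ..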
+ +set(_aoti_sources runtime/AotiBackend.cpp) + +add_library(aoti_backend STATIC ${_aoti_sources}) +target_include_directories( + aoti_backend PUBLIC ${_common_include_directories} ${CUDA_INCLUDE_DIRS} +) + +target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC) +target_link_libraries(aoti_backend PUBLIC extension_tensor ${CUDA_LIBRARIES}) +executorch_target_link_options_shared_lib(aoti_backend) + +install( + TARGETS aoti_backend + EXPORT ExecuTorchTargets + DESTINATION lib + INCLUDES + DESTINATION ${_common_include_directories} +) diff --git a/backends/aoti/README.md b/backends/aoti/README.md new file mode 100644 index 00000000000..9df05c99e07 --- /dev/null +++ b/backends/aoti/README.md @@ -0,0 +1,2 @@ +## Experimental AOTI backend +Proceed with caution. This is an experimental backend that is not yet ready for production use. diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py new file mode 100644 index 00000000000..d1e8a5b4896 --- /dev/null +++ b/backends/aoti/aoti_backend.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import copy + +from subprocess import check_call +from typing import final, List + +import torch +from executorch.exir.backend.backend_details import ( + BackendDetails, + ExportedProgram, + PreprocessResult, +) +from executorch.exir.backend.compile_spec_schema import CompileSpec + + +@final +class AotiBackend(BackendDetails): + @staticmethod + def preprocess( + edge_program: ExportedProgram, + compile_specs: List[CompileSpec], + ) -> PreprocessResult: + print("entering the lowerable parts in AotiBackend.preprocess....") + + print("here", edge_program.example_inputs) + copy_edge_program = copy.deepcopy(edge_program) + graph_module = copy_edge_program.graph_module + args, kwargs = copy_edge_program.example_inputs + so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] + print(so_path) + check_call( + f"patchelf --remove-needed libtorch.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", + shell=True, + ) + + with open(so_path, "rb") as f: + data = f.read() + return PreprocessResult(data) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py new file mode 100644 index 00000000000..6cb7c6cc38a --- /dev/null +++ b/backends/aoti/aoti_partitioner.py @@ -0,0 +1,74 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
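+#
+# Partitions the graph for the AOTI delegate: AOTISupportedOperators below
+# whitelists the supported ops, and CapabilityBasedPartitioner groups them
+# into the largest taggable subgraphs.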
+
+# pyre-unsafe
+
+from typing import cast, final, List
+
+import torch
+from executorch.backends.aoti.aoti_backend import AotiBackend  # usort: skip
+from executorch.exir.backend.compile_spec_schema import CompileSpec
+from executorch.exir.backend.partitioner import (
+    DelegationSpec,
+    Partitioner,
+    PartitionResult,
+)
+from executorch.exir.backend.utils import tag_constant_data
+from executorch.exir.dialects._ops import ops as exir_ops
+from torch.export.exported_program import ExportedProgram
+from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+
+from torch.fx.passes.operator_support import OperatorSupportBase
+
+
+class AOTISupportedOperators(OperatorSupportBase):
+    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
+        supported = node.op == "call_function" and node.target in [
+            exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten._to_copy.default,
+        ]
+
+        return supported
+
+    def is_node_supported_custom(self, node: torch.fx.Node) -> bool:
+        if node.target == exir_ops.edge.aten.mean.dim:
+            keep_dim = node.args[2] if len(node.args) > 2 else False
+            return cast(bool, keep_dim)
+        if node.target == exir_ops.edge.aten.var.correction:
+            keep_dim = node.kwargs.get("keepdim", False)
+            return cast(bool, keep_dim)
+        return True
+
+
+@final
+class AotiPartitioner(Partitioner):
+    def __init__(self, compile_spec: List[CompileSpec]) -> None:
+        self.delegation_spec = DelegationSpec(AotiBackend.__name__, compile_spec)
+        print(self.delegation_spec)
+
+    def partition(self, exported_program: ExportedProgram) -> PartitionResult:
+        # Run the CapabilityBasedPartitioner to return the largest possible
+        # subgraphs containing the nodes with the tags
+        # logger.info("AotiPartitioner::partition")
+        partition_tags = {}
+
+        capability_partitioner = CapabilityBasedPartitioner(
+            exported_program.graph_module,
+            AOTISupportedOperators(),
+            allows_single_node_partition=True,
+        )
+        partition_list = capability_partitioner.propose_partitions()
+        for partition in partition_list:
+            for node in partition.nodes:
+                tag = f"tag{partition.id}"
+                node.meta["delegation_tag"] = tag
+                partition_tags[tag] = self.delegation_spec
+
+        tag_constant_data(exported_program)
+
+        return PartitionResult(
+            tagged_exported_program=exported_program, partition_tags=partition_tags
+        )
diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp
new file mode 100644
index 00000000000..0044a4155d6
--- /dev/null
+++ b/backends/aoti/runtime/AotiBackend.cpp
@@ -0,0 +1,570 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <dlfcn.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <stdexcept>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "cuda_runtime.h"
+
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/backend/interface.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// Here is where the aoti bouncers are going to be defined.
+// I define the globals aoti generated compiled code calls
+// They can be backed by ET systems
+
+using namespace std;
+
+using executorch::aten::ScalarType;
+using executorch::runtime::ArrayRef;
+using executorch::runtime::Backend;
+using executorch::runtime::BackendExecutionContext;
+using executorch::runtime::BackendInitContext;
+using executorch::runtime::CompileSpec;
+using executorch::runtime::DelegateHandle;
+using executorch::runtime::Error;
+using executorch::runtime::EValue;
+using executorch::runtime::FreeableBuffer;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::Result;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+using AOTITensorHandle = Tensor*;
+
+// TODO: We should get a proper one
+struct CUDAStreamGuardOpaque;
+using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*;
+
+using AOTIRuntimeError = Error;
+using AOTITorchError = Error;
+
+struct AOTInductorModelContainerOpaque;
+using AOTInductorModelContainerHandle = AOTInductorModelContainerOpaque*;
+using AOTInductorStreamHandle = void*;
+using AOTIProxyExecutorHandle = void*;
+
+using AOTInductorModelContainerCreateWithDeviceFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle* container_handle,
+    size_t num_models,
+    const char* device_str,
+    const char* cubin_dir);
+
+using AOTInductorModelContainerDeleteFunc =
+    AOTIRuntimeError (*)(AOTInductorModelContainerHandle container_handle);
+
+using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_constants);
+
+using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_constants);
+
+using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    AOTITensorHandle* input_handles, // array of input AOTITensorHandle; handles
+                                     // are stolen; the array itself is borrowed
+    size_t num_inputs,
+    AOTITensorHandle*
+        output_handles, // array for writing output AOTITensorHandle; handles
+                        // will be stolen by the caller; the array itself is
+                        // borrowed
+    size_t num_outputs,
+    AOTInductorStreamHandle stream_handle,
+    AOTIProxyExecutorHandle proxy_executor_handle);
+
+AOTInductorModelContainerCreateWithDeviceFunc
+    AOTInductorModelContainerCreateWithDevice = nullptr;
+AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr;
+AOTInductorModelContainerGetNumInputsFunc
+    AOTInductorModelContainerGetNumInputs = nullptr;
+AOTInductorModelContainerGetNumOutputsFunc
+    AOTInductorModelContainerGetNumOutputs = nullptr;
+AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr;
+std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
+std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+std::unordered_set<std::shared_ptr<Tensor>> tensors;
+
+int32_t aoti_torch_grad_mode_is_enabled() {
+  // No autograd ever
+  return false;
+}
+
+void aoti_torch_grad_mode_set_enabled(bool enabled) {
+  if (enabled) {
+    throw std::runtime_error("Cannot enable autograd");
+  }
+}
+
+AOTITorchError aoti_torch_get_data_ptr(
+    AOTITensorHandle tensor,
+    void** ret_data_ptr) {
+  *ret_data_ptr = tensor->mutable_data_ptr();
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_storage_offset(
+    AOTITensorHandle tensor,
+    int64_t* ret_storage_offset) {
+  // Storage offset is always 0 in ET
+  *ret_storage_offset = 0;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_strides(
+    AOTITensorHandle tensor,
+    int64_t** ret_strides) {
+  auto it = tensor_to_strides.find(tensor);
+
+  if (it == tensor_to_strides.end()) {
+    std::vector<int64_t> strides(tensor->dim());
+    auto tensor_strides = tensor->strides();
+    for (int i = 0; i < tensor->dim(); i++) {
+      strides[i] = tensor_strides[i];
+    }
+    it = tensor_to_strides.emplace(tensor, std::move(strides)).first;
+  }
+  *ret_strides = it->second.data();
+  std::cout << "getting strides from tensor " << tensor << " with dim "
+            << tensor->dim() << std::endl;
+  for (int i = 0; i < tensor->dim(); i++) {
+    std::cout << "strides " << i << " = " << (*ret_strides)[i] << std::endl;
+  }
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_dtype(
+    AOTITensorHandle tensor,
+    int32_t* ret_dtype) {
+  *ret_dtype = static_cast<int32_t>(tensor->scalar_type());
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_sizes(
+    AOTITensorHandle tensor,
+    int64_t** ret_sizes) {
+  auto it = tensor_to_sizes.find(tensor);
+  if (it == tensor_to_sizes.end()) {
+    std::vector<int64_t> sizes(tensor->dim());
+    auto tensor_sizes = tensor->sizes();
+    for (int i = 0; i < tensor->dim(); i++) {
+      sizes[i] = tensor_sizes[i];
+    }
+    it = tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
+  }
+  *ret_sizes = it->second.data();
+  std::cout << "getting sizes from tensor " << tensor << " with dim "
+            << tensor->dim() << std::endl;
+  for (int i = 0; i < tensor->dim(); i++) {
+    std::cout << "size " << i << " = " << (*ret_sizes)[i] << std::endl;
+  }
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_storage_size(
+    AOTITensorHandle tensor,
+    int64_t* ret_size) {
+  throw std::runtime_error("Cannot get storage size on ETensor");
+}
+
+AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor,
+    int32_t layout,
+    const uint8_t* opaque_metadata,
+    int64_t opaque_metadata_size) {
+  throw std::runtime_error("Not creating Tensor from blob here");
+}
+
+AOTITorchError aoti_torch_create_cuda_stream_guard(
+    void* stream,
+    int32_t device_index,
+    CUDAStreamGuardHandle* ret_guard) {
+  std::cout << "Entering stream guard for device " << device_index << std::endl;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_delete_cuda_stream_guard(
+    CUDAStreamGuardHandle guard) {
+  std::cout << "Exiting stream guard" << std::endl;
+  return Error::Ok;
+}
+
+int aoti_torch_device_type_cpu() {
+  // Let's say cpu is 0 for ET as well
+  return 0;
+}
+
+__attribute__((__visibility__("default"))) int32_t
+aoti_torch_device_type_cuda() {
+  // Let's say cuda is 1 for ET as well
+  return 1;
+}
+
+__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() {
+  // Let's assume this dtype is all we will support
+  return 6;
+}
+
+AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
+  std::cout << "Deleting " << tensor << std::endl;
+  for (auto it = tensors.begin(); it != tensors.end(); ++it) {
+    if (it->get() == tensor) {
+      tensors.erase(it);
+      break; // Exit the loop once the element is found and removed
+    }
+  }
+  return Error::Ok;
+}
+AOTITorchError aoti_torch_create_tensor_from_blob(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor) {
+  throw std::runtime_error("Should never create from blob");
+}
+
+AOTITorchError aoti_torch_empty_strided(
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor) {
+  // This requires us to reserve CUDA memory and put it into an ETensor
+  void* ptr;
+  int64_t numel = 1;
+  for (int i = 0; i < ndim; i++) {
+    numel *= sizes_ptr[i];
+  }
+
+  if (dtype != 6) { // throw if not float32
+    throw std::runtime_error("Need to implement empty_strided for non-float32");
+  }
+
+  int64_t nbytes = numel * 4;
+
+  if (device_type == 1) { // cuda
+    std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl;
+    cudaError_t err = cudaMalloc(&ptr, nbytes);
+    if (err != cudaSuccess) {
+      std::cout << "failed to allocate " << nbytes << std::endl;
+      throw std::runtime_error("Failed to call cudaMalloc");
+    }
+  } else if (device_type == 0) { // cpu
+    std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl;
+    ptr = malloc(nbytes);
+    if (ptr == nullptr) {
+      throw std::runtime_error("Failed to call malloc");
+    }
+  } else {
+    throw std::runtime_error(
+        "Need to implement empty_strided for non-CUDA non-CPU");
+  }
+  std::cout << "Allocated " << nbytes << " bytes at " << ptr << ", sizes_ptr "
+            << sizes_ptr << std::endl;
+
+  // ETensor sizes
+  std::vector<executorch::aten::SizesType> sizes(ndim);
+  for (int i = 0; i < ndim; i++) {
+    sizes[i] = sizes_ptr[i];
+  }
+  // ETensor creation
+  auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr);
+
+  // Store the tensor
+  tensors.insert(tensor);
+
+  std::cout << "sizes.data(): " << sizes.data()
+            << ", tensor->sizes().data(): " << tensor->sizes().data()
+            << std::endl;
+  std::cout << "Size[0] of tensor " << tensor.get() << " is "
+            << tensor->sizes()[0] << std::endl;
+  *ret_new_tensor = tensor.get();
+  return Error::Ok;
+}
+
+void checkCudaError(cudaError_t err, const char* msg) {
+  if (err != cudaSuccess) {
+    std::cerr << "Error: " << msg << " (" << cudaGetErrorString(err) << ")"
+              << std::endl;
+    exit(EXIT_FAILURE);
+  }
+}
+
+AOTITorchError aoti_torch_copy_(
+    AOTITensorHandle self,
+    AOTITensorHandle src,
+    int32_t non_blocking) {
+  // check if size is the same
+  if (self->dim() != src->dim()) {
+    std::cout << "self.dim() " << self->dim() << ", src.dim() " << src->dim()
+              << std::endl;
+    throw std::runtime_error("self.dim() != src.dim()");
+  }
+  std::cout << "self->data_ptr(): " << self->data_ptr()
+            << " sizes: " << self->sizes().data() << std::endl;
+  std::cout << "src->data_ptr(): " << src->data_ptr()
+            << " sizes: " << src->sizes().data() << std::endl;
+  for (int i = 0; i < self->dim(); i++) {
+    if (self->sizes()[i] != src->sizes()[i]) {
+      std::cout << "self.sizes()[i] " << self->sizes()[i] << ", src.sizes()[i] "
+                << src->sizes()[i] << std::endl;
+      throw std::runtime_error("size mismatch");
+    }
+  }
+
+  int size = src->nbytes();
+  // should check for device
+  cudaPointerAttributes srcAttributes, dstAttributes;
+  cudaError_t err;
+  // Get attributes of the source pointer
+  err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr());
+  checkCudaError(err, "Failed to get source pointer attributes");
+  // Get attributes of the destination pointer
+  err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr());
+  checkCudaError(err, "Failed to get destination pointer attributes");
+  bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice;
+  bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice;
+  // Determine the memory locations and perform the appropriate copy
+  if (srcIsDevice && dstIsDevice) {
+    // Device to Device copy
+    err = cudaMemcpy(
+        self->mutable_data_ptr(),
+        src->data_ptr(),
+        size,
+        cudaMemcpyDeviceToDevice);
+    checkCudaError(err, "Failed to copy from device to device");
+  } else if (srcIsDevice && !dstIsDevice) {
+    // Device to Host copy
+    err = cudaMemcpy(
+        self->mutable_data_ptr(),
+        src->data_ptr(),
+        size,
+        cudaMemcpyDeviceToHost);
+    std::cout << "Device to Host copy, self data: "
+              << ((float*)self->data_ptr())[0] << std::endl;
+    checkCudaError(err, "Failed to copy from device to host");
+  } else if (!srcIsDevice && dstIsDevice) {
+    // Host to Device copy
+    err = cudaMemcpy(
+        self->mutable_data_ptr(),
+        src->data_ptr(),
+        size,
+        cudaMemcpyHostToDevice);
+    std::cout << "Host to Device copy, src data: "
+              << ((float*)src->data_ptr())[0] << std::endl;
+    checkCudaError(err, "Failed to copy from host to device");
+  } else if (!srcIsDevice && !dstIsDevice) {
+    // Host to Host copy
+    std::cout << "Host to Host copy, src data: " << ((float*)src->data_ptr())[0]
+              << std::endl;
+    std::memcpy(self->mutable_data_ptr(), src->data_ptr(), size);
+  } else {
+    std::cerr << "Error: Unknown memory type. self: " << dstAttributes.type
+              << ", src: " << srcAttributes.type << std::endl;
+    throw std::runtime_error("Unknown memory type");
+  }
+  // print first value of src and self
+  return Error::Ok;
+}
+}
+
+struct AOTIDelegateHandle {
+  void* so_handle;
+  AOTInductorModelContainerHandle container_handle;
+};
+
+class AOTIBackend final : public ::executorch::runtime::BackendInterface {
+ public:
+  // Once in program
+  AOTIBackend() {
+    ET_LOG(Info, "AOTIBackend ctor");
+  }
+
+  bool is_available() const override {
+    return true;
+  }
+
+  // Once per loaded binary blob
+  Result<DelegateHandle*> init(
+      BackendInitContext& context,
+      FreeableBuffer* processed, // This will be the buffer from aoti_backend
+      ArrayRef<CompileSpec> compile_specs // This will be my empty list
+      ) const override {
+    // We could load the .so content directly. But I don't want to deal with
+    // relocation. So dumping a file and using dlopen
+
+    // Create a temporary file
+    std::ofstream outfile("/tmp/test.so", std::ios::binary);
+
+    // Write the ELF buffer to the temporary file
+    outfile.write((char*)processed->data(), processed->size());
+
+    // Finish writing the file to disk
+    outfile.close();
+
+    // Free the in-memory buffer
+    processed->Free();
+
+    // Load the ELF using dlopen
+    void* so_handle = dlopen("/tmp/test.so", RTLD_LAZY | RTLD_LOCAL);
+    if (so_handle == nullptr) {
+      std::cout << dlerror() << std::endl;
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerCreateWithDevice =
+        reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice"));
+    if (AOTInductorModelContainerCreateWithDevice == nullptr) {
+      perror("dlsym1");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerDelete =
+        reinterpret_cast<AOTInductorModelContainerDeleteFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerDelete"));
+    if (AOTInductorModelContainerDelete == nullptr) {
+      perror("dlsym2");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerGetNumInputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
+    if (AOTInductorModelContainerGetNumInputs == nullptr) {
+      perror("dlsym3");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerGetNumOutputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
+    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
+      perror("dlsym4");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerRun =
+        reinterpret_cast<AOTInductorModelContainerRunFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerRun"));
+    if (AOTInductorModelContainerRun == nullptr) {
+      perror("dlsym5");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerHandle container_handle = nullptr;
+
+    AOTIRuntimeError err;
+
+    err = AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "cuda", nullptr);
+    printf("container_handle=%p\n", container_handle);
+
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->container_handle = container_handle;
+    return (DelegateHandle*)handle; // Return the handle post-processing
+  }
+
+  // Once per execution
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* handle_,
+      EValue** args) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    size_t num_inputs;
+    AOTInductorModelContainerGetNumInputs(
+        handle->container_handle, &num_inputs);
+
+    size_t num_outputs;
+    AOTInductorModelContainerGetNumOutputs(
+        handle->container_handle, &num_outputs);
+
+    std::vector<AOTITensorHandle> inputs(num_inputs);
+    std::vector<AOTITensorHandle> outputs(num_outputs);
+
+    for (int i = 0; i < num_inputs; i++) {
+      auto tensor_in = args[i]->toTensor();
+      inputs[i] = &tensor_in;
+    }
+
+    for (int i = num_inputs; i < num_inputs + num_outputs; i++) {
+      auto tensor_out = args[i]->toTensor();
+      outputs[i - num_inputs] = &tensor_out;
+    }
+
+    AOTInductorModelContainerRun(
+        handle->container_handle,
+        inputs.data(),
+        num_inputs,
+        outputs.data(),
+        num_outputs,
+        // Should these last two be something?
+        nullptr,
+        nullptr);
+
+    // Still need to copy the output to args, because they are malloc'ed but
+    // not using the data_ptr from outputs.
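+    // (aoti_torch_copy_ resolves each pointer's location via
+    // cudaPointerGetAttributes, so this final loop also moves device results
+    // back into the runtime-owned output buffers.)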
+    for (int i = 0; i < num_outputs; i++) {
+      auto args_out = args[i + num_inputs]->toTensor();
+      aoti_torch_copy_(&args_out, outputs[i], 0);
+    }
+    return Error::Ok;
+  }
+
+  void destroy(DelegateHandle* handle_) const override {
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+    // Destroy the container while its code is still mapped, then unload.
+    AOTInductorModelContainerDelete(handle->container_handle);
+    dlclose(handle->so_handle);
+    delete handle;
+    tensor_to_sizes.clear();
+    tensor_to_strides.clear();
+  }
+};
+
+} // namespace aoti
+
+namespace {
+auto cls = aoti::AOTIBackend();
+executorch::runtime::Backend backend{"AotiBackend", &cls};
+static executorch::runtime::Error success_with_compiler =
+    register_backend(backend);
+} // namespace
+
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/TARGETS b/backends/aoti/runtime/TARGETS
new file mode 100644
index 00000000000..77871de4469
--- /dev/null
+++ b/backends/aoti/runtime/TARGETS
@@ -0,0 +1,3 @@
+load("targets.bzl", "define_common_targets")
+
+define_common_targets()
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl
new file mode 100644
index 00000000000..d51097f306d
--- /dev/null
+++ b/backends/aoti/runtime/targets.bzl
@@ -0,0 +1,18 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    runtime.cxx_library(
+        name = "aoti_backend",
+        srcs = ["AotiBackend.cpp"],
+        headers = [],
+        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+        link_whole = True,
+        supports_python_dlopen = True,
+        # Constructor needed for backend registration.
+        compiler_flags = ["-Wno-global-constructors"],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        deps = [
+            "//executorch/runtime/backend:interface",
+            "//executorch/runtime/core:core",
+        ],
+    )
diff --git a/install_requirements.py b/install_requirements.py
index 409ed083970..0e0084fe3dd 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -12,7 +12,6 @@
 
 from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible
 
-# The pip repository that hosts nightly torch packages.
 # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly" diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 6c27e8ba616..ba9a686ccb9 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -53,6 +53,7 @@ set(EXECUTORCH_FOUND ON) include("${CMAKE_CURRENT_LIST_DIR}/ExecuTorchTargets.cmake") set(optional_lib_list + aoti_backend flatccrt etdump bundled_program diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake index fb0dc0a4ade..6911aea3e9b 100644 --- a/tools/cmake/preset/default.cmake +++ b/tools/cmake/preset/default.cmake @@ -160,6 +160,11 @@ define_overridable_option( OFF ) +define_overridable_option( + EXECUTORCH_BUILD_AOTI "Build the AOTI backend" BOOL OFF +) + + if(EXECUTORCH_BUILD_ARM_BAREMETAL) set(_default_executorch_build_pthreadpool OFF) set(_default_executorch_build_cpuinfo OFF) @@ -317,6 +322,10 @@ check_required_options_on( EXECUTORCH_BUILD_PTHREADPOOL ) +check_required_options_on( + IF_ON EXECUTORCH_BUILD_AOTI REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR +) + check_conflicting_options_on( IF_ON EXECUTORCH_BUILD_ARM_BAREMETAL CONFLICTS_WITH EXECUTORCH_BUILD_PTHREADPOOL EXECUTORCH_BUILD_CPUINFO From 687688b0886850881b51897a3c8fb766185d2a77 Mon Sep 17 00:00:00 2001 From: Mengwei Liu Date: Tue, 5 Aug 2025 23:06:56 -0700 Subject: [PATCH 08/50] Add export_add.py --- backends/aoti/aoti_partitioner.py | 2 +- exir/backend/backend_api.py | 3 ++ exir/lowered_backend_module.py | 1 + export_add.py | 31 +++++++++++++++++++ install_requirements.py | 50 +++++++++++++++++++++++++++++++ 5 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 export_add.py diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index 6cb7c6cc38a..e1524480698 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -28,7 +28,7 @@ class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: supported = node.op == "call_function" and node.target in [ exir_ops.edge.aten.add.Tensor, - exir_ops.edge.aten._to_copy.default, + exir_ops.edge.dim_order_ops._to_dim_order_copy.default, ] return supported diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index dd8d97d66ac..95c7c9caa6d 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -720,6 +720,9 @@ def to_backend( fake_edge_program = copy.deepcopy(edge_program) partitioner_result = partitioner_instance(fake_edge_program) tagged_exported_program = partitioner_result.tagged_exported_program + # Make sure tagged_exported_program has the same example_inputs as edge_program + tagged_exported_program.example_inputs = edge_program.example_inputs + method_to_tagged_exported_program[method_name] = tagged_exported_program # Check that the partitioner did not modify the original graph diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py index 61414990703..2e889c6d81d 100644 --- a/exir/lowered_backend_module.py +++ b/exir/lowered_backend_module.py @@ -735,6 +735,7 @@ def create_exported_program_from_submodule( ), ) ], + example_inputs=owning_program.example_inputs, constants=subgraph_constants, verifiers=[owning_program.verifier], ), diff --git a/export_add.py b/export_add.py new file mode 100644 index 00000000000..cfaf9ab1c96 --- /dev/null +++ b/export_add.py @@ -0,0 +1,31 @@ +import torch +from executorch.backends.aoti.aoti_partitioner import AotiPartitioner +from 
executorch.exir import to_edge +from torch.export import export + + +# Start with a PyTorch model that adds two input tensors (matrices) +class Add(torch.nn.Module): + def __init__(self): + super(Add, self).__init__() + + def forward(self, x: torch.Tensor, y: torch.Tensor): + # return triton_transpose_acc(x, y) + return (x.cuda() + y.cuda()).cpu() + + +# 1. torch.export: Defines the program with the ATen operator set. +aten_dialect = export( + Add(), (torch.ones(10, device="cpu"), torch.ones(10, device="cpu")) +) +# 2. to_edge: Make optimizations for Edge devices +edge_program = to_edge(aten_dialect) + +edge_program = edge_program.to_backend(AotiPartitioner([])) + +# 3. to_executorch: Convert the graph to an ExecuTorch program +executorch_program = edge_program.to_executorch() + +# 4. Save the compiled .pte program +with open("add.pte", "wb") as file: + file.write(executorch_program.buffer) diff --git a/install_requirements.py b/install_requirements.py index 0e0084fe3dd..7e6d3010f93 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -12,8 +12,58 @@ from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible +<<<<<<< HEAD # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled. TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly" +======= +def python_is_compatible(): + # Scrape the version range from pyproject.toml, which should be in the current directory. + version_specifier = None + with open("pyproject.toml", "r") as file: + for line in file: + if line.startswith("requires-python"): + match = re.search(r'"([^"]*)"', line) + if match: + version_specifier = match.group(1) + break + + if not version_specifier: + print( + "WARNING: Skipping python version check: version range not found", + file=sys.stderr, + ) + return False + + # Install the packaging module if necessary. + try: + import packaging + except ImportError: + subprocess.run( + [sys.executable, "-m", "pip", "install", "packaging"], check=True + ) + # Compare the current python version to the range in version_specifier. Exits + # with status 1 if the version is not compatible, or with status 0 if the + # version is compatible or the logic itself fails. + try: + import packaging.specifiers + import packaging.version + + python_version = packaging.version.parse(platform.python_version()) + version_range = packaging.specifiers.SpecifierSet(version_specifier) + if python_version not in version_range: + print( + f'ERROR: ExecuTorch does not support python version {python_version}: must satisfy "{version_specifier}"', + file=sys.stderr, + ) + return False + except Exception as e: + print(f"WARNING: Skipping python version check: {e}", file=sys.stderr) + return True + + +# The pip repository that hosts nightly torch packages. 
+TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cu126" +>>>>>>> fe438f9c92 (Add export_add.py) # Supported CUDA versions - modify this to add/remove supported versions # Format: tuple of (major, minor) version numbers From 0ce692887dd2f7d0e188790361846566988bcba4 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Fri, 8 Aug 2025 00:19:57 -0700 Subject: [PATCH 09/50] prototype e2e works on latest ET --- CMakeLists.txt | 2 ++ backends/aoti/CMakeLists.txt | 31 +++++++++++++++++---------- backends/aoti/aoti_backend.py | 2 +- backends/aoti/runtime/AotiBackend.cpp | 1 - export_and_run_aoti.sh | 11 ++++++++++ requirements-dev.txt | 1 + 6 files changed, 35 insertions(+), 13 deletions(-) create mode 100644 export_and_run_aoti.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 4497fa133c0..f98246e1851 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,8 @@ # https://github.com/google/XNNPACK/commit/c690daa67f883e1b627aadf7684c06797e9a0684 cmake_minimum_required(VERSION 3.29) project(executorch) +# project(executorch LANGUAGES CXX CUDA) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 12886bc0cac..36059f16fe4 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -21,27 +21,36 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) endif() -include(${EXECUTORCH_ROOT}/build/Utils.cmake) +# include(${EXECUTORCH_ROOT}/build/Utils.cmake) -find_package(CUDA) - -set(_common_include_directories ${EXECUTORCH_ROOT}/..) +find_package(CUDAToolkit REQUIRED) set(_aoti_sources runtime/AotiBackend.cpp) - add_library(aoti_backend STATIC ${_aoti_sources}) target_include_directories( - aoti_backend PUBLIC ${_common_include_directories} ${CUDA_INCLUDE_DIRS} + aoti_backend + PUBLIC + ${CUDAToolkit_INCLUDE_DIRS} + $ + $ ) - target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC) -target_link_libraries(aoti_backend PUBLIC extension_tensor ${CUDA_LIBRARIES}) +# Ensure symbols are exported properly +target_link_options(aoti_backend PUBLIC -Wl,--export-dynamic) +# Link against CUDA::cudart (the CUDA runtime library) +target_link_libraries( + aoti_backend + PUBLIC + extension_tensor + CUDA::cudart + ${CMAKE_DL_LIBS} +) +# If you need other CUDA libraries, link them similarly: +# target_link_libraries(aoti_backend PUBLIC CUDA::cublas CUDA::cufft ...) 
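+# -Wl,--export-dynamic above is load-bearing: the dlopen()-ed AOTI kernel .so +# resolves the aoti_torch_* shims from the host process, so those symbols must +# stay in the dynamic symbol table. A sketch of a consumer target (illustrative +# names, not part of this build): +# add_executable(my_runner main.cpp) +# target_link_libraries(my_runner PRIVATE executorch aoti_backend CUDA::cudart)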
+# Keep this call if executorch_target_link_options_shared_lib is defined in your build executorch_target_link_options_shared_lib(aoti_backend) - install( TARGETS aoti_backend EXPORT ExecuTorchTargets DESTINATION lib - INCLUDES - DESTINATION ${_common_include_directories} ) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index d1e8a5b4896..b9491f023e3 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -34,7 +34,7 @@ def preprocess( so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] print(so_path) check_call( - f"patchelf --remove-needed libtorch.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", + f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", shell=True, ) diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp index 0044a4155d6..61b54cdc554 100644 --- a/backends/aoti/runtime/AotiBackend.cpp +++ b/backends/aoti/runtime/AotiBackend.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include #include diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh new file mode 100644 index 00000000000..7113e44dfe5 --- /dev/null +++ b/export_and_run_aoti.sh @@ -0,0 +1,11 @@ +./install_executorch.sh +python export_add.py +./install_executorch.sh --clean +mkdir -p cmake-out +cd cmake-out +cmake -DEXECUTORCH_BUILD_AOTI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + .. +cd .. +cmake --build cmake-out -j9 +./cmake-out/executor_runner --model_path add.pte diff --git a/requirements-dev.txt b/requirements-dev.txt index 9df5e7b93ed..8c8f518a5ea 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -9,3 +9,4 @@ zstd # Imported by resolve_buck.py. certifi # Imported by resolve_buck.py.
lintrunner==0.12.7 lintrunner-adapters==0.12.6 +patchelf From 9afb4ac62620d8b974b00617981330b5b824349e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 11 Aug 2025 13:01:06 -0700 Subject: [PATCH 10/50] hacky support .so file separation by hardcoding file path --- backends/aoti/aoti_backend.py | 18 ++++++++++++------ backends/aoti/runtime/AotiBackend.cpp | 22 +++++++++++++--------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index b9491f023e3..c0691d5c075 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -16,7 +16,8 @@ PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec - +import os +import shutil @final class AotiBackend(BackendDetails): @@ -31,13 +32,18 @@ def preprocess( copy_edge_program = copy.deepcopy(edge_program) graph_module = copy_edge_program.graph_module args, kwargs = copy_edge_program.example_inputs - so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] - print(so_path) + temp_so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] + so_path = os.path.join(os.getcwd(), 'aoti.so') + print("so_path after aot_compile: ", temp_so_path) + print("so path we will using ", so_path) + shutil.copyfile(temp_so_path, so_path) + check_call( f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", shell=True, ) - with open(so_path, "rb") as f: - data = f.read() - return PreprocessResult(data) + # with open(so_path, "rb") as f: + # data = f.read() + + return PreprocessResult(so_path.encode("utf-8")) diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp index 61b54cdc554..42c58394f22 100644 --- a/backends/aoti/runtime/AotiBackend.cpp +++ b/backends/aoti/runtime/AotiBackend.cpp @@ -430,20 +430,24 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { // We could load the .so content directly. But I don't want to deal with // relocation.
So dumping a file and using dlopen - // Create a temporary file - std::ofstream outfile("/tmp/test.so", std::ios::binary); + // // Create a temporary file + // std::ofstream outfile("/tmp/test.so", std::ios::binary); - // Write the ELF buffer to the temporary file - outfile.write((char*)processed->data(), sizeof(void*) * processed->size()); + // // Write the ELF buffer to the temporary file + // outfile.write((char*)processed->data(), sizeof(void*) * processed->size()); - // Finish writing the file to disk - outfile.close(); + // // Finish writing the file to disk + // outfile.close(); - // Free the in-memory buffer - processed->Free(); + // // Free the in-memory buffer + // processed->Free(); + + const char* so_path = static_cast(processed->data()); + + printf("so path: %s\n", so_path); // Load the ELF using dlopen - void* so_handle = dlopen("/tmp/test.so", RTLD_LAZY | RTLD_LOCAL); + void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); if (so_handle == nullptr) { std::cout << dlerror() << std::endl; return Error::AccessFailed; } From cef27e1254e85d9dfeb866503db5a4a8c8fa7e95 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 11 Aug 2025 13:04:43 -0700 Subject: [PATCH 11/50] hacky support .so file separation by hardcoding file path --- backends/aoti/aoti_backend.py | 3 --- backends/aoti/runtime/AotiBackend.cpp | 17 ++--------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index c0691d5c075..2653820e914 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -43,7 +43,4 @@ def preprocess( shell=True, ) - # with open(so_path, "rb") as f: - # data = f.read() - return PreprocessResult(so_path.encode("utf-8")) diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp index 42c58394f22..5a945403b3a 100644 --- a/backends/aoti/runtime/AotiBackend.cpp +++ b/backends/aoti/runtime/AotiBackend.cpp @@ -427,21 +427,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { FreeableBuffer* processed, // This will be the buffer from aoti_backend ArrayRef compile_specs // This will be my empty list ) const override { -
So dumping a file and using dlopen - - // // Create a temporary file - // std::ofstream outfile("/tmp/test.so", std::ios::binary); - - // // Write the ELF buffer to the temporary file - // outfile.write((char*)processed->data(), sizeof(void*) * processed->size()); - - // // Finish writing the file to disk - // outfile.close(); - - // // Free the in-memory buffer - // processed->Free(); - const char* so_path = static_cast(processed->data()); printf("so path: %s\n", so_path); @@ -453,6 +438,8 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { return Error::AccessFailed; } + processed->Free(); + AOTInductorModelContainerCreateWithDevice = reinterpret_cast( dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); From c85fc4278734bd3486b0aa0afff2d7659e849a4e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 12 Aug 2025 14:11:22 -0700 Subject: [PATCH 12/50] support latest backend interface --- backends/aoti/aoti_backend.py | 7 ++++--- backends/aoti/aoti_partitioner.py | 10 ++++++---- backends/aoti/runtime/AotiBackend.cpp | 3 ++- export_add.py | 2 +- export_and_run_aoti.sh | 4 ++-- export_mv2.py | 28 +++++++++++++++++++++++++++ 6 files changed, 43 insertions(+), 11 deletions(-) create mode 100644 export_mv2.py diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index 2653820e914..efc61006fc2 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -5,6 +5,8 @@ # LICENSE file in the root directory of this source tree. import copy +import os +import shutil from subprocess import check_call from typing import final, List @@ -16,8 +18,7 @@ PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec -import os -import shutil + @final class AotiBackend(BackendDetails): @@ -33,7 +34,7 @@ def preprocess( graph_module = copy_edge_program.graph_module args, kwargs = copy_edge_program.example_inputs temp_so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] - so_path = os.path.join(os.getcwd(), 'aoti.so') + so_path = os.path.join(os.getcwd(), "aoti.so") print("so_path after aot_compile: ", temp_so_path) print("so path we will using ", so_path) shutil.copyfile(temp_so_path, so_path) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index e1524480698..836a8fcb8c4 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -23,13 +23,15 @@ from torch.fx.passes.operator_support import OperatorSupportBase +supported_fallback_operators = [] + class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: - supported = node.op == "call_function" and node.target in [ - exir_ops.edge.aten.add.Tensor, - exir_ops.edge.dim_order_ops._to_dim_order_copy.default, - ] + supported = ( + node.op == "call_function" + and node.target not in supported_fallback_operators + ) return supported diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp index 5a945403b3a..94e15f0596f 100644 --- a/backends/aoti/runtime/AotiBackend.cpp +++ b/backends/aoti/runtime/AotiBackend.cpp @@ -50,6 +50,7 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::MemoryAllocator; using executorch::runtime::Result; using executorch::runtime::etensor::Tensor; +using executorch::runtime::Span; extern "C" { using AOTITensorHandle = Tensor*; @@ -494,7 +495,7 @@ class AOTIBackend final : public 
::executorch::runtime::BackendInterface { Error execute( BackendExecutionContext& context, DelegateHandle* handle_, - EValue** args) const override { + Span args) const override { AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; size_t num_inputs; diff --git a/export_add.py b/export_add.py index cfaf9ab1c96..d0d2489b885 100644 --- a/export_add.py +++ b/export_add.py @@ -27,5 +27,5 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): executorch_program = edge_program.to_executorch() # 4. Save the compiled .pte program -with open("add.pte", "wb") as file: +with open("aoti_model.pte", "wb") as file: file.write(executorch_program.buffer) diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 7113e44dfe5..01c023f0e8f 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -1,5 +1,5 @@ ./install_executorch.sh -python export_add.py +python $1 ./install_executorch.sh --clean mkdir -p cmake-out cd cmake-out @@ -8,4 +8,4 @@ cmake -DEXECUTORCH_BUILD_AOTI=ON \ .. cd .. cmake --build cmake-out -j9 -./cmake-out/executor_runner --model_path add.pte +./cmake-out/executor_runner --model_path aoti_model.pte diff --git a/export_mv2.py b/export_mv2.py new file mode 100644 index 00000000000..fa84084088f --- /dev/null +++ b/export_mv2.py @@ -0,0 +1,28 @@ +import torch +from executorch.backends.aoti.aoti_partitioner import AotiPartitioner +from executorch.examples.models.mobilenet_v2 import MV2Model +from executorch.exir import to_edge +from torch.export import export +from torchvision import models +from torchvision.models.mobilenetv2 import MobileNet_V2_Weights + +mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights) +mv2 = mv2.eval() + +model_inputs = (torch.randn(1, 3, 224, 224),) + + +# 1. torch.export: Defines the program with the ATen operator set. +aten_dialect = export(mv2, model_inputs) + +# 2. to_edge: Make optimizations for Edge devices +edge_program = to_edge(aten_dialect) + +edge_program = edge_program.to_backend(AotiPartitioner([])) + +# 3. to_executorch: Convert the graph to an ExecuTorch program +executorch_program = edge_program.to_executorch() + +# 4. 
Save the compiled .pte program +with open("aoti_model.pte", "wb") as file: + file.write(executorch_program.buffer) From 2ab79f09f042c5b66f7b5bfd2519509ce2ff7302 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 19 Aug 2025 15:45:46 -0700 Subject: [PATCH 13/50] temp submit for execute model with weight --- backends/aoti/CMakeLists.txt | 6 +- backends/aoti/aoti_backend.py | 28 +- backends/aoti/aoti_partitioner.py | 180 +- backends/aoti/runtime/AotiBackend.cpp | 561 -- backends/aoti/runtime/aoti_backend.cpp | 302 + .../aoti/runtime/aoti_model_container.cpp | 34 + backends/aoti/runtime/aoti_model_container.h | 91 + backends/aoti/runtime/shims/memory.cpp | 389 ++ backends/aoti/runtime/shims/memory.h | 102 + .../aoti/runtime/shims/tensor_attribute.cpp | 137 + .../aoti/runtime/shims/tensor_attribute.h | 76 + backends/aoti/runtime/targets.bzl | 13 +- ...ky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin | Bin 0 -> 11320 bytes ...rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin | Bin 0 -> 10048 bytes ...x5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin | Bin 0 -> 10816 bytes ...ci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin | Bin 0 -> 10176 bytes ...c3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp | 6 + ...3am2yslkkhyp4e7oaf7ej.kernel_metadata.json | 1 + ...vspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin | Bin 0 -> 11320 bytes ...6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin | Bin 0 -> 10936 bytes ...xauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin | Bin 0 -> 11320 bytes ...47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin | Bin 0 -> 10944 bytes ...ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin | Bin 0 -> 11320 bytes ...2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp | 965 +++ ...kmcvkgx3hnjvysymcgms.wrapper_metadata.json | 1 + ...nhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin | Bin 0 -> 10936 bytes ...jd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp | 6144 +++++++++++++++++ ...6ndtpaca5r3ct3piucq7.wrapper_metadata.json | 1 + ...x6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin | Bin 0 -> 11320 bytes ...f6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp | 965 +++ ...x5fugystcn2wozmmxwaf.wrapper_metadata.json | 1 + ...b7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin | Bin 0 -> 8968 bytes ...retoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin | Bin 0 -> 9784 bytes ...ugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin | Bin 0 -> 11320 bytes ...2j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin | Bin 0 -> 13832 bytes ...hvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin | Bin 0 -> 11320 bytes ...6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin | Bin 0 -> 10296 bytes ...kxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin | Bin 0 -> 13240 bytes ...lgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp | 6 + ...uw6cqsbpvx756nf43k7mq.kernel_metadata.json | 1 + ...ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin | Bin 0 -> 9528 bytes ...ksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp | 6 + ...4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json | 1 + ...rsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin | Bin 0 -> 9528 bytes ...pudstbhsobm3wlczsly46p5oeax43spr3eab.cubin | Bin 0 -> 21056 bytes ...reaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin | Bin 0 -> 10296 bytes ...rqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin | Bin 0 -> 11656 bytes ...oylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin | Bin 0 -> 10296 bytes ...wncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin | Bin 0 -> 15624 bytes ...47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin | Bin 0 -> 10816 bytes ...zkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin | Bin 0 -> 11320 bytes ...jkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin | Bin 0 -> 9528 bytes ...zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin | Bin 0 -> 11400 bytes ...zc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin | 
Bin 0 -> 13832 bytes ...vkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin | Bin 0 -> 6280 bytes exir/program/_program.py | 2 + export_add.py | 31 - export_and_run_aoti.sh | 141 +- export_aoti.py | 178 + export_mv2.py | 28 - runtime/executor/method.cpp | 2 + 61 files changed, 9754 insertions(+), 645 deletions(-) delete mode 100644 backends/aoti/runtime/AotiBackend.cpp create mode 100644 backends/aoti/runtime/aoti_backend.cpp create mode 100644 backends/aoti/runtime/aoti_model_container.cpp create mode 100644 backends/aoti/runtime/aoti_model_container.h create mode 100644 backends/aoti/runtime/shims/memory.cpp create mode 100644 backends/aoti/runtime/shims/memory.h create mode 100644 backends/aoti/runtime/shims/tensor_attribute.cpp create mode 100644 backends/aoti/runtime/shims/tensor_attribute.h create mode 100644 c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin create mode 100644 c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin create mode 100644 c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin create mode 100644 c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin create mode 100644 c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp create mode 100644 c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json create mode 100644 c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin create mode 100644 c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin create mode 100644 c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin create mode 100644 cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin create mode 100644 caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin create mode 100644 ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp create mode 100644 ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json create mode 100644 cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin create mode 100644 ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp create mode 100644 ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json create mode 100644 cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin create mode 100644 cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp create mode 100644 cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json create mode 100644 cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin create mode 100644 cgpouheql4rpwtcaretoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin create mode 100644 cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin create mode 100644 ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin create mode 100644 cklg2ezqvtkbhlekhvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin create mode 100644 ckneyyhrfy6dkwkb6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin create mode 100644 cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin create mode 100644 cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp create mode 100644 cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json create mode 100644 cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin create mode 100644 cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp create mode 100644 cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json create mode 100644 crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin create mode 100644 csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin create mode 100644 
ctc4njxfwewhkkjkreaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin create mode 100644 cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin create mode 100644 cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin create mode 100644 cwvumepeeo7fjwjgwncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin create mode 100644 cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin create mode 100644 cxn357cdpjzfyhgfzkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin create mode 100644 cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin create mode 100644 cxzopurug2u2kff3zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin create mode 100644 cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin create mode 100644 czj7vvfy745m4rwqvkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin delete mode 100644 export_add.py create mode 100644 export_aoti.py delete mode 100644 export_mv2.py diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 36059f16fe4..1c596fef6e6 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -25,7 +25,11 @@ endif() find_package(CUDAToolkit REQUIRED) -set(_aoti_sources runtime/AotiBackend.cpp) +set(_aoti_sources + runtime/aoti_backend.cpp + runtime/aoti_model_container.cpp + runtime/shims/memory.cpp + runtime/shims/tensor_attribute.cpp) add_library(aoti_backend STATIC ${_aoti_sources}) target_include_directories( aoti_backend diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index efc61006fc2..a0c4a2aa005 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -7,6 +7,7 @@ import copy import os import shutil +import typing from subprocess import check_call from typing import final, List @@ -29,18 +30,29 @@ def preprocess( ) -> PreprocessResult: print("entering the lowerable parts in AotiBackend.preprocess....") - print("here", edge_program.example_inputs) + # print("here", edge_program.example_inputs) copy_edge_program = copy.deepcopy(edge_program) - graph_module = copy_edge_program.graph_module + # graph_module = copy_edge_program.graph_module + edge_program_module = copy_edge_program.module() args, kwargs = copy_edge_program.example_inputs - temp_so_path = torch._inductor.aot_compile(graph_module, args, kwargs, options={}) # type: ignore[arg-type] - so_path = os.path.join(os.getcwd(), "aoti.so") - print("so_path after aot_compile: ", temp_so_path) - print("so path we will using ", so_path) - shutil.copyfile(temp_so_path, so_path) + # print("args, kwargs", args, kwargs) + print("len(args)", len(args)) + print("args[0].shape", args[0].shape) + print("len(kwargs)", len(kwargs)) + + output_path = os.path.join(os.getcwd(), "aoti.so") + + options: dict[str, typing.Any] = { + "aot_inductor.package_constants_in_so": True, + "aot_inductor.output_path": output_path, + } + + so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] + + assert so_path == output_path, f"Expected {output_path} but got {so_path}" check_call( - f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {so_path}", + f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {output_path}", shell=True, ) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index 836a8fcb8c4..f72b97f0253 100644 --- 
a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -6,6 +6,7 @@ # pyre-unsafe +import operator from typing import cast, final, List import torch @@ -25,14 +26,180 @@ supported_fallback_operators = [] +inductor_fallback_ops: dict[str, dict[str, list[str]]] = { + "aten._adaptive_avg_pool2d_backward.default": {}, + "aten._adaptive_avg_pool2d.default": {}, + "aten._adaptive_avg_pool3d_backward.default": {}, + "aten._adaptive_avg_pool3d.default": {}, + "aten._addmm_activation.default": {}, + "aten._cdist_backward.default": {}, + "aten._cdist_forward.default": {}, + "aten._cudnn_rnn.default": {}, + "aten._dyn_quant_matmul_4bit.default": {}, + "aten._dyn_quant_pack_4bit_weight.default": {}, + "aten._efficient_attention_backward.default": {}, + "aten._efficient_attention_forward.default": {}, + "aten._efficientzerotensor.default": {}, + "aten._embedding_bag_dense_backward.default": {}, + "aten._embedding_bag_forward_only.default": {}, + "aten._embedding_bag_per_sample_weights_backward.default": {}, + "aten._embedding_bag.default": {}, + "aten._fft_c2c.default": {}, + "aten._fft_r2c.default": {}, + "aten._flash_attention_backward.default": {}, + "aten._flash_attention_forward.default": {}, + "aten._fused_moving_avg_obs_fq_helper_functional.default": {}, + "aten._fused_moving_avg_obs_fq_helper.default": {}, + "aten._fused_rms_norm.default": {}, + "aten._histogramdd_from_bin_cts.default": {}, + "aten._int_mm.out": {}, + "aten._pdist_backward.default": {}, + "aten._pdist_forward.default": {}, + "aten._scaled_dot_product_attention_math_for_mps.default": {}, + "aten._scaled_dot_product_cudnn_attention_backward.default": {}, + "aten._scaled_dot_product_cudnn_attention.default": {}, + "aten._scaled_dot_product_efficient_attention_backward.default": {}, + "aten._scaled_dot_product_efficient_attention.default": {}, + "aten._scaled_dot_product_flash_attention_backward.default": {}, + "aten._scaled_dot_product_flash_attention_for_cpu_backward.default": {}, + "aten._scaled_dot_product_flash_attention_for_cpu.default": {}, + "aten._scaled_dot_product_flash_attention.default": {}, + "aten._scaled_dot_product_fused_attention_overrideable_backward.default": {}, + "aten._scaled_dot_product_fused_attention_overrideable.default": {}, + "aten._scaled_mm.default": {}, + "aten._scaled_mm.out": {}, + "aten._segment_reduce_backward.default": {}, + "aten._thnn_fused_lstm_cell.default": {}, + "aten._to_sparse.default": {}, + "aten._trilinear.default": {}, + "aten._weight_int4pack_mm.default": {}, + "aten._weight_int8pack_mm.default": {}, + "aten.abs.default": {}, + "aten.adaptive_max_pool2d_backward.default": {}, + "aten.adaptive_max_pool2d.default": {}, + "aten.adaptive_max_pool3d_backward.default": {}, + "aten.adaptive_max_pool3d.default": {}, + "aten.add.Scalar": {}, + "aten.add.Tensor": {}, + "aten.addbmm.default": {}, + "aten.addmm.out": {}, + "aten.addmv.default": {}, + "aten.angle.default": {}, + "aten.avg_pool2d_backward.default": {}, + "aten.avg_pool2d.default": {}, + "aten.avg_pool3d_backward.default": {}, + "aten.avg_pool3d.default": {}, + "aten.baddbmm.out": {}, + "aten.bernoulli_.float": {}, + "aten.bernoulli_.Tensor": {}, + "aten.bmm.out": {}, + "aten.bucketize.Tensor": {}, + "aten.cat.default": {}, + "aten.cholesky_inverse.default": {}, + "aten.cholesky_solve.default": {}, + "aten.convolution_backward.default": {}, + "aten.convolution.default": {}, + "aten.cummax.default": {}, + "aten.cummin.default": {}, + "aten.cumprod.default": {}, + "aten.cumsum.default": {}, + 
"aten.exponential.default": {}, + "aten.fill_.Scalar": {}, + "aten.fractional_max_pool2d_backward.default": {}, + "aten.fractional_max_pool2d.default": {}, + "aten.fractional_max_pool3d_backward.default": {}, + "aten.fractional_max_pool3d.default": {}, + "aten.gcd.default": {}, + "aten.geqrf.default": {}, + "aten.grid_sampler_2d_backward.default": {}, + "aten.hann_window.default": {}, + "aten.histc.default": {}, + "aten.histogram.bin_ct": {}, + "aten.index_put.default": {}, + "aten.index_reduce.default": {}, + "aten.index.Tensor": {}, + "aten.kthvalue.default": {}, + "aten.logcumsumexp.default": {}, + "aten.lu_unpack.default": {}, + "aten.masked_scatter_backward.default": {}, + "aten.masked_scatter.default": {}, + "aten.masked_select.default": {}, + "aten.max_pool2d_with_indices_backward.default": {}, + "aten.max_pool2d_with_indices.default": {}, + "aten.max_pool3d_with_indices_backward.default": {}, + "aten.max_pool3d_with_indices.default": {}, + "aten.max_unpool2d.default": {}, + "aten.max_unpool3d.default": {}, + "aten.median.default": {}, + "aten.mm.out": {}, + "aten.mode.default": {}, + "aten.mul.Scalar": {}, + "aten.mul.Tensor": {}, + "aten.nanmedian.default": {}, + "aten.narrow.default": {}, + "aten.native_dropout.default": {}, + "aten.nonzero.default": {}, + "aten.normal_functional.default": {}, + "aten.ormqr.default": {}, + "aten.pad.default": {}, + "aten.permute.default": {}, + "aten.polar.default": {}, + "aten.pow.Scalar": {}, + "aten.pow.Tensor_Scalar": {}, + "aten.pow.Tensor_Tensor": {}, + "aten.rand.default": {}, + "aten.rand.generator": {}, + "aten.randint.default": {}, + "aten.randint.generator": {}, + "aten.randint.low_out": {}, + "aten.randint.low": {}, + "aten.randn.default": {}, + "aten.randn.generator": {}, + "aten.randperm.default": {}, + "aten.repeat_interleave.Tensor": {}, + "aten.replication_pad1d_backward.default": {}, + "aten.replication_pad2d_backward.default": {}, + "aten.reshape.default": {}, + "aten.resize_.default": {}, + "aten.resize_as_.default": {}, + "aten.scatter_reduce.two_out": {}, + "aten.scatter.src_out": {}, + "aten.scatter.value_out": {}, + "aten.searchsorted.Scalar": {}, + "aten.searchsorted.Tensor": {}, + "aten.segment_reduce.default": {}, + "aten.set_.source_Tensor": {}, + "aten.slice.Tensor": {}, + "aten.soft_margin_loss_backward.default": {}, + "aten.sort.default": {}, + "aten.sort.stable": {}, + "aten.squeeze.dim": {}, + "aten.to_sparse.default": {}, + "aten.topk.default": {}, + "aten.triangular_solve.default": {}, + "aten.uniform.default": {}, + "aten.upsample_bicubic2d_backward.default": {}, + "aten.upsample_linear1d_backward.default": {}, + "aten.upsample_trilinear3d_backward.default": {}, + "aten.view_as_complex.default": {}, + "aten.view_as_real.default": {}, + "aten.view.dtype": {}, + "aten._weight_int4pack_mm_with_scales_and_zeros.default": {}, +} + class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: - supported = ( - node.op == "call_function" - and node.target not in supported_fallback_operators + supported = node.op == "call_function" and ( + node.target == operator.getitem + or node.target._op not in inductor_fallback_ops ) + # if node.op == "call_function" and node.target != operator.getitem: + # print(node.target._op) + # print(supported) + # print('------------------') + return supported def is_node_supported_custom(self, node: torch.fx.Node) -> bool: @@ -55,6 +222,8 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: # Run the 
CapabilityBasedPartitioner to return the largest possible # subgraphs containing the nodes with the tags # logger.info("AotiPartitioner::partition") + print("entering partitioner...") + partition_tags = {} capability_partitioner = CapabilityBasedPartitioner( @@ -63,6 +232,11 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: allows_single_node_partition=True, ) partition_list = capability_partitioner.propose_partitions() + + assert len(partition_list) == 1, "Graph break is not supported yet" + + print(f"graph breaks into {len(partition_list)} parts") + for partition in partition_list: for node in partition.nodes: tag = f"tag{partition.id}" diff --git a/backends/aoti/runtime/AotiBackend.cpp b/backends/aoti/runtime/AotiBackend.cpp deleted file mode 100644 index 94e15f0596f..00000000000 --- a/backends/aoti/runtime/AotiBackend.cpp +++ /dev/null @@ -1,561 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include "cuda_runtime.h" - -#include -#include -#include -#include -#include -#include - -namespace executorch { -namespace backends { -namespace aoti { - -// Here is where the aoti bouncers are going to be defined. -// I define the globals aoti generated compiled code calls -// They can be backed by ET systems - -using namespace std; - -using executorch::aten::ScalarType; -using executorch::runtime::ArrayRef; -using executorch::runtime::Backend; -using executorch::runtime::BackendExecutionContext; -using executorch::runtime::BackendInitContext; -using executorch::runtime::CompileSpec; -using executorch::runtime::DelegateHandle; -using executorch::runtime::Error; -using executorch::runtime::EValue; -using executorch::runtime::FreeableBuffer; -using executorch::runtime::MemoryAllocator; -using executorch::runtime::Result; -using executorch::runtime::etensor::Tensor; -using executorch::runtime::Span; - -extern "C" { -using AOTITensorHandle = Tensor*; - -// TODO: We should get a proper one -struct CUDAStreamGuardOpaque; -using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*; - -using AOTIRuntimeError = Error; -using AOTITorchError = Error; - -struct AOTInductorModelContainerOpaque; -using AOTInductorModelContainerHandle = AOTInductorModelContainerOpaque*; -using AOTInductorStreamHandle = void*; -using AOTIProxyExecutorHandle = void*; - -using AOTInductorModelContainerCreateWithDeviceFunc = AOTIRuntimeError (*)( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir); - -using AOTInductorModelContainerDeleteFunc = - AOTIRuntimeError (*)(AOTInductorModelContainerHandle container_handle); - -using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants); - -using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants); - -using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)( - AOTInductorModelContainerHandle container_handle, - AOTITensorHandle* input_handles, // array of input AOTITensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AOTITensorHandle* - output_handles, // array for writing 
output AOTITensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle); - -AOTInductorModelContainerCreateWithDeviceFunc - AOTInductorModelContainerCreateWithDevice = nullptr; -AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr; -AOTInductorModelContainerGetNumInputsFunc - AOTInductorModelContainerGetNumInputs = nullptr; -AOTInductorModelContainerGetNumOutputsFunc - AOTInductorModelContainerGetNumOutputs = nullptr; -AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr; -std::unordered_map> tensor_to_sizes; -std::unordered_map> tensor_to_strides; -std::unordered_set> tensors; - -int32_t aoti_torch_grad_mode_is_enabled() { - // No autograd ever - return false; -} - -void aoti_torch_grad_mode_set_enabled(bool enabled) { - if (enabled) { - throw std::runtime_error("Cannot enable autograd"); - } -} - -AOTITorchError aoti_torch_get_data_ptr( - AOTITensorHandle tensor, - void** ret_data_ptr) { - *ret_data_ptr = tensor->mutable_data_ptr(); - return Error::Ok; -} - -AOTITorchError aoti_torch_get_storage_offset( - AOTITensorHandle tensor, - int64_t* ret_storage_offset) { - // Storage offset is always 0 in ET - *ret_storage_offset = 0; - return Error::Ok; -} - -AOTITorchError aoti_torch_get_strides( - AOTITensorHandle tensor, - int64_t** ret_strides) { - auto it = tensor_to_strides.find(tensor); - if (it == tensor_to_strides.end()) { - std::vector strides(tensor->dim()); - auto tensor_strides = tensor->strides(); - for (int i = 0; i < tensor->dim(); i++) { - strides[i] = tensor_strides[i]; - } - it = tensor_to_strides.emplace(tensor, std::move(strides)).first; - } - *ret_strides = it->second.data(); - std::cout << "getting strides from tensor " << tensor << " with dim " - << tensor->dim() << std::endl; - for (int i = 0; i < tensor->dim(); i++) { - std::cout << "strides " << i << " = " << *ret_strides[i] << std::endl; - } - return Error::Ok; -} - -AOTITorchError aoti_torch_get_dtype( - AOTITensorHandle tensor, - int32_t* ret_dtype) { - *ret_dtype = static_cast(tensor->scalar_type()); - return Error::Ok; -} - -AOTITorchError aoti_torch_get_sizes( - AOTITensorHandle tensor, - int64_t** ret_sizes) { - auto it = tensor_to_sizes.find(tensor); - if (it == tensor_to_sizes.end()) { - std::vector sizes(tensor->dim()); - auto tensor_sizes = tensor->sizes(); - for (int i = 0; i < tensor->dim(); i++) { - sizes[i] = tensor_sizes[i]; - } - it = tensor_to_sizes.emplace(tensor, std::move(sizes)).first; - } - *ret_sizes = it->second.data(); - std::cout << "getting sizes from tensor " << tensor << " with dim " - << tensor->dim() << std::endl; - for (int i = 0; i < tensor->dim(); i++) { - std::cout << "size " << i << " = " << *ret_sizes[i] << std::endl; - } - return Error::Ok; -} - -AOTITorchError aoti_torch_get_storage_size( - AOTITensorHandle tensor, - int64_t* ret_size) { - throw std::runtime_error("Cannot get storage size on ETensor"); -} - -AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor, - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size) { - throw std::runtime_error("Not creating Tensor from blob here"); -} - -AOTITorchError aoti_torch_create_cuda_stream_guard( - void* stream, - 
int32_t device_index, - CUDAStreamGuardHandle* ret_guard) { - std::cout << "Entering stream guard for device " << device_index << std::endl; - return Error::Ok; -} - -AOTITorchError aoti_torch_delete_cuda_stream_guard( - CUDAStreamGuardHandle guard) { - std::cout << "Exiting stream guard" << std::endl; - return Error::Ok; -} - -int aoti_torch_device_type_cpu() { - // Let's say cpu is 0 for ET as well - return 0; -} - -__attribute__((__visibility__("default"))) int32_t -aoti_torch_device_type_cuda() { - // Let's say cuda is 1 for ET as well - return 1; -} - -__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() { - // Let assume the dtype here is all we will support - return 6; -} - -AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { - std::cout << "Deleting " << tensor << std::endl; - for (auto it = tensors.begin(); it != tensors.end(); ++it) { - if (it->get() == tensor) { - tensors.erase(it); - break; // Exit the loop once the element is found and removed - } - } - return Error::Ok; -} -AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor) { - throw std::runtime_error("Should never create from blob"); -} - -AOTITorchError aoti_torch_empty_strided( - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor) { - // This requires us to reserve CUDA memory and put it into a ETensor - void* ptr; - int64_t numel = 1; - for (int i = 0; i < ndim; i++) { - numel *= sizes_ptr[i]; - } - - if (dtype != 6) { // throw if not float32 - throw std::runtime_error("Need to implement empty_strided for non-float32"); - } - - int64_t nbytes = numel * 4; - - if (device_type == 1) { // cuda - std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl; - cudaError_t err = cudaMalloc(&ptr, nbytes); - if (err != cudaSuccess) { - std::cout << "failed to allocate " << nbytes << std::endl; - throw std::runtime_error("Failed to call cudaMalloc"); - } - } else if (device_type == 0) { // cpu - std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl; - ptr = malloc(nbytes); - if (ptr == nullptr) { - throw std::runtime_error("Failed to call malloc"); - } - } else { - throw std::runtime_error( - "Need to implement empty_strided for non-CUDA non-CPU"); - } - std::cout << "Allocated " << nbytes << " bytes at " << ptr << ", sizes_ptr " - << sizes_ptr << std::endl; - - // ETensor sizes - std::vector sizes(ndim); - for (int i = 0; i < ndim; i++) { - sizes[i] = sizes_ptr[i]; - } - // ETensor creation - auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr); - - // Store the tensor - tensors.insert(tensor); - - std::cout << "sizes.data(): " << sizes.data() - << ", tensor->sizes().data(): " << tensor->sizes().data() - << std::endl; - std::cout << "Size[0] of tensor " << tensor.get() << " is " - << tensor->sizes()[0] << std::endl; - *ret_new_tensor = tensor.get(); - return Error::Ok; -} - -void checkCudaError(cudaError_t err, const char* msg) { - if (err != cudaSuccess) { - std::cerr << "Error: " << msg << " (" << cudaGetErrorString(err) << ")" - << std::endl; - exit(EXIT_FAILURE); - } -} - -AOTITorchError aoti_torch_copy_( - AOTITensorHandle self, - AOTITensorHandle src, - int32_t non_blocking) { - // check if size is the same - if 
(self->dim() != src->dim()) { - std::cout << "self.dim() " << self->dim() << ", src.dim() " << src->dim() - << std::endl; - throw std::runtime_error("self.dim() != src.dim()"); - } - std::cout << "self->data_ptr(): " << self->data_ptr() - << " sizes: " << self->sizes().data() << std::endl; - std::cout << "src->data_ptr(): " << src->data_ptr() - << " sizes: " << src->sizes().data() << std::endl; - for (int i = 0; i < self->dim(); i++) { - if (self->sizes()[i] != src->sizes()[i]) { - std::cout << "self.sizes()[i] " << self->sizes()[i] << ", src.sizes()[i] " - << src->sizes()[i] << std::endl; - throw std::runtime_error("size mismatch"); - } - } - - int size = src->nbytes(); - // should check for device - cudaPointerAttributes srcAttributes, dstAttributes; - cudaError_t err; - // Get attributes of the source pointer - err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr()); - checkCudaError(err, "Failed to get source pointer attributes"); - // Get attributes of the destination pointer - err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr()); - checkCudaError(err, "Failed to get destination pointer attributes"); - bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; - bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; - // Determine the memory locations and perform the appropriate copy - if (srcIsDevice && dstIsDevice) { - // Device to Device copy - err = cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - size, - cudaMemcpyDeviceToDevice); - checkCudaError(err, "Failed to copy from device to device"); - } else if (srcIsDevice && !dstIsDevice) { - // Device to Host copy - err = cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - size, - cudaMemcpyDeviceToHost); - std::cout << "Device to Host copy, self data: " - << ((float*)self->data_ptr())[0] << std::endl; - checkCudaError(err, "Failed to copy from device to host"); - } else if (!srcIsDevice && dstIsDevice) { - // Host to Device copy - err = cudaMemcpy( - self->mutable_data_ptr(), - src->data_ptr(), - size, - cudaMemcpyHostToDevice); - std::cout << "Host to Device copy, src data: " - << ((float*)src->data_ptr())[0] << std::endl; - checkCudaError(err, "Failed to copy from host to device"); - } else if (!srcIsDevice && !dstIsDevice) { - // Host to Host copy - std::cout << "Host to Host copy, src data: " << ((float*)src->data_ptr())[0] - << std::endl; - std::memcpy(self->mutable_data_ptr(), src->data_ptr(), size); - } else { - std::cerr << "Error: Unknown memory type. 
self: " << dstAttributes.type - << ", src: " << srcAttributes.type << std::endl; - throw std::runtime_error("Unknown memory type"); - } - // print first value of src and self - return Error::Ok; -} -} - -struct AOTIDelegateHandle { - void* so_handle; - AOTInductorModelContainerHandle container_handle; -}; - -class AOTIBackend final : public ::executorch::runtime::BackendInterface { - public: - // Once in program - AOTIBackend() { - ET_LOG(Info, "AOTIBackend ctor"); - } - - bool is_available() const override { - return 1; - } - - // Once per loaded binary blob - Result init( - BackendInitContext& context, - FreeableBuffer* processed, // This will be the buffer from aoti_backend - ArrayRef compile_specs // This will be my empty list - ) const override { - const char* so_path = static_cast(processed->data()); - - printf("so path: %s\n", so_path); - - // Load the ELF using dlopen - void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); - if (so_handle == nullptr) { - std::cout << dlerror() << std::endl; - return Error::AccessFailed; - } - - processed->Free(); - - AOTInductorModelContainerCreateWithDevice = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); - if (AOTInductorModelContainerCreateWithDevice == nullptr) { - perror("dlsym1"); - return Error::AccessFailed; - } - AOTInductorModelContainerDelete = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerDelete")); - if (AOTInductorModelContainerDelete == nullptr) { - perror("dlsym2"); - return Error::AccessFailed; - } - AOTInductorModelContainerGetNumInputs = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerGetNumInputs")); - if (AOTInductorModelContainerGetNumInputs == nullptr) { - perror("dlsym3"); - return Error::AccessFailed; - } - AOTInductorModelContainerGetNumOutputs = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs")); - if (AOTInductorModelContainerGetNumOutputs == nullptr) { - perror("dlsym4"); - return Error::AccessFailed; - } - AOTInductorModelContainerRun = - reinterpret_cast( - dlsym(so_handle, "AOTInductorModelContainerRun")); - if (AOTInductorModelContainerRun == nullptr) { - perror("dlsym5"); - return Error::AccessFailed; - } - - AOTInductorModelContainerHandle container_handle = nullptr; - - AOTIRuntimeError err; - - err = AOTInductorModelContainerCreateWithDevice( - &container_handle, 1, "cuda", nullptr); - printf("container_handle=%p\n", container_handle); - - AOTIDelegateHandle* handle = new AOTIDelegateHandle(); - handle->so_handle = so_handle; - handle->container_handle = container_handle; - return (DelegateHandle*)handle; // Return the handle post-processing - } - - // Once per execution - Error execute( - BackendExecutionContext& context, - DelegateHandle* handle_, - Span args) const override { - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - - size_t num_inputs; - AOTInductorModelContainerGetNumInputs( - handle->container_handle, &num_inputs); - - size_t num_outputs; - AOTInductorModelContainerGetNumOutputs( - handle->container_handle, &num_outputs); - - std::vector inputs(num_inputs); - std::vector outputs(num_outputs); - - for (int i = 0; i < num_inputs; i++) { - auto tensor_in = args[i]->toTensor(); - inputs[i] = &tensor_in; - } - - for (int i = num_inputs; i < num_inputs + num_outputs; i++) { - auto tensor_out = args[i]->toTensor(); - outputs[i - num_inputs] = &tensor_out; - } - - AOTInductorModelContainerRun( - handle->container_handle, - inputs.data(), - num_inputs, - outputs.data(), - num_outputs, 
- // Should these last two be something? - nullptr, - nullptr); - - // Still need to copy the output to args, because they are malloc'ed but - // not using the data_ptr from outputs. - for (int i = 0; i < num_outputs; i++) { - auto args_out = args[i + num_inputs]->toTensor(); - aoti_torch_copy_(&args_out, outputs[i], 0); - } - return Error::Ok; - } - - void destroy(DelegateHandle* handle_) const override { - AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - dlclose(handle->so_handle); - AOTInductorModelContainerDelete(handle->container_handle); - free(handle); - tensor_to_sizes.clear(); - tensor_to_strides.clear(); - } -}; - -} // namespace aoti - -namespace { -auto cls = aoti::AOTIBackend(); -executorch::runtime::Backend backend{"AotiBackend", &cls}; -static executorch::runtime::Error success_with_compiler = - register_backend(backend); -} // namespace - -} // namespace backends -} // namespace executorch diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp new file mode 100644 index 00000000000..65d28a7a1ff --- /dev/null +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +// Include our shim layer headers +#include "aoti_model_container.h" +#include "shims/memory.h" +#include "shims/tensor_attribute.h" + +namespace executorch { +namespace backends { +namespace aoti { + +using namespace std; + +using executorch::aten::ScalarType; +using executorch::runtime::ArrayRef; +using executorch::runtime::Backend; +using executorch::runtime::BackendExecutionContext; +using executorch::runtime::BackendInitContext; +using executorch::runtime::CompileSpec; +using executorch::runtime::DelegateHandle; +using executorch::runtime::Error; +using executorch::runtime::EValue; +using executorch::runtime::FreeableBuffer; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::Result; +using executorch::runtime::Span; +using executorch::runtime::etensor::Tensor; + +class AOTIBackend final : public ::executorch::runtime::BackendInterface { + public: + // Once in program + AOTIBackend() { + ET_LOG(Info, "AOTIBackend ctor"); + } + + bool is_available() const override { + return 1; + } + + // Once per loaded binary blob + Result<DelegateHandle*> init( + BackendInitContext& context, + FreeableBuffer* processed, // This will be the buffer from aoti_backend + ArrayRef<CompileSpec> compile_specs // This will be my empty list + ) const override { + const char* so_path = static_cast<const char*>(processed->data()); + + printf("so path: %s\n", so_path); + + // Load the ELF using dlopen + void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL); + if (so_handle == nullptr) { + std::cout << dlerror() << std::endl; + return Error::AccessFailed; + } + + processed->Free(); + + AOTInductorModelContainerCreateWithDevice = + reinterpret_cast<AOTInductorModelContainerCreateWithDeviceFunc>( + dlsym(so_handle, "AOTInductorModelContainerCreateWithDevice")); + if (AOTInductorModelContainerCreateWithDevice == nullptr) { + perror("dlsym1"); + return Error::AccessFailed; + } + AOTInductorModelContainerDelete = + reinterpret_cast<AOTInductorModelContainerDeleteFunc>( + dlsym(so_handle, "AOTInductorModelContainerDelete")); + if (AOTInductorModelContainerDelete == nullptr) { + perror("dlsym2"); +
+    AOTInductorModelContainerGetNumInputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumInputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumInputs"));
+    if (AOTInductorModelContainerGetNumInputs == nullptr) {
+      perror("dlsym3");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumConstants =
+        reinterpret_cast<AOTInductorModelContainerGetNumConstantsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumConstants"));
+    if (AOTInductorModelContainerGetNumConstants == nullptr) {
+      perror("dlsym AOTInductorModelContainerGetNumConstants");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerGetNumOutputs =
+        reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
+    if (AOTInductorModelContainerGetNumOutputs == nullptr) {
+      perror("dlsym4");
+      return Error::AccessFailed;
+    }
+    AOTInductorModelContainerRun =
+        reinterpret_cast<AOTInductorModelContainerRunFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerRun"));
+    if (AOTInductorModelContainerRun == nullptr) {
+      perror("dlsym5");
+      return Error::AccessFailed;
+    }
+
+    AOTInductorModelContainerHandle container_handle = nullptr;
+
+    AOTIRuntimeError err = AOTInductorModelContainerCreateWithDevice(
+        &container_handle, 1, "cuda", nullptr);
+    if (err != Error::Ok) {
+      return err;
+    }
+    printf("container_handle = %p\n", container_handle);
+
+    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    handle->so_handle = so_handle;
+    handle->container_handle = container_handle;
+    return (DelegateHandle*)handle; // Return the handle post-processing
+  }
+
+  // Once per execution
+  Error execute(
+      BackendExecutionContext& context,
+      DelegateHandle* handle_,
+      Span<EValue*> args) const override {
+    ET_LOG(Debug, "AOTIBackend execute");
+
+    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+
+    ET_LOG(Debug, "AOTIBackend Handle generated");
+
+    size_t n_inputs, n_constants;
+    AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
+    AOTInductorModelContainerGetNumConstants(
+        handle->container_handle, &n_constants);
+    size_t n_user_inputs = n_inputs - n_constants;
+
+    // Constant inputs baked into the .so are not handled yet, so every
+    // container input must be a user input.
+    if (n_user_inputs != n_inputs) {
+      ET_LOG(
+          Error,
+          "Number of user inputs does not match number of inputs: n_user_inputs %zd, n_constants %zd, n_inputs %zd. Exiting.",
+          n_user_inputs,
+          n_constants,
+          n_inputs);
+      return Error::InvalidArgument;
+    }
+
+    ET_LOG(
+        Debug,
+        "AOTIBackend n_inputs %zd generated, of which %zd are constant inputs and %zd are user inputs",
+        n_inputs,
+        n_constants,
+        n_user_inputs);
+
+    size_t n_outputs;
+    AOTInductorModelContainerGetNumOutputs(
+        handle->container_handle, &n_outputs);
+
+    ET_LOG(Debug, "AOTIBackend n_outputs %zd generated", n_outputs);
+
+    // args is laid out as [inputs..., outputs...], so its length must equal
+    // the container's input count plus its output count.
+    if (n_inputs + n_outputs != args.size()) {
+      ET_LOG(
+          Error,
+          "Number of inputs %zd and outputs %zd generated from AOT Inductor does not match the ET runner's %zd args. Exiting.",
+          n_inputs,
+          n_outputs,
+          args.size());
+      return Error::InvalidArgument;
+    }
+
+    ET_LOG(
+        Debug,
+        "Number of inputs %zd and outputs %zd generated from AOT Inductor matches the ET runner's %zd args.",
+        n_inputs,
+        n_outputs,
+        args.size());
+
+    std::vector<AOTITensorHandle> inputs(n_inputs);
+    std::vector<AOTITensorHandle> outputs(n_outputs);
+
+    ET_LOG(Debug, "AOTIBackend input/output vectors generated");
+
+    for (size_t i = 0; i < n_inputs; i++) {
+      ET_LOG(Debug, "Copying input %zd from args to the inputs vector", i);
+      ET_LOG(Debug, "Is input %zd a tensor? %d", i, int(args[i]->isTensor()));
+      inputs[i] = &(args[i]->toTensor());
+    }
+
+    ET_LOG(Debug, "AOTIBackend input generated");
+
+    for (size_t i = 0; i < n_outputs; i++) {
+      outputs[i] = &(args[i + n_inputs]->toTensor());
+    }
+
+    ET_LOG(Debug, "AOTIBackend output generated");
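+
+    // A fresh stream per execute() keeps the delegate self-contained; caching
+    // a stream on AOTIDelegateHandle would be a possible refinement to avoid
+    // the per-inference create/destroy cost.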
%d", i, int(args[i]->isTensor())); + inputs[i] = &(args[i]->toTensor()); + } + + ET_LOG(Debug, "AOTIBackend input generated"); + + for (int i = 0; i < n_outputs; i++) { + outputs[i] = &(args[i + n_inputs]->toTensor()); + } + + ET_LOG(Debug, "AOTIBackend output generated"); + + // Create a CUDA stream for this execution + cudaStream_t cuda_stream; + cudaError_t stream_err = cudaStreamCreate(&cuda_stream); + if (stream_err != cudaSuccess) { + ET_LOG( + Error, + "Failed to create CUDA stream: %s", + cudaGetErrorString(stream_err)); + return Error::Internal; + } + + ET_LOG(Debug, "Created CUDA stream: %p", cuda_stream); + + // Run AOTI container with the stream (AOTI will create its own stream guard + // internally) + AOTIRuntimeError error = AOTInductorModelContainerRun( + handle->container_handle, + inputs.data(), + n_inputs, + outputs.data(), + n_outputs, + cuda_stream, // Pass the actual CUDA stream! + nullptr); // proxy_executor_handle can remain nullptr + + if (error != Error::Ok) { + ET_LOG( + Error, + "AOTInductorModelContainerRun failed with error code %d", + error); + return Error::Internal; + } + + ET_LOG(Debug, "AOTIBackend running done"); + + // Synchronize and destroy the CUDA stream + cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); + if (sync_err != cudaSuccess) { + ET_LOG( + Error, + "Failed to synchronize CUDA stream: %s", + cudaGetErrorString(sync_err)); + // Continue anyway to avoid fatal errors + } + + cudaStreamDestroy(cuda_stream); + ET_LOG(Debug, "CUDA stream synchronized and destroyed"); + + // Still need to copy the output to args, because they are malloc'ed but + // not using the data_ptr from outputs. + for (int i = 0; i < n_outputs; i++) { + auto args_out = args[i + n_inputs]->toTensor(); + aoti_torch_copy_(&args_out, outputs[i], 0); + } + + ET_LOG(Debug, "AOTIBackend output copied"); + + return Error::Ok; + } + + void destroy(DelegateHandle* handle_) const override { + ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); + AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; + dlclose(handle->so_handle); + AOTInductorModelContainerDelete(handle->container_handle); + free(handle); + cleanup_memory(); + cleanup_tensor_metadata(); + } +}; + +} // namespace aoti + +namespace { +auto cls = aoti::AOTIBackend(); +executorch::runtime::Backend backend{"AotiBackend", &cls}; +static executorch::runtime::Error success_with_compiler = + register_backend(backend); +} // namespace + +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/aoti_model_container.cpp b/backends/aoti/runtime/aoti_model_container.cpp new file mode 100644 index 00000000000..0809a677a81 --- /dev/null +++ b/backends/aoti/runtime/aoti_model_container.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+
+#include "aoti_model_container.h"
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+extern "C" {
+
+// Global function pointers for AOT Inductor model container operations
+// These will be loaded dynamically from the shared library
+AOTInductorModelContainerCreateWithDeviceFunc
+    AOTInductorModelContainerCreateWithDevice = nullptr;
+AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr;
+AOTInductorModelContainerGetNumInputsFunc
+    AOTInductorModelContainerGetNumInputs = nullptr;
+AOTInductorModelContainerGetNumConstantsFunc
+    AOTInductorModelContainerGetNumConstants = nullptr;
+AOTInductorModelContainerGetNumOutputsFunc
+    AOTInductorModelContainerGetNumOutputs = nullptr;
+AOTInductorModelContainerRunFunc AOTInductorModelContainerRun = nullptr;
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/aoti_model_container.h b/backends/aoti/runtime/aoti_model_container.h
new file mode 100644
index 00000000000..2078490022d
--- /dev/null
+++ b/backends/aoti/runtime/aoti_model_container.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include "shims/memory.h"
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+
+// Type definitions
+using AOTITensorHandle = Tensor*;
+using AOTIRuntimeError = Error;
+
+// Forward declarations for AOT Inductor model container
+struct AOTInductorModelContainerOpaque;
+using AOTInductorModelContainerHandle = AOTInductorModelContainerOpaque*;
+using AOTInductorStreamHandle = void*;
+using AOTIProxyExecutorHandle = void*;
+
+// Function pointer types for AOT Inductor model container operations
+using AOTInductorModelContainerCreateWithDeviceFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle* container_handle,
+    size_t num_models,
+    const char* device_str,
+    const char* cubin_dir);
+
+using AOTInductorModelContainerDeleteFunc =
+    AOTIRuntimeError (*)(AOTInductorModelContainerHandle container_handle);
+
+using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_inputs);
+
+using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_constants);
+
+using AOTInductorModelContainerGetNumOutputsFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* num_outputs);
+
+using AOTInductorModelContainerRunFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    AOTITensorHandle* input_handles, // array of input AOTITensorHandle; handles
+                                     // are stolen; the array itself is borrowed
+    size_t num_inputs,
+    AOTITensorHandle*
+        output_handles, // array for writing output AOTITensorHandle; handles
+                        // will be stolen by the caller; the array itself is
+                        // borrowed
+    size_t n_outputs,
+    AOTInductorStreamHandle stream_handle,
+    AOTIProxyExecutorHandle proxy_executor_handle);
+
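+// The entry points above are exported by the AOTInductor-generated .so, not
+// by this runtime, so they cannot be linked statically; the backend resolves
+// them with dlsym() at init() time into the pointers declared below.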
+// Global function pointers (will be loaded dynamically)
+extern AOTInductorModelContainerCreateWithDeviceFunc
+    AOTInductorModelContainerCreateWithDevice;
+extern AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete;
+extern AOTInductorModelContainerGetNumInputsFunc
+    AOTInductorModelContainerGetNumInputs;
+extern AOTInductorModelContainerGetNumConstantsFunc
+    AOTInductorModelContainerGetNumConstants;
+extern AOTInductorModelContainerGetNumOutputsFunc
+    AOTInductorModelContainerGetNumOutputs;
+extern AOTInductorModelContainerRunFunc AOTInductorModelContainerRun;
+
+} // extern "C"
+
+// AOTI Delegate Handle structure
+struct AOTIDelegateHandle {
+  void* so_handle;
+  AOTInductorModelContainerHandle container_handle;
+};
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp
new file mode 100644
index 00000000000..cadd021f51f
--- /dev/null
+++ b/backends/aoti/runtime/shims/memory.cpp
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "memory.h"
+#include <cuda_runtime.h>
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <stdexcept>
+#include <unordered_map>
+#include <vector>
+#include "tensor_attribute.h"
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+// Global storage for tensors and their metadata
+std::unordered_set<std::shared_ptr<Tensor>> tensors;
+std::unordered_map<Tensor*, bool> is_tensor_own_memory;
+
+extern "C" {
+
+AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor,
+    int32_t layout,
+    const uint8_t* opaque_metadata,
+    int64_t opaque_metadata_size) {
+  std::cout << "Creating tensor from data blob " << data << " - ndim: " << ndim
+            << ", dtype: " << dtype << ", device_type: " << device_type
+            << std::endl;
+
+  // Convert sizes to the format expected by ExecuTorch
+  std::vector<executorch::aten::SizesType> sizes(ndim);
+  for (int i = 0; i < ndim; i++) {
+    sizes[i] = static_cast<executorch::aten::SizesType>(sizes_ptr[i]);
+    std::cout << "Size[" << i << "] = " << sizes[i] << std::endl;
+  }
+
+  // Check the tensor format; only contiguous strides are supported for now.
+  int64_t expected_stride = 1;
+  for (int i = ndim - 1; i >= 0; --i) {
+    if (strides_ptr[i] != expected_stride) {
+      std::cout
+          << "aoti_torch_create_tensor_from_blob_v2 failed since the input strides are not contiguous. Returning with an error."
+          << std::endl;
+      return Error::InvalidArgument;
+    }
+    expected_stride *= sizes_ptr[i];
+  }
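+  // Example: sizes [2, 3, 4] admit exactly one contiguous layout, strides
+  // [12, 4, 1]; walking right-to-left, each stride must equal the product of
+  // the sizes to its right, which is what the loop above verifies.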
+
+  // Adjust data pointer by storage_offset if needed
+  void* adjusted_data = data;
+  if (storage_offset > 0) {
+    // Calculate byte offset based on dtype size
+    size_t dtype_size = 4; // Assuming float32 for now; other dtypes need handling
+    if (dtype == 6) { // float32
+      dtype_size = 4;
+    } else {
+      std::cout << "Error: Unhandled dtype " << dtype << std::endl;
+      return Error::NotImplemented;
+    }
+    adjusted_data = static_cast<char*>(data) + (storage_offset * dtype_size);
+  }
+
+  // Create an ExecuTorch tensor that wraps the existing memory
+  // Note: We're NOT copying the data, just wrapping it
+  auto tensor = executorch::extension::make_tensor_ptr(
+      sizes, // tensor dimensions
+      adjusted_data, // existing memory (don't copy!)
+      executorch::aten::ScalarType::Float // only supported dtype
+  );
+
+  if (!tensor) {
+    std::cerr << "Failed to create tensor from blob" << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Store the tensor so it doesn't get destroyed
+  tensors.insert(tensor);
+
+  *ret_new_tensor = tensor.get();
+
+  is_tensor_own_memory[tensor.get()] = false;
+
+  std::cout << "Successfully created tensor from blob: " << tensor.get()
+            << " wrapping data at: " << adjusted_data << std::endl;
+
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_create_tensor_from_blob(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor) {
+  throw std::runtime_error("Should never create from blob");
+}
+
+AOTITorchError aoti_torch_empty_strided(
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor) {
+  // This requires us to reserve CUDA memory and put it into an ETensor
+  void* ptr;
+  int64_t numel = 1;
+  for (int i = 0; i < ndim; i++) {
+    numel *= sizes_ptr[i];
+  }
+
+  if (dtype != 6) { // throw if not float32
+    throw std::runtime_error("Need to implement empty_strided for non-float32");
+  }
+
+  int64_t nbytes = numel * 4;
+
+  if (device_type == 1) { // cuda
+    std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl;
+    cudaError_t err = cudaMalloc(&ptr, nbytes);
+    if (err != cudaSuccess) {
+      std::cout << "failed to allocate " << nbytes << std::endl;
+      throw std::runtime_error("Failed to call cudaMalloc");
+    }
+  } else if (device_type == 0) { // cpu
+    std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl;
+    ptr = malloc(nbytes);
+    if (ptr == nullptr) {
+      throw std::runtime_error("Failed to call malloc");
+    }
+  } else {
+    throw std::runtime_error(
+        "Need to implement empty_strided for non-CUDA non-CPU");
+  }
+  std::cout << "Allocated " << nbytes << " bytes at " << ptr << ", sizes_ptr "
+            << sizes_ptr << std::endl;
+
+  // ETensor sizes
+  std::vector<executorch::aten::SizesType> sizes(ndim);
+  for (int i = 0; i < ndim; i++) {
+    sizes[i] = sizes_ptr[i];
+  }
+  // ETensor creation
+  auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr);
+
+  // Store the tensor
+  tensors.insert(tensor);
+
+  std::cout << "sizes.data(): " << sizes.data()
+            << ", tensor->sizes().data(): " << tensor->sizes().data()
+            << std::endl;
+  std::cout << "Size[0] of tensor " << tensor.get() << " is "
+            << tensor->sizes()[0] << std::endl;
+  *ret_new_tensor = tensor.get();
+  is_tensor_own_memory[tensor.get()] = true;
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) {
+  std::cout << "Called aoti_torch_delete_tensor_object for tensor " << tensor
+            << std::endl;
+
+  // Check ownership before cleaning up metadata
+  auto ownership_it = is_tensor_own_memory.find(tensor);
+  bool owns_memory = (ownership_it != is_tensor_own_memory.end())
+      ? ownership_it->second
+      : false;
+
+  // Clean up ALL metadata maps immediately to prevent use-after-free
+  tensor_to_sizes.erase(tensor);
+  tensor_to_strides.erase(tensor);
+  is_tensor_own_memory.erase(tensor);
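+
+  // Tensors from empty_strided own their buffer (allocated above with
+  // cudaMalloc/malloc), while tensors wrapped around a caller's blob do not;
+  // only owned buffers are freed below.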
Skipped \n\n" + << std::endl; + return Error::Ok; + } + + for (auto it = tensors.begin(); it != tensors.end(); ++it) { + if (it->get() == tensor) { + // Get the tensor before erasing + auto tensor_ptr = *it; + + void* data_ptr = tensor_ptr->mutable_data_ptr(); + + // Determine if it's GPU memory + cudaPointerAttributes attributes; + cudaError_t err = cudaPointerGetAttributes(&attributes, data_ptr); + + // et tensor does not own data; need to free them manually. + if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) { + // This is GPU memory - free with proper synchronization + std::cout << "Freeing GPU memory at " << data_ptr << std::endl; + cudaDeviceSynchronize(); // Wait for all operations to complete BEFORE + // freeing + cudaFree(data_ptr); + std::cout << "GPU memory freed successfully" << std::endl; + } else { + // This is CPU memory - free immediately + std::cout << "Freeing CPU memory at " << data_ptr << std::endl; + free(data_ptr); + std::cout << "CPU memory freed successfully" << std::endl; + } + + std::cout << "Memory freed. Now erasing tensor " << tensor << std::endl; + + // Remove from set (this will call the destructor if it's the last + // reference) + tensors.erase(it); + + std::cout << "Tensor erased. Now returning \n\n" << std::endl; + + return Error::Ok; + } + } + std::cout << "Error: Didn't find tensor " << tensor << std::endl; + return Error::InvalidArgument; +} + +void checkCudaError(cudaError_t err, const char* msg) { + if (err != cudaSuccess) { + std::cerr << "Error: " << msg << " (" << cudaGetErrorString(err) << ")" + << std::endl; + exit(EXIT_FAILURE); + } +} + +AOTITorchError aoti_torch_copy_( + AOTITensorHandle self, + AOTITensorHandle src, + int32_t non_blocking) { + // check if size is the same + if (self->dim() != src->dim()) { + std::cout << "self.dim() " << self->dim() << ", src.dim() " << src->dim() + << std::endl; + throw std::runtime_error("self.dim() != src.dim()"); + } + std::cout << "self->data_ptr(): " << self->data_ptr() + << " sizes: " << self->sizes().data() << std::endl; + std::cout << "src->data_ptr(): " << src->data_ptr() + << " sizes: " << src->sizes().data() << std::endl; + for (int i = 0; i < self->dim(); i++) { + if (self->sizes()[i] != src->sizes()[i]) { + std::cout << "self.sizes()[i] " << self->sizes()[i] << ", src.sizes()[i] " + << src->sizes()[i] << std::endl; + throw std::runtime_error("size mismatch"); + } + } + + int size = src->nbytes(); + // should check for device + cudaPointerAttributes srcAttributes, dstAttributes; + cudaError_t err; + // Get attributes of the source pointer + err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr()); + checkCudaError(err, "Failed to get source pointer attributes"); + // Get attributes of the destination pointer + err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr()); + checkCudaError(err, "Failed to get destination pointer attributes"); + bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; + bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; + // Determine the memory locations and perform the appropriate copy + if (srcIsDevice && dstIsDevice) { + // Device to Device copy + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + size, + cudaMemcpyDeviceToDevice); + checkCudaError(err, "Failed to copy from device to device"); + } else if (srcIsDevice && !dstIsDevice) { + // Device to Host copy + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + size, + cudaMemcpyDeviceToHost); + std::cout << "Device to Host copy, 
self data: " + << ((float*)self->data_ptr())[0] << std::endl; + checkCudaError(err, "Failed to copy from device to host"); + } else if (!srcIsDevice && dstIsDevice) { + // Host to Device copy + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + size, + cudaMemcpyHostToDevice); + std::cout << "Host to Device copy, src data: " + << ((float*)src->data_ptr())[0] << std::endl; + checkCudaError(err, "Failed to copy from host to device"); + } else if (!srcIsDevice && !dstIsDevice) { + // Host to Host copy + std::cout << "Host to Host copy, src data: " << ((float*)src->data_ptr())[0] + << std::endl; + std::memcpy(self->mutable_data_ptr(), src->data_ptr(), size); + } else { + std::cerr << "Error: Unknown memory type. self: " << dstAttributes.type + << ", src: " << srcAttributes.type << std::endl; + throw std::runtime_error("Unknown memory type"); + } + // print first value of src and self + return Error::Ok; +} + +AOTITorchError aoti_torch_create_cuda_stream_guard( + void* stream, + int32_t device_index, + CUDAStreamGuardHandle* ret_guard) { + std::cout << "Entering stream guard for device " << device_index + << " with stream " << stream << std::endl; + + // Set device + cudaError_t err = cudaSetDevice(device_index); + if (err != cudaSuccess) { + std::cerr << "Failed to set device " << device_index << ": " + << cudaGetErrorString(err) << std::endl; + return Error::Internal; + } + + // Create minimal guard structure + CUDAStreamGuardOpaque* guard = new CUDAStreamGuardOpaque(); + guard->device_index = device_index; + guard->original_stream = static_cast(stream); + guard->sync_event = nullptr; + + std::cout << "Stream guard created successfully for stream " << stream + << std::endl; + + *ret_guard = guard; + return Error::Ok; +} + +AOTITorchError aoti_torch_delete_cuda_stream_guard( + CUDAStreamGuardHandle guard) { + std::cout << "Exiting stream guard" << std::endl; + + if (guard == nullptr) { + return Error::Ok; + } + + // Clean up the guard structure + delete guard; + + std::cout << "Stream guard cleanup completed" << std::endl; + return Error::Ok; +} + +// Cleanup function for clearing global state +void cleanup_memory() { + is_tensor_own_memory.clear(); + if (!tensors.empty()) { + std::cout << "Warning: tensors not empty" << std::endl; + } +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h new file mode 100644 index 00000000000..bcbb33d0e99 --- /dev/null +++ b/backends/aoti/runtime/shims/memory.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+
+// Type definitions
+using AOTITensorHandle = Tensor*;
+using AOTIRuntimeError = Error;
+using AOTITorchError = Error;
+
+struct CUDAStreamGuardOpaque {
+  cudaStream_t original_stream;
+  int device_index;
+  cudaEvent_t sync_event;
+};
+using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*;
+
+// Global storage declarations
+extern std::unordered_map<Tensor*, bool> is_tensor_own_memory;
+extern std::unordered_set<std::shared_ptr<Tensor>> tensors;
+
+// Memory-related operations
+AOTITorchError aoti_torch_create_tensor_from_blob_v2(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor,
+    int32_t layout,
+    const uint8_t* opaque_metadata,
+    int64_t opaque_metadata_size);
+
+AOTITorchError aoti_torch_create_tensor_from_blob(
+    void* data,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor);
+
+AOTITorchError aoti_torch_empty_strided(
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int32_t dtype,
+    int32_t device_type,
+    int32_t device_index,
+    AOTITensorHandle* ret_new_tensor);
+
+AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor);
+
+AOTITorchError aoti_torch_copy_(
+    AOTITensorHandle self,
+    AOTITensorHandle src,
+    int32_t non_blocking);
+
+AOTITorchError aoti_torch_create_cuda_stream_guard(
+    void* stream,
+    int32_t device_index,
+    CUDAStreamGuardHandle* ret_guard);
+
+AOTITorchError aoti_torch_delete_cuda_stream_guard(
+    CUDAStreamGuardHandle guard);
+
+// Utility functions
+void checkCudaError(cudaError_t err, const char* msg);
+void cleanup_memory();
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp
new file mode 100644
index 00000000000..b5333f50ea9
--- /dev/null
+++ b/backends/aoti/runtime/shims/tensor_attribute.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "tensor_attribute.h"
+#include <iostream>
+#include <stdexcept>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+// Global storage for tensor metadata
+std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
+std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+
+extern "C" {
+
+int32_t aoti_torch_grad_mode_is_enabled() {
+  // No autograd ever
+  return false;
+}
+
+void aoti_torch_grad_mode_set_enabled(bool enabled) {
+  if (enabled) {
+    throw std::runtime_error("Cannot enable autograd");
+  }
+}
+
+AOTITorchError aoti_torch_get_data_ptr(
+    AOTITensorHandle tensor,
+    void** ret_data_ptr) {
+  *ret_data_ptr = tensor->mutable_data_ptr();
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_storage_offset(
+    AOTITensorHandle tensor,
+    int64_t* ret_storage_offset) {
+  // Storage offset is always 0 in ET
+  *ret_storage_offset = 0;
+  return Error::Ok;
+}
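+
+// ET tensors expose sizes/strides with narrower integer types than the
+// int64_t* pointers the AOTI ABI hands out, so the sizes/strides getters
+// below cache a per-tensor int64_t copy and return pointers into that cache;
+// entries live until the tensor is deleted or cleanup_tensor_metadata() runs.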
+
+AOTITorchError aoti_torch_get_strides(
+    AOTITensorHandle tensor,
+    int64_t** ret_strides) {
+  auto it = tensor_to_strides.find(tensor);
+  if (it == tensor_to_strides.end()) {
+    std::vector<int64_t> strides(tensor->dim());
+    auto tensor_strides = tensor->strides();
+    for (int i = 0; i < tensor->dim(); i++) {
+      strides[i] = tensor_strides[i];
+    }
+    it = tensor_to_strides.emplace(tensor, std::move(strides)).first;
+  }
+  *ret_strides = it->second.data();
+  std::cout << "getting strides from tensor " << tensor << " with dim "
+            << tensor->dim() << std::endl;
+  for (int i = 0; i < tensor->dim(); i++) {
+    std::cout << "strides " << i << " = " << (*ret_strides)[i] << std::endl;
+  }
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_dtype(
+    AOTITensorHandle tensor,
+    int32_t* ret_dtype) {
+  *ret_dtype = static_cast<int32_t>(tensor->scalar_type());
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_sizes(
+    AOTITensorHandle tensor,
+    int64_t** ret_sizes) {
+  auto it = tensor_to_sizes.find(tensor);
+  if (it == tensor_to_sizes.end()) {
+    std::vector<int64_t> sizes(tensor->dim());
+    auto tensor_sizes = tensor->sizes();
+    for (int i = 0; i < tensor->dim(); i++) {
+      sizes[i] = tensor_sizes[i];
+    }
+    it = tensor_to_sizes.emplace(tensor, std::move(sizes)).first;
+  }
+  *ret_sizes = it->second.data();
+  std::cout << "getting sizes from tensor " << tensor << " with dim "
+            << tensor->dim() << std::endl;
+  for (int i = 0; i < tensor->dim(); i++) {
+    std::cout << "size " << i << " = " << (*ret_sizes)[i] << std::endl;
+  }
+  return Error::Ok;
+}
+
+AOTITorchError aoti_torch_get_storage_size(
+    AOTITensorHandle tensor,
+    int64_t* ret_size) {
+  throw std::runtime_error("Cannot get storage size on ETensor");
+}
+
+int32_t aoti_torch_device_type_cpu() {
+  // Let's say cpu is 0 for ET as well
+  return 0;
+}
+
+__attribute__((__visibility__("default"))) int32_t aoti_torch_layout_strided() {
+  // ET only supports strided layout, so the return value is always 0, a.k.a.
+  // at::Layout::Strided.
+  return 0;
+}
+
+__attribute__((__visibility__("default"))) int32_t
+aoti_torch_device_type_cuda() {
+  // Let's say cuda is 1 for ET as well
+  return 1;
+}
+
+__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() {
+  // Assume float32 is the only dtype we will support; 6 matches
+  // at::ScalarType::Float.
+  return 6;
+}
+
+void cleanup_tensor_metadata() {
+  tensor_to_sizes.clear();
+  tensor_to_strides.clear();
+}
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h
new file mode 100644
index 00000000000..3ed966f99dc
--- /dev/null
+++ b/backends/aoti/runtime/shims/tensor_attribute.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+
+// Type definitions
+using AOTITensorHandle = Tensor*;
+using AOTIRuntimeError = Error;
+using AOTITorchError = Error;
+
+// Global storage for tensor metadata
+extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_sizes;
+extern std::unordered_map<Tensor*, std::vector<int64_t>> tensor_to_strides;
+
+// Attribute-related operations (memory-irrelevant)
+AOTITorchError aoti_torch_get_data_ptr(
+    AOTITensorHandle tensor,
+    void** ret_data_ptr);
+
+AOTITorchError aoti_torch_get_storage_offset(
+    AOTITensorHandle tensor,
+    int64_t* ret_storage_offset);
+
+AOTITorchError aoti_torch_get_strides(
+    AOTITensorHandle tensor,
+    int64_t** ret_strides);
+
+AOTITorchError aoti_torch_get_dtype(
+    AOTITensorHandle tensor,
+    int32_t* ret_dtype);
+
+AOTITorchError aoti_torch_get_sizes(
+    AOTITensorHandle tensor,
+    int64_t** ret_sizes);
+
+AOTITorchError aoti_torch_get_storage_size(
+    AOTITensorHandle tensor,
+    int64_t* ret_size);
+
+// Utility functions for device and layout information
+int32_t aoti_torch_device_type_cpu();
+int32_t aoti_torch_device_type_cuda();
+int32_t aoti_torch_layout_strided();
+int32_t aoti_torch_dtype_float32();
+
+// Autograd mode functions
+int32_t aoti_torch_grad_mode_is_enabled();
+void aoti_torch_grad_mode_set_enabled(bool enabled);
+
+// Cleanup function for clearing global state
+void cleanup_tensor_metadata();
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl
index d51097f306d..7b02c1075a2 100644
--- a/backends/aoti/runtime/targets.bzl
+++ b/backends/aoti/runtime/targets.bzl
@@ -3,8 +3,17 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 def define_common_targets():
     runtime.cxx_library(
         name = "aoti_backend",
-        srcs = ["AotiBackend.cpp"],
-        headers = [],
+        srcs = [
+            "aoti_backend.cpp",
+            "aoti_model_container.cpp",
+            "shims/memory.cpp",
+            "shims/tensor_attribute.cpp",
+        ],
+        headers = [
+            "aoti_model_container.h",
+            "shims/memory.h",
+            "shims/tensor_attribute.h",
+        ],
         # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
         link_whole = True,
         supports_python_dlopen = True,
diff --git a/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin b/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..d34f0ffd0262af56bf3c172beda228da57f93b3c
GIT binary patch
literal 11320
[11320 bytes of base85-encoded cubin data omitted]
literal 0
HcmV?d00001

diff --git a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
new file mode 100644
index 00000000000..7d7e30069f9
--- /dev/null
+++ b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
@@ -0,0 +1,6 @@
+// Triton kernels are embedded as comments in /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7/clbguuj2vb7nlf7qm72hrkynyiorwc3udkaj656f3v5xcdaoib67.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
new file mode 100644
index 00000000000..bd5d2c60334
--- /dev/null
+++ b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin b/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..b2ba290a27dadc32cf1efdced36c497c5f85ee33
GIT binary patch
literal 11320
[11320 bytes of base85-encoded cubin data omitted]
zY*>wk5LyLqp$`25KF;x4rq_apWET}vUK&oN#E~TLV}FY0m=+YZ_|E%A8SD|Y3BQy4lQJo1Q4C}L^?ud&U2_&V^R0DeevG&z5wkN>-pU-2o8 zFL8e5#j@DXIX=3Kkh2RMKhTHY940^2sk%&lmg6@rBjoHyz>f?E@*`Eg&G~Qb4OJkAeXiRK6-EQw>gPqguor8SJ7~yyIX^{wNE3y~C2-%3r!1F+b*|F`-Iq%RBoyM;Zn zTgc%rBk+Yh>>$QUqf)Hex?$HmcTT_FJ$@%M91G8tx-mPJ*A2&W^}f7ZuMLx1Fw?tk zo~_JHw3WSywsJSoRz6@Wm%Z*zlU*@o@t0#)9dB^=vv$i&-dtz;JQ=2=Vcp*^Qs7VF0-O5O~X z?NjTuVxGQ#1p{O2XCMOu6ctWu3TLL?G-gE_GT3>NCC{pqjTS<0Ojiw=pbYlBq)YVO zD|v^Va=pUOt{6VH;Po<)gMqG~QM)91g+dNSjEo~<5Hl-9jQx7O9LmEXt5)_?jby?w z%#UW$aJBCG6l4JuLC;fxY|k;Op3>b8u#W45s{OX0$*IGqY7=x+3Xr22Klx-J|(@ zZagzSP9eLh{8=1Epo1%in28`1yIG+bgi*L!F8g?#TuDjI!dEg^t@~8!CN~S7OdifO zW{W;WbQ2FJo0EPXtCVfUP?oMxEYhM{X{57EWx+r;mum^vcj@UC`5{{x)+J^l>11>) zTlpT@%<`O`fD#^BRzHb)a?Sde^t4Vb!lUVSO7Oz@9;bMqWhxmth1^M};4={5=U$au z+wyFuUageo47{3_JUq7G^UtB|o;`N6Qu55I)$sV9BBU}#PkgY`6Q9!E6W^gsmrAlw zvP#poi6?W<2t?04&->BmMD#ok>K{>TnR*ePXutj_(Uoh65>kCpqP9Iz!ZOFKHA>zs zV}6ulfhe&8x})T4q)e-5rCMgNMAi@{iS@;)GPc~UFnV2a@|~gJ$GJNYC*MlC%9-Jd zw=6b={(910zm`ZL))y&q%2h2#>gRE-CFNzxr2SYk*49YBpWnH$fJ z9~gb$fY??Po%c4iS%A(O&$^yrIruKwGA6y_hGOPv&SEQEjX`W|V{n2tJtf9n=1G^k zp(NzQwo45MzbQxZ(9O|)HkmtENYe#vtaP#c*NO`agypH=8X<C=WJfxf z^&Rw-gCY~$iG2D1_d{9sd8ngNK(2n7N%&W@RPvk*FIGB3kpn;*~dJY?v&rXbbRTEGaGaiA&4kQWTF(xV8C5abhk1O=6Jxr`I2WR9MN z`TE7F)qSuAkJ4Q_KW4c_tkRQ+iMh7t=R3b=`5D`;mFbc6Bv_1w%@4?!{1$2zi>}?o z!~B}AlYRR5?@nvfwk8*Dh){oAGggU0i=Bv37Fi@U$Zo;7h31>Zoa09V^YI946a90W zh=kwq(u`f(?!!^Gld*-(J{)0s7hPkwcfn!$rW=qCvB55w8v$KAjPIdaJ1lbNAiGE( z7x5`IG7KN^VTZo$(4TfkfkieBvN-0$Va&%Oq>W$n{gNi1%JCm^{6vmd1slFqGM)no zvb$EnE4{?u|CN@$d2$z(8urjCdVGd8PR^VNbr^jPCWr2hvpeRbqU{)Kadzg!V26DV zbK-dga4W(VBnz8QfM0&WP69a}XFqKX@^82Mw?HH4XIe2fLCK}-tpt0i6^qf=@WGE? zOtA9_@W*1e0p33#`JWqP`-zu4BKZ^X4toI**-sNZ++jbEutV@jhy75(Admc2C~1|9pV9W9wGQF+tgx1?9u>piVGU-NA3l@`^CQ& zI$_ZNhv2= zMmRje<+s5fvW83glWYwP5oUnXQ^b%>1m^7+lVg}0Ug^1$zRjhrLzhK|HpudKOy;{ z5&r4e&cMG9ryI|Y{>gLssG_`zs2}K2(eV>}J;dAy zeg~rK4D~Pg!{H%7F+w3;3mybZzS05J{>~;hp=fL_{KvImEXv8K42dC`r}Vf(muOB z%fNp{255U0`%1Sv6B7Kf*fe}1@dq#3gI1DHOXOWW#9n#&1-!=uSAUovUW`pk0pg+a zLY{K;w92o8p!ez|!t3(kZ(6YV_g>y3tH|rp(3u{A9+0(Hk$*+*j~jXfdO&t=75QJu z<6T$(fk8&lD}SHBJAQz2a$&2O?2xc31MShlOK!u?L=cJjf|P$v;z^#4!e03#3Im_| z?Em6UA*hSN?-3=h>R5*`;U*-X8qQ03RcEqK=aZ6C_*Ykvzqo=twN$VCl~v?Fmhx%o z5ZR~mx>x>I#J`gNJLLiBv!_uG{RsINlJ#wl(%8nXUz0?@ANB!&9MG2@;6;O^rIYOy=D0Vff V`lvA|NAamVDtL{`=+p=bYKIe*Tf8N3=vDH^4;G*l$=%j;Sl(BY^TA(LZ&|Q|u6%X8m~H z#uAKKK{*Kh&?#V~`h}W+d2a{hda22*`n*-C&or17y3NoE{Yu!V+g_t$&&&mG$+qiG zSebY2f)f_YcD>=(?5aCk2{E)o->K9q^;x^@_@&UPm+fq%Z*w#S(Cn5rRhRDuu0Ic@ zY3}Q6wNY@YNI@m!aIsMj5K1^fd9_MzVQ$u*@tvBhfNG_VvF}zTF@l{AM19^C!bG^(Mim5w6eMJMohVk7wfTszT<>h%AAT05JviK-=jV;5f* zOHyZBdlqIAN679N){x_az?(nY7Ndxv4ZP0AdhiY46EXZ2$uZCQlRf<3m;8!PX?&IQ zD=${Xe#-HEs|Y!IkK_A$@M9YJsZP~Z@=1;#UPZ{!kAUB{HI^S~>V3|CYY%@LO&-aN z#?*9XJV$+UrzmkqYhwDu12A>%yzdbCJ{v}N3eU<`^~C?M*yKxynR7$p+_9393p z0Srr|!kU)~8-B4|sh8%87}>K<&}h`3tvIP#ou$f+nwwfh(V0d($h6i+Qo78}`9*?dsV zd2?azRG1HZCqG*}anf^Y*~t@)Vx^qVmp#Wh;aT1SOZ@t;KYpwK58LI6DZ5qA^#h`4 zpZxYuZ}$J{7l>*Ixfabx=wpeMYnSy?+SbH1UDGbG#FE|wuyP6i^n{j9EbGr}W&$)b z@zKiVOIJSBhe5vtn#Agh+J!_y?*j~(Wkb{7Wkj2c7>E_74x`h~Cy?7e!o0>;NoTf! 
zjycEl?Fs!>7}M{j2f3x&^w>?0=d|Cmgua7@BlI|e$F&cyd~{`LX+{4WVTR~&f*#+Z z$L|ZkFMxO^J83DDE0@B2@>=nQ!}lmD;!rb%BG z`u7WaWOq7;Pe$OUCtwFLR)R{Q>YA2Y5B&x6dUxo(%<>$(6`I!E)P!kyp>Ou&<$7({ z+=89nck^szZlbO1O|+G}iMA$UwsP6)t~6N%lQFF~y5gqTJ@lR8B=+`_nVp`?Fc#*q);{zDm}Mb^%`->*)iuz z`B(+Xo58Amdb1YHGxx1wU~2OWWMF`z!cj}%)Ee{FoJd0k8_#$#bSfpQiO_>`)shLy zV9Oi#iN1d=@32vBR`}U9!^aN1Sq5@2&=EA|ZXBgTA;%*|#t|`ynYALucD-2+O~4_i zUW!tUWWq4aPiE3^wc$q;WC0XGp8%;a<6427IyfXID2vksE(c{_O=}is3~1oU=3%N- zQbJiAG9YlZS=fqsL0ZOrVz@V#g|@juBt%7`gg|1oxvKT;N-1)sB<(>hoaE-zG9k$l z0S-(Xr@CODK9$3vTNf_IJ9DcbEhYwz;hyS9s<=wN5Vy9qGEajL?LY; zKuGljh#Bn)5SBS+0|D~#n2G|NiUo-B?F^7teMwYsZ-5yT{stl>v7QiBvR2Cz^}W#$ z`4XOvLfjt7kb5vq8xg*nQYS31y%v&>a)v8jN!qN zTBz{kYe5FD1-NeWtg0&&dLjxwJ3LYzY=G~(kwsb{Gm6F&MLu|R6m_Z+0n-K%u@g^B z6|Jm@##NEC9>SCkW+MkfY>sPRg7ThY8`{ z$H*dYBJtByKIFNo1~FBT&7`NNa@mP|j(d(Wi?d}!HiZci@laH8h6!FSF?OPwcY{H)w(@^h5?f~9#Hu`u5aBMlP=xGw(k>()#3u6P&I}%xU)MCryLjpgyo28lmrOvyUC3$)NUjlD*pO z>!aIf;~y_2**i(__x0Td_;*t%e~J7VZnGasI8F4CHhWRR z1^C|-bKj;b={I))Hrm@U9H6`1i;uHypnrm4`}|7->|&CA0seCLXo3Kb`2#$l_M@P~ zzcJr12+`jAGKaT;{{m`+%Y(kvPY$5;TI~yMuJ7!k(PrO+0Qvi+Q8wPxlAtq9gTD7t z?976u5zKa4Bb2Z4<{+mJ68(dh*)LDQ@w1?lJn}W(9Om?4qF+q*omm(hjN~tp{0OIy z2)cG=VR$&A?;&}E{9^{E?}Gi2k%+!G3Ao?jxA!~0IKn;{Ws_kN>!NkAO-VuicZZtx z`deSl0|NXwU|o-3_%|4c+GZx`=~2L=xc5gR81=FR`G&slu!oj(c5#;>Vd%Q5vp17O z$MPil3g{`K6Ab#_b@qOenS5cj4l;I#(R`S%aeZIsF#I!jj1vDU{TzoQI{C*N{j_iw z{A=nczm_O5=qG=n02>58@OQA8l=K#l-+WC+I4z-%?4RTGNdIoXzaK7w84dHd8}&03 z!Ql6Gkq_?wE`s}ceh4Q2eQi|yIs^Ef6u|=;>5}xjeSMht`Of-wGI6)BZ6P}C$yBdH zem{QqpmuhE>Sf3uXbXA3pd+7{-t&jp1?2l;lJbjkCK&l8*k}EG{#!IZ;P;tW04>Z1 z@-wiP$l#|2`oJ`i32rg=Zj!yQzwglJ}_4A66gY1YaRKQuvt_4v*@B{xBOe{$j{O_jdqUCk67!Lmw!g) z#aQ^z+$k`7BtcDs(ZwjYP47?_{$VNqMogYu>6RbI#K5N>`&XsB8Iz}>l2`NCfH2_> z5i!Kgy2k)qJTwZjVoe`l6==sD zw3NEt#0J{v#J+6Oq*Z*{7$4EZrfKU3%Bl$=K27`BMQB2_R7ofZX$lNe9FN1rTD5^tykW|#QL+ouljekLRcr;6ohs(8U6w>2ww55OO;ZRH-E3Hm zh7ei-Z=nvo0zR(pwM?%C56Lbnro1%jPLU%?-p2~x7hRP81EGAB7?QEd|y?Yg^Mn6=sD@JW%qG5ZY<>jJMbSwm{@}{ZaLytW4 zh$;BZv}u(c3$2T82(A=!n~7azNt5uV6L{RSYPtNctsQ?wEnWNn(b`#)O;j!M>$~`} zSdtnW>{*xz93fi+*od6(0bV~l6rhNy3B1NO_TX#4hXVKk$6+5+vqBEmprR-|6k+N!@lPWuf zRDH&qs#SAFCY@?Hp8b`&Reai-v>QTk%2cXT?sV02a5S>w(WCB>!_8{Nv7Dk+@+PXS z+)R1&aOH^Gw1(@BTdo|*7MzAVG1GcF=N+*thaICn!$QCN+mFxp{c)3AF;jNAZo3Vl zgg^VkpWo>F?Q4iChFps)64Ew?7CyS79SaYJKGMSBi!3y!wE%p0NehMb(7dLF)1k`? 
z+DQOc0O+B)%U2e(XT$HX(EQw`g}IL|b4Kk1NLSy#3YLqZ(B-QiM8euz42odl*C%NV zYkM)wYcbrlX`0S~whvE_}#KR-V=uiZtUNjJ{N-FX_|CGA^;JWoywX`5zMx+UpW=~DivPvN$1 zBCv_u-2B|a!le&IT23N;n59Ek=8(g5XyJpaAAEd?Ct-nS{lkU%dCD5n(55s{VT>k< z=G3Zs-WbDkq~%W9p5CaKJJS|3tWpUlOT%n9$83F+P6>IE=myS~$*P{wjY3`*C&!bS z%v1VT_2WB7cIfmO9owPjY0Tpzabs!VhCxrCxbI)-qEqO}%-FxxmnVH$=-(;qk==X_ zpNzoghhYaXIt`~#wspgb_+RjIN!53=B|IIISt1O08*3i!@}g^CXL&<&=yTLT^l!4Vj<}_PnG^^qtFj zhn;e*!p|-nKDOYsGLVCTB|)QhN%RVZ9E=zlN5mjzmWvqs^;$VJ42P_0$xk(s3Bxcy zl1anWn(I@L1yBTim4HxX z&BUY0!UZmO)G67Dp(H&4hcc)Z8mTN%RWOjr#Ztoa9eTP&Zpe;?afumE)-yU*t^69< z%<_~ThZ5ntegfs>n)NU1X`MP`8K(pw=qS zpGV4rZQ$FuZ;=+pgrf08kq>o*+@*?J%R%8HJMjcmQ77t~Qr+D7bbL&+@CfPW0&G zKRc~b+p1i+E<(L=RjeX~7CRn0!6K=Cb`$Jw!Y8s(%*l<&P+&e5VQs>FVFQuy8(x~R zt6P0I%C<8$x6y|qOkYCR*sV+8Ful(W$cI?}5||qST|0<(p_@A_a(h3!NG}%g8Z|Nq zAMjF#-s{juyDiTm>-$+8^T9CYqY=`^FM4-Llh5S%4>^8{@rq!0KvxPpM%KuAs+fxW>upQlBAMfw5?_*9pF9U8x z*qmfxvkCCaH`qxaXXEUrt$zOQcJC%=1pRa?#>OeZbiJ8iFSTMZdK=#V$%_egAp!na z>=wX#`y~G}{cJDsl1C(eJl< zyOLgrvmwwwgFl^fFRml|8{GjOK<8nC&#(`{v%!Wigwj28l`$g{CY*BOY0R`Qm<3bN8)}wcZOh}9K}WU zd44!mz_S$phCcrKfAbeNuj$~Vl#gG;0rwC2F#2!AOJFUw_mDa^yM=A_!r@oX z-#x}{%$@|fYIaD@7q+b4N99H;to_{|i%%hfchD6b;w2YOU={5W3^F*kzWf#^C- z{R{qZcmPn0P>2`#K{?Njvx~^zyNSqv%Xt5>Hu8=9#N22{=z~1s13z|a;^+Az{wNSg z0rCrd>+&QHI_3Wzzz-9dVB{C}AK``pv9`u8C)jf{tPcXLZ!-=6qO1Lm=~Tuk=hx>M_^-$S zZO>v~S?bQX1b-wp4WCH-!Hf2w<>b>6c}EwqTb_Oa?=r#FALNG@V^dOqc<8*4ryM=8 z@@qfn-8zZzx_k(n5^VmxoA=NP^13v1x=Wx7WbGB?UzYphx-Nk(key#a{+IH2-=hA3 zK}OIme~-Y|{Q%|U!e%kqCSg?u+M|P)+=lDpK_upvrTl9WPx5pWcFQMG82Hp<|LpBT zP#1&WBT8P?u?AtnjY~c?oRjjZ&Sam?CncxwudX2fmu2LsrMl%WuOL4s<wkk`9($fN%F+kEk6SL`0H5%c_~jh4_-=M;pqQwNM0@%g=TH_MM)CKKCPd& z&mksBX`kr9t9_S%WEy-D*$Q3>Ajqd~`JYMoATQ0kg5r|P5Zs8Q8RjX`_VzkXemfErZ+TG#J9ARf9+geF35A?8Wv*>)p-l z*sNm$o3uhzRS{I4ctbq#(1$*P)S^5a~JJOX};`XYJ09zu9mCwwo@$H>1kir_O&z?9Jf^5F+FanQF51|FvUL3 zS892ug0RaT!-ZP4;W<@rl5%MkOZlaFd(L$hOB$$@tC+i`iX!^3(}Aci+ftb5cEf2j zq|g?42f6PS@NspoX?sn0NOrMG+DofmE-;eheXR6-!$svk5ZXtDAsMGqnRlJ~0!LqW zYhKM;sUyOGzSQ=TFRSu9x_@8Wv{JgLPc4=ft=imNqvXl*R+~rDQS{iw%eID3J@)Wp zw&b@{wo@rPC{ySQ?h5;$jjqvA5L{pqk9$UMjsLZM;_sf$eg3WX$+nxM-pb$ICd**! z8DgMo0~$CQMg!QGnrDDFUmFTgGvrCNc?ZWp&$xnS+Zjeii-@SoQv-7|Y4FvL|RKCspw|DWkajBzL(3~zq z#Ga!u*>}FxIwa!l^Q}X1o(kp}@6w_W%2q6gV5DqJXJ8jlQBenDB=V2`-MB}95s?XR zu|DC|+`>Y+T3jk%X3sl~TCIAz>`ctKLKJ4))7e@vKVLXocN)!REt_pN3*P)pVY1oC z0if5LxwbN&&md?yA< zzwU5_7y>7&!X2f_tnu@G_W<)T*Zk!|jb>K|gU%>ri zqX+kE#vT%5CL;=ju^Rm_5@Fiz}S)2j{wB zP+nf9ywDn1%O$r_u2s#PHG?NN$DJ>EW@FJll5&KFr!n;UhTSNiEt%8Qw^eU9Ep+?& zs+l&ee9n};{ONT18S|&+>XE5&lRnck<7SS=96ky+lLBrM^wha~|D7&67oJYfe67A5 z>B~U>5owR?=Cb&tB|bL^JBYE|DCa9B(<)Uxcg5UJ*F7Yxx`QV=(^{IIG_AVlnq4{B zj((f9+o|zeV<&x!>|}0{o$M{LGkJ^bOa<&@Gux_MRaw&kjo)Pj&xphH-QrBuD;Ldl zCMQ-2oX3{I_c1duR=+QI3_XBRd4!f2bMMO@*+9WJpo;fpCV>+NPuTbxSuNOkd}XYr zj#BHY6y;X5@`^V-J#Id|6PxdudpA*zcD57B(@;*);25Sb7HiAak_ zBIL$G#Zrl(z{Xsn=Qi^U4R$9y%WN7pI_pj-&q8@eut?_^N`pcTMv7PvDTwJ!BF&)P z?}RgxaK@ zF{R-a{1W-OlA^g8CQ?OYaNHqRjRjZNQ3eMh1i_JkR-EC!5Ll%~+%p|1$ly>05)40? 
zhN;F<-WQ^(WH)dpFBX?7*0bfJ?@CeGgQFm`IlUiAvM7reT#)rE0CA6-sb=cl00wR| zW_GNeHnF+Y*U04zCu|kEWJb($SPs@~{@6^Jl++Hh9Db2%2-d@95n;$IBL5n*oVH2d zY-Yo3cP?a#k+V=PIKM@zF>)&F=K`q$0{rYl!7Vvnsa~s;3o91B5Enc=n&Ac0iOkWX zrR8$Lvnx)+$jBjf_Wxq-Jp{7%TSkq`FpbMg$%@JUPMHJ6>q%p9M3;JEB*nl9_4lP(%( zT+edqc>me7=Df3(X6D^aO?WE?UW?2OoS=@Y#Ijso+%QNqK;T$-J+SjAN$U9^OIe!B zK9Wn(1#QLhvNg9mCrXEcbd)#Iz^QEv~-jVH}gwMY+W=kWcpQQ}6NP;dI2eBLcBGVs(iZgEk^;E9U+i5`rbjVW`^AOTzP9*1=EUdYm&+}(J8EL!ak+sxn7q{_h6hE>=bYxeC% zDT}@+^#AQ~asEoQk85k6)hDjTL=W=M7x6=XPgA|*B^db{YxWD`=qLHB$;ex0LB9Y5 z`Oo?XSpR^eM_+BmLhIENTCbiSTCbSDUOhdX>lN!*uk%-8{p``cANGlS0{+Q9!SP*v z9R3Zl{UNgd(Sg|cE4>4Le8mXp!%QD$|0W<$j^k4CoxjpI#2KJ`yxtEw?QWkQ8EeKl z{y52BzKHzo9`^a)=m*>%_x;0qPI-gq1Fd7EUR*sOM!jhCPw)O{hIJyJqn=^-F!C2~ z$kGrdI8yz92WUM7!>F%4wEm*?ym)X8>wh4oVCcSKh+AJ^<}=n`hLMM{4q!9$Akd=O@@6s`NtQyXg|lB27Ur*$t?x_c9QAw=a8mfc0)hV6&xw() zmLWb#iWgTzocMe9;cy_j?r%iDmK2Y!d?~2sx7)ul#HFMd@#zryKY(rf_^%#%z<%V( zr|{z~PftGa@NxUpFQ0gllKB1geFpxkGC*xc_LEMxW);|qcH)UYcu^nOOn$#YCPKup zJpE!DGQsK(sbhFS2@ntUEpF!+N0>{9qo9X%66t02(72%Z^bqEKbPM?}l>DBMKnN7x z7V<_yS{w-pgh27u7V`VlF}>zDW2rAJum7D)$EU}>yX5SkLg;1SvqdvuT$1zpYyd~o zl#+i(;Yps(xv>0REDU_AD>i?)q2x`43ES7}NdJ#OI%gH19!B=bnOQ6*8c+u+Td&YSVCDQ8$1NQM3XEw*biiH&L#MNSNxVAt^?V6V zOQ~C?X`qQtd-*VF9}u4=HffrKCN@o5KLDyGw4r^N_OXl5glOrKP!Q4>-tYgsoX>W$ zmo%-miBodV^M8MRp67h_@p}*47Y>E8o0z!5?ANR%=QO-^s{q~_72~5sdw2)VMG!U_P3SnOeEYjM}_WsZG_H;n_{k@Z5@5ubEE0VouF9?6PUr zEUz+en?=hj&6u^iJ8M?$>57M`>A6;=R;f*!GnQNStlEs39`_Y)kg5iB+vSZ_=GqP0 zorl&G_jbBkFIrV(q2h74RIfDY(Ug~lQz>yI$@^I0d&foTKM=}Ci6I%QTAg++ zXNE`bxOK1YEjWlUpf9xn6%d_zIqx4g{VI!5b^F7!_N-B#nrhgd=zF=e6dkJpt9{;7 z@WT(>`+zC<&6H_XD;D|}EfHKU<}MSvOIN4x&J%dtGiuHJuPq>dS#@s!|Iq^4n9Woz z^vk*gGgy`y8}3`2BREQS2Cxx1zYBQ%`H=ucOpCy4Y-=Ch1wItO4@r&%&Y$Sxe^&A< zKBe*doL_m-5qp8-qaB2t{hs3o`ta9;$xn5-mH`K>TYSKQY+7uFf*sEM!jD9TE$8puCwG!eb!EPP;_vVt)yL>ZzQd{ zS4mbY#iX;~&D3jIBb`b%DxQ6tW0ekB({@8BPMS(}(w(b$6`YT(bhv(`UUA3g%K6&i zVyT|5P2?(>{GsuM>G{K%Bh#5fj#Diio?Cb#SI<;SPh@lX3By@npwD+9lL2~iCq*W!wVw5_3)_bzBh!o#8Wv~c(=3oU6)0PmjDLSa3$ ztZCs?==_Ry9KZztdT8nVg%$0o@Ea_&ymW45>AmxuQ9B0G#kVhl$gofww2LEN=Dnl6F1A7bI>!^pSBD?pvmT>&5h4l`|ANV}R;+(b8WRr@O4 zcG2xga!b3Grd#N?pKil+b8%Z5W-)Ps(D3RW9RXX_$@55nIpGDIrf1-N4y0UDMOL zQ7q`<gSq_OOkG?oh(%VsXU-()L{2h?8nzRR;)=p)KuoDyX{GoEKh37kf^ zwt+*r?rw<#3#{OYA@|Q)v{Pd zMp5#nv3wufu!ZyZ?W4_{L1tkSR)I z_ZxMIzGpS}uxD;q0y3*60K4&qS;)ddSJZW>NY zdV0K&nJ5&p6tb%tAcKPmbZ})6bC4hu(|H9?%Vni7gHvqO&2VZ4u9CTG-KSDNxx|w# zz?sHe(Wi)h;?ZQ_0~dUvQnnRCS-JuaNOFs6wUN#;l?4OY?9>vj@77aI@&H+}u37)Ip37H~6W!MzC7N;_Q9`OOO4PO|N?7KY^+w6N zCGSU>4@5c98zo;OWm-h5)iRAGvW_rGtS?TLu}-(b=q1I;H->^A=bk{Ed?)EDXPPhG zve*>*>q)!)dLo5bU!=$>SJjEs&*OSa%FC2a`LU)m!C+}m;lWOcqb+c0G%}syPjb=cv0Cjro1DDV&-YijAKh%gF$R;*em&_r^L9+Jn3>b zl!UCaq4;|vz zvZX>taP#c*NO`agd^Gnh(!!WhG@dB(!9mD9s<^c*6n5B&C!mUYv0D|HcaRT($;P{}Mk5A*el zQ>*)64IZVtb$-lpi&&+{5EFB4&(C*mZ|6DNu9fMb^cYx-hRqMiLHS+xI9Arfz4Vaif!=dAf+*qjIpaQZln2DF(*$VBZ2uugtZ9wi7iCJ zcY7(uF7EWvvJX{!%MnBGO#*tK17nBMOOMqp>#oCg#NR z0^nwZElC!(m;k?gb3F>=Vx0Y;8RPG4@7xBBpr33GvPnuLU9TnBcbbEP^cFn!!E*_A zCISAz!D|5DIUxC;jIle3mpmf*qwzNT1|YJZB6zsXo|Uje@JO3|OTy%TQ`D_Z?~;F) z1bv{*zAov-I2!@|Bly!k{oE$9zttV$0krQY_!QgHWJBzx5Oa!48tg}I2K?g}U%a&) z2K~2)x2d&vKpoo#C0w4~uS4G(r#7)S6D-=4FK^qRBi>!jC~KiXhDaX#qs>^9S}n%) zJ;+Wx6OQrPZ#TBE#R%wePLC6PF%gY6#hPHG5Bd1T5d?M;3Me12H`*NL{KG=D-kP1~@&5{6~7~744~4G)nb~`1OiLyXzI{s@I8UB5^;S8$z&8 zj^ZNwJU^T&;6;jm%K(4vzx`8N*jNJa=3(wJ<>N_aX+Dhn#~NboFdZDJIN&(17l#+aY^NKE@C-G1Nxt0cdwxS zJ0d&XXq5IZz8N6@k`CMNYwQ*1Z)!+S3oYA1e8~vt*eXc=qQ>4lrC{)X0GRj_k{=r3 zpN_2y{CD6a@4#bm(l7YYPZS@&u;tPE)z5JKf*=0sXE?urUnl-J 
z-^=({U|0{jtBHJ&{WCiVj!-{y`y}s%<5YhRzm{Y-x|&866Bz;(_ws&e4f!!?=wy#T56GTgL;gj%KW^#~=mFW8HROLL zkN4Zu4;N$vz4EsRyyFKbCug^d$u0@2GSD6!yp9OKnG7N^k6bShFG)Pf(^1$fKZe4< zr#|~HOL;vYPg5nY>PWxqAlxJo1J|FVys9(Vr}Ih4Df~~@kY8Fwo?5C`{;zAu$K(f! zl=O-0(|O%1zYX!P=6^xTD?dq|_YMQ_h2zl2!JkIs0t{@pd6u8Jruhd WRejVLl%x1m9u>bDRbJJo +// Definition of AOTI runtime interface functions + +#include +#include + +#include +#include + +#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ + try { \ + __VA_ARGS__ \ + } catch (const std::exception& e) { \ + std::cerr << "Error: " << e.what() << '\n'; \ + return AOTI_RUNTIME_FAILURE; \ + } catch (...) { \ + std::cerr << "Unknown exception occurred.\n"; \ + return AOTI_RUNTIME_FAILURE; \ + } \ + return AOTI_RUNTIME_SUCCESS; + +#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ + do { \ + AOTI_RUNTIME_CHECK( \ + actual_size == expected_size, \ + "expected " + std::string(name) + " vector size to be " + \ + std::to_string(expected_size) + ", but got " + \ + std::to_string(actual_size)); \ + } while (0) + +// AOTInductor uses at::addmm_out, which doesn't supports +// arguments that requires gradient. For this reason, we +// enforce no_grad context for run APIs. +// +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct AOTINoGradGuard { + AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(false); + } + AOTINoGradGuard(const AOTINoGradGuard&) = delete; + AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; + ~AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(prev_mode); + } + AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; + AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; + bool prev_mode{aoti_torch_grad_mode_is_enabled()}; +}; + +extern "C" { + +AOTIRuntimeError AOTInductorModelContainerCreate( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + bool is_cpu, + const char* cubin_dir) { + return AOTInductorModelContainerCreateWithDevice( + container_handle, + num_models, + is_cpu ? 
"cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0\n"; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_single_threaded( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** 
original_fqn) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *original_fqn = container->constant_original_fqn(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( + AOTInductorModelContainerHandle container_handle, + size_t idx, + bool* from_folded) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantType( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* type) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* dtype) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *dtype = container->constant_dtype(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( + AOTInductorModelContainerHandle container_handle, + size_t idx, + size_t* data_size) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *data_size = container->constant_data_size(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive) { + auto* container = + reinterpret_cast( + container_handle); + auto constants_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { const auto ret = container->extract_constants_map(use_inactive); + for (const auto& pair: ret) { + constants_map->emplace(pair.first, pair.second); + } + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update, /* user_managed = */ true); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle) { + return AOTInductorModelContainerUpdateConstantBuffer(container_handle, + constant_map_handle, + /*use_inactive*/ true, + /*validate_full_update*/ true); +} + +AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->free_inactive_constant_buffer(); + }) +} + +AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( + AOTInductorModelContainerHandle container_handle, + bool use_inactive, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_const_fold(use_inactive, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->swap_constant_buffer(); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumInputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_inputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_inputs = container->num_inputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetInputName( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** ret_input_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_input_names = container->input_name(input_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_outputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_outputs = container->num_outputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetOutputName( + AOTInductorModelContainerHandle container_handle, + size_t output_idx, + const char** ret_output_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_output_names = container->output_name(output_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetCallSpec( + AOTInductorModelContainerHandle container_handle, + const char** in_spec, + const char** out_spec) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + *in_spec = container->get_in_spec(); + *out_spec = container->get_out_spec(); + }) +} + +AOTIRuntimeError AOTInductorModelCreate( + AOTInductorModelHandle* model_handle, + AOTInductorConstantMapHandle constant_map_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto constant_array = std::make_shared>(); + auto input_map = reinterpret_cast*>(constant_map_handle); + + auto model = new torch::aot_inductor::AOTInductorModel( + constant_map, + constant_array, + "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models + "" + ); + + if (input_map) { + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + } else { + model->load_constants(); + } + + *model_handle = reinterpret_cast(model); + })} + +AOTIRuntimeError AOTInductorModelRun( + AOTInductorModelHandle model_handle, + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + model->run_impl( + input_handles, + output_handles, + (torch::aot_inductor::DeviceStreamType) nullptr, + nullptr); + }) +} + +AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = 
reinterpret_cast( + model_handle); + delete model; + })} + +AOTIRuntimeError AOTInductorModelGetNumOutputs( + AOTInductorModelHandle model_handle, + size_t* ret_num_outputs) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = reinterpret_cast(model_handle); + *ret_num_outputs = model->num_outputs(); + }) +} + +AOTIRuntimeError AOTInductorModelUpdateConstantsMap( + AOTInductorModelHandle model_handle, + AOTInductorConstantMapHandle constant_map_handle) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto input_map = + reinterpret_cast*>( + constant_map_handle); + + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + model->update_constants_map(std::move(constant_map)); + }) +} + +} // extern "C" + + +#define CUDA_DRIVER_CHECK(EXPR) \ +do { \ + CUresult code = EXPR; \ + const char *msg; \ + CUresult code_get_error = cuGetErrorString(code, &msg); \ + if (code_get_error != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string("invalid error code!")); \ + } \ + if (code != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string(msg)); \ + } \ +} while (0); + +static inline CUfunction loadKernel( + std::string filePath, + const std::string &funcName, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline void launchKernel( + CUfunction func, + uint32_t gridX, + uint32_t gridY, + uint32_t gridZ, + uint32_t numWarps, + uint32_t sharedMemBytes, + void* args[], + cudaStream_t stream) { + CUDA_DRIVER_CHECK(cuLaunchKernel( + func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr + )); +} +CACHE_TORCH_DTYPE(float32); +CACHE_TORCH_DEVICE(cuda); +CACHE_TORCH_LAYOUT(strided); +namespace torch::aot_inductor { +namespace { +class AOTInductorModelKernels : public AOTInductorModelKernelsBase { + public: + CUfunction triton_poi_fused_convolution_0{nullptr}; + CUfunction triton_poi_fused_convolution_1{nullptr}; + CUfunction triton_poi_fused_convolution_2{nullptr}; +}; +} // namespace + + + +AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) + : AOTInductorModelBase(1, + 1, + 1, + device_str, + std::move(cubin_dir), + true) { + inputs_info_[0].name = "arg2_1"; + constants_info_[0].name = "conv_weight"; + constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[0].offset = 0; + 
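+  // Each constants_info_ entry records the metadata the runtime needs to map and
+  // validate one weight: dtype, offset into the constant blob, byte size
+  // (numel * sizeof(dtype): 5*3*3*3 float32 values = 540 bytes here), shape,
+  // stride, layout, and the original fully-qualified name in the eager model.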
constants_info_[0].data_size = 540; + constants_info_[0].from_folded = false; + constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[0].shape = {5, 3, 3, 3}; + constants_info_[0].stride = {27, 9, 3, 1}; + constants_info_[0].layout = static_cast(cached_torch_layout_strided); + constants_info_[0].original_fqn = "conv.weight"; + update_constants_map(std::move(constants_map)); + update_constants_array(std::move(constants_array)); + in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])"; + out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])"; + outputs_info_[0].name = "output0"; + this->kernels_ = std::make_unique(); +} + +std::unordered_map AOTInductorModel::const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization +) { + + if (!initialization) { + std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " + << "aot_inductor.use_runtime_constant_folding=False\n"; + } + return {}; +} +} // namespace torch::aot_inductor +using namespace torch::aot_inductor; + +template +static inline void call_triton_poi_fused_convolution_0( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_0', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 12 + 
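+        # ynumel = N*C = 4*3 rows and xnumel = H*W = 64 columns: this kernel repacks
+        # the contiguous NCHW input into the channels-last layout (strides
+        # {192, 1, 24, 3}) consumed by the convolution below; xmask/ymask guard the
+        # ragged tail of each block.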
xnumel = 64 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = (yindex % 3) + y1 = yindex // 3 + tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last') + tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (64 - 1)) / (64)); + uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_0 == nullptr) { + kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_); + } + CUdeviceptr var_0 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_1 = reinterpret_cast(out_ptr0.data_ptr()); + int var_2 = ynumel; + int var_3 = xnumel; + CUdeviceptr global_scratch_4 = 0; + void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4}; + launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused_convolution_1( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_1', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 15 + xnumel = 9 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + 
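+        # This kernel repacks the (5, 3, 3, 3) convolution weight into channels-last
+        # order (strides {27, 1, 9, 3}); ymask/xmask below zero out lanes past
+        # ynumel=15 and xnumel=9, since the launch grid rounds each dimension up to
+        # the 16x16 block size.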
ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = (yindex % 3) + y1 = yindex // 3 + tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last') + tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (16 - 1)) / (16)); + uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_1 == nullptr) { + kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_); + } + CUdeviceptr var_5 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_6 = reinterpret_cast(out_ptr0.data_ptr()); + int var_7 = ynumel; + int var_8 = xnumel; + CUdeviceptr global_scratch_9 = 0; + void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9}; + launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused_convolution_2( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_2', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 20 + xnumel = 64 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = 
xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y0 = (yindex % 5) + y1 = yindex // 5 + y3 = yindex + tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last') + tmp1 = y0 + tmp2 = tl.full([1, 1], 2, tl.int64) + tmp3 = tmp1 < tmp2 + tmp4 = tl.full([1, 1], 1, tl.int64) + tmp5 = tmp1 < tmp4 + tmp6 = -0.16373057663440704 + tmp7 = 0.04603243246674538 + tmp8 = tl.where(tmp5, tmp6, tmp7) + tmp9 = tl.full([1, 1], 3, tl.int64) + tmp10 = tmp1 < tmp9 + tmp11 = tl.full([1, 1], 4, tl.int64) + tmp12 = tmp1 < tmp11 + tmp13 = 0.16525162756443024 + tmp14 = 0.022457100450992584 + tmp15 = tl.where(tmp12, tmp13, tmp14) + tmp16 = -0.08230065554380417 + tmp17 = tl.where(tmp10, tmp16, tmp15) + tmp18 = tl.where(tmp3, tmp8, tmp17) + tmp19 = tmp0 + tmp18 + tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); + uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_2 == nullptr) { + kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_); + } + CUdeviceptr var_10 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_11 = reinterpret_cast(out_ptr0.data_ptr()); + int var_12 = ynumel; + int var_13 = xnumel; + CUdeviceptr global_scratch_14 = 0; + void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14}; + launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_); +} + +namespace torch::aot_inductor { + +void AOTInductorModel::_const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) {} + +AOTI_NOINLINE static void check_input_0( + AtenTensorHandle* input_handles +) { + ConstantHandle arg2_1 = ConstantHandle(input_handles[0]); + int32_t arg2_1_dtype; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype)); + + int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32(); + if (arg2_1_expected_dtype != arg2_1_dtype) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dtype, " + << "expected: " << arg2_1_expected_dtype << "(at::kFloat), " + << "but got: " << arg2_1_dtype << "\n"; + throw std::runtime_error(ss.str()); + } + auto arg2_1_size = arg2_1.sizes(); + + if (4 != arg2_1_size[0]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 0, " + << "expected: 4, " << "but got: " << arg2_1_size[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (3 != arg2_1_size[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 1, " + << "expected: 3, " << "but got: " << arg2_1_size[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_size[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 2, " + << "expected: 8, " << "but got: " << arg2_1_size[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_size[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 3, " + << "expected: 8, " << "but got: " << arg2_1_size[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + auto arg2_1_stride = arg2_1.strides(); + + if (192 != arg2_1_stride[0]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 
0, " + << "expected: 192, " << "but got: " << arg2_1_stride[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (64 != arg2_1_stride[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 1, " + << "expected: 64, " << "but got: " << arg2_1_stride[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_stride[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 2, " + << "expected: 8, " << "but got: " << arg2_1_stride[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (1 != arg2_1_stride[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 3, " + << "expected: 1, " << "but got: " << arg2_1_stride[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + int32_t arg2_1_device_type; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); + + int32_t arg2_1_expected_device_type = 1; + if (arg2_1_expected_device_type != arg2_1_device_type) { + std::stringstream ss; + ss << "input_handles[0]: unmatched device type, " + << "expected: " << arg2_1_expected_device_type << "1(cuda), " + << "but got: " << arg2_1_device_type << "\n"; + throw std::runtime_error(ss.str()); + } +} + +static bool _check_aoti_runtime_check_inputs_env() { + const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); + const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; + return result; +} + +AOTI_NOINLINE static void __check_inputs_outputs( + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + if (!_check_aoti_runtime_check_inputs_env()){ + return; + } + check_input_0(input_handles); +} + +void AOTInductorModel::run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) { + __check_inputs_outputs(input_handles, output_handles); + + auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); + auto arg2_1 = std::move(inputs[0]); + [[maybe_unused]] auto& conv_weight = constants_->at(0); + + if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { + AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit."); + AtenTensorHandle arg2_1_aligned; + aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned); + arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned)); + } + inputs.clear(); + [[maybe_unused]] auto& kernels = static_cast(*this->kernels_.get()); + + AOTICudaStreamGuard stream_guard(stream, this->device_idx_); + static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L}; + static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L}; + AtenTensorHandle buf0_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle)); + RAIIAtenTensorHandle buf0(buf0_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); + arg2_1.reset(); + static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L}; + static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L}; + AtenTensorHandle buf1_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle)); + RAIIAtenTensorHandle buf1(buf1_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + AtenTensorHandle buf2_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle)); + RAIIAtenTensorHandle buf2(buf2_handle); + buf0.reset(); + buf1.reset(); + static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L}; + static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L}; + AtenTensorHandle buf3_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle)); + RAIIAtenTensorHandle buf3(buf3_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf2.reset(); + output_handles[0] = buf3.release(); +} // AOTInductorModel::run_impl +} // namespace torch::aot_inductor + + + + +// Compile cmd +// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o +// Link cmd +// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms/clxvzwn2a5v7ypw7eq6fysn2555bpqqp3ckvq4a6v5o6aba2rxov.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json b/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json new file mode 100644 index 00000000000..bd5d2c60334 --- /dev/null +++ b/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json @@ -0,0 +1 @@ +{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file diff --git a/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin b/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin new file mode 100644 index 0000000000000000000000000000000000000000..1be0cd3083897a28e082defde99cc1da6a9ef442 GIT binary patch literal 10936 zcmeHNTWlQF89qC+zQ%ZyfYP9ZW)k8O+I046dlS-N2uYC|L`vWVM2Fekv3JROcQP|x z)-frYv}&tf0MVyD74->G9{SJ+T2*Zx03n1BQq@*{OA)H7HjQkPRFz8Ee*d|QXLs#Z zI8H<*j^a7zzhBOO&Y3xzXO12}rp02}0VbNpe$85POkMd-0hITO{?VHrWm7EA`tiJl z#TYaFvLASXUBF27^Roixz2%o{r6x0L4YN|4t}`=mnt>U3m7rd;+^DZKc=u2Ju1c)lVj?Yi)y?Yg^LO(g{%$oJ-Y2OJ%$;(Bh=vW9?0Cm%Ta zfF<~?lx0^dHcA)O5M3$OUJJWQN0IQN6L>r`YPtNst(|a1b*}x7wRSdS6IDz6`Yyf< zmZZ)$_bki=j*#6EY+Q~H18=;zH9`?X3wWJv>A}~5k45mCBu9huCwln5E%_Cn()a@B zS6-}${g~tXRuFRZHpln(;J0Yxr#e+v$fr4ecm*LxKLCE~=16{|sdqU4%{}~WG1QJL|dG(%gJ?@~KK;E+~5L^rY#|v)Hfx{KMD#f45Dpn6guK9nU9< z_R(*C|62bqe~zezkZaN6F?~yH`RZl;jJ7#;RoAqOEVigO0qDAxis>;tE-&lPXhw`b z7B4R^US7O<2?Ss?dMs!cV==uC!{tlMOL0wqixGM*WFS`9If71mH-^Oi7Pd9ML^@Lq zbi^~JZ;R3Ntk$ZJ33;A-4R- z(&@*IFBuC5#`hcaH!->2$Wx!kpTte3fXjiNI(yfD(?!R}W9iBNsV`6ZGSGiO*dx37 zEdJ60pU=S#VyyU;Le()$rxtkg#s@gZ>`wZeIE`(+WMsxC*aCqwD06&<8AD_4 zp~=HoyBt+Oh(lxEpqbTO8X1QYea`xa8$Y7dGkLNR!`Fxfl^3?2RaMXYft}H@|5^&ZC z0+iy!R0=aVm&Uvdr)J;^nXA@ADiw=IJO?JAZmtj#M6GaVGjM^+Jyj_=ilHPu0f#cE z))A>JQPR0Sx#c)CLyDKFZ^vSZ;bIWQxEbEk7*;8_tk4}pqC%&<(jh)k4J 
zZ-A(@bp!~ho&YhUT>-)}$E+hjULF%+fD@4bC%XgWYobJpVzn7eV@a$dLK5om>$F5Gd(|LSb$z@#rOm!4NSNrF$1sKKm`Lt$Iu(YQ z&P3y+t%JuoE%vtzpLeRH>Fi`?^2qo-N5poe=)9Pzt|D~ipY{UNc5x}#G^c|zrefw9 z&WvN{T7yCCU)b9C-loL3%RJv$OSI`pLRM_9Gz0LPYCxWxS=zRyv&Zr&YS31yG|cI_ zYBiMP+K3Gs8tS))&eyY~=b?bMVP%zXwZe8bBX(#-;@Qe@f3uXz=~>(KYM~@gR+h{2 zRN!3i;^62kSHp8bg(u$|(s*ycRh(;99jVae{%3jid89m8AK${EMOqHiipCQ~KDcxi zb*d5xf%z47;)$rDGVD}E&br8ebTAV-7$^rt8o3j>)DiB7vK;b=baL%0Ji@)2t#aq` zlw^5MvX~I=eS|DBD-w^a3eROVW~6duUjIU+yobKF@sTt zoM3>FPCaxINA-wQa$mTJi>j3=%F?SWj-}3%GkR9;H~2ZqO<`%CMJ&v7f>1;5AlJp; zj-6VGUMA0i#q=G10uIUBN4rq)oCe;)*VWGU=;OcitW(>%T(~Ymy>ZRjMG9?bVF+1_ zCkNS0Fif|U@hxIZe#Ey%#*=Z@BHU+2h=kwGQjA^M9m0KVA7hJKLO9Nh4!X{^cfkCD zC?X$YgB^5k1a$ob?jLuwS$t=lP2ox}z8Us$7em)E^rzpNXYt`dHj43P4daP8Y2uCU z5NYs{?EfwM57A#1EV!s*Ja!4P+t$D-Z#^&vlF&G-QE$BF*_F31lKhVp+I1>7I! zx9vMVJHp;0eZySeu&^H+YmRVugv%$ibMwQ)q5QrCu{Uq2>IR} zXxb~U-!TaY@PmMLJ%r)kpf75c8K9>|0gvLIAB|wt$0p<(`o7NYUDV-!T*Aj&k^kvYKL?;;Zf9UL;1T*-8X&q*4is#>WiR=3chvA>GeU$iD=&y1(q?3QV(N76` z;9pZmxwS+|K|lEm1z7a>gTI5#grv84{KiW=)PZ#;%{_i5VpXY~Q^54@&#ji1d-%Jubpph;~zunV^iJ$M5ZzN*3d)g+V(>_b} zH{|u>cMfXj2WUMFc>`@B4;Xah6XkJXid{s$-%n6}QSSsJzXW@%pVv=|<_G*96APk+ z`9OXK_7NHU)IcA|6Pe%^qxJEvdB*Fne+M>H#PfjKFIY=we@(D2&wt2hpKQ0kp|cAK zwmqan-zN~ZtRs)ycaL@K;Yab~EFYeD=;#sah}P zb0aK#cxZ>f?3DyH4e>5UxsAG0UHB)Y{LdotGE9DFn;?W_6-$$P{| zx)GwuReYOyK5CCoh5DdV0;-quq(0iBa(r&r$fueg)kozhK9xshM)fMMs#kLV E0n);sHUIzs literal 0 HcmV?d00001 diff --git a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp new file mode 100644 index 00000000000..cc963cd88f0 --- /dev/null +++ b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp @@ -0,0 +1,6144 @@ + +#include +// Definition of AOTI runtime interface functions + +#include +#include + +#include +#include + +#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ + try { \ + __VA_ARGS__ \ + } catch (const std::exception& e) { \ + std::cerr << "Error: " << e.what() << '\n'; \ + return AOTI_RUNTIME_FAILURE; \ + } catch (...) { \ + std::cerr << "Unknown exception occurred.\n"; \ + return AOTI_RUNTIME_FAILURE; \ + } \ + return AOTI_RUNTIME_SUCCESS; + +#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ + do { \ + AOTI_RUNTIME_CHECK( \ + actual_size == expected_size, \ + "expected " + std::string(name) + " vector size to be " + \ + std::to_string(expected_size) + ", but got " + \ + std::to_string(actual_size)); \ + } while (0) + +// AOTInductor uses at::addmm_out, which doesn't supports +// arguments that requires gradient. For this reason, we +// enforce no_grad context for run APIs. +// +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct AOTINoGradGuard { + AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(false); + } + AOTINoGradGuard(const AOTINoGradGuard&) = delete; + AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; + ~AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(prev_mode); + } + AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; + AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; + bool prev_mode{aoti_torch_grad_mode_is_enabled()}; +}; + +extern "C" { + +AOTIRuntimeError AOTInductorModelContainerCreate( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + bool is_cpu, + const char* cubin_dir) { + return AOTInductorModelContainerCreateWithDevice( + container_handle, + num_models, + is_cpu ? 
"cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0\n"; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_single_threaded( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** 
original_fqn) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *original_fqn = container->constant_original_fqn(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( + AOTInductorModelContainerHandle container_handle, + size_t idx, + bool* from_folded) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantType( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* type) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* dtype) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *dtype = container->constant_dtype(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( + AOTInductorModelContainerHandle container_handle, + size_t idx, + size_t* data_size) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *data_size = container->constant_data_size(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive) { + auto* container = + reinterpret_cast( + container_handle); + auto constants_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { const auto ret = container->extract_constants_map(use_inactive); + for (const auto& pair: ret) { + constants_map->emplace(pair.first, pair.second); + } + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update, /* user_managed = */ true); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle) { + return AOTInductorModelContainerUpdateConstantBuffer(container_handle, + constant_map_handle, + /*use_inactive*/ true, + /*validate_full_update*/ true); +} + +AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->free_inactive_constant_buffer(); + }) +} + +AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( + AOTInductorModelContainerHandle container_handle, + bool use_inactive, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_const_fold(use_inactive, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->swap_constant_buffer(); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumInputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_inputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_inputs = container->num_inputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetInputName( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** ret_input_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_input_names = container->input_name(input_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_outputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_outputs = container->num_outputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetOutputName( + AOTInductorModelContainerHandle container_handle, + size_t output_idx, + const char** ret_output_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_output_names = container->output_name(output_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetCallSpec( + AOTInductorModelContainerHandle container_handle, + const char** in_spec, + const char** out_spec) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + *in_spec = container->get_in_spec(); + *out_spec = container->get_out_spec(); + }) +} + +AOTIRuntimeError AOTInductorModelCreate( + AOTInductorModelHandle* model_handle, + AOTInductorConstantMapHandle constant_map_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto constant_array = std::make_shared>(); + auto input_map = reinterpret_cast*>(constant_map_handle); + + auto model = new torch::aot_inductor::AOTInductorModel( + constant_map, + constant_array, + "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models + "" + ); + + if (input_map) { + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + } else { + model->load_constants(); + } + + *model_handle = reinterpret_cast(model); + })} + +AOTIRuntimeError AOTInductorModelRun( + AOTInductorModelHandle model_handle, + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + model->run_impl( + input_handles, + output_handles, + (torch::aot_inductor::DeviceStreamType) nullptr, + nullptr); + }) +} + +AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = 
reinterpret_cast( + model_handle); + delete model; + })} + +AOTIRuntimeError AOTInductorModelGetNumOutputs( + AOTInductorModelHandle model_handle, + size_t* ret_num_outputs) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = reinterpret_cast(model_handle); + *ret_num_outputs = model->num_outputs(); + }) +} + +AOTIRuntimeError AOTInductorModelUpdateConstantsMap( + AOTInductorModelHandle model_handle, + AOTInductorConstantMapHandle constant_map_handle) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto input_map = + reinterpret_cast*>( + constant_map_handle); + + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + model->update_constants_map(std::move(constant_map)); + }) +} + +} // extern "C" + + +#define CUDA_DRIVER_CHECK(EXPR) \ +do { \ + CUresult code = EXPR; \ + const char *msg; \ + CUresult code_get_error = cuGetErrorString(code, &msg); \ + if (code_get_error != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string("invalid error code!")); \ + } \ + if (code != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string(msg)); \ + } \ +} while (0); + +static inline CUfunction loadKernel( + std::string filePath, + const std::string &funcName, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline void launchKernel( + CUfunction func, + uint32_t gridX, + uint32_t gridY, + uint32_t gridZ, + uint32_t numWarps, + uint32_t sharedMemBytes, + void* args[], + cudaStream_t stream) { + CUDA_DRIVER_CHECK(cuLaunchKernel( + func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr + )); +} +CACHE_TORCH_DTYPE(float32); +CACHE_TORCH_DEVICE(cuda); +CACHE_TORCH_LAYOUT(strided); +namespace torch::aot_inductor { +namespace { +class AOTInductorModelKernels : public AOTInductorModelKernelsBase { + public: + CUfunction triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_10{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_14{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_17{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_21{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_24{nullptr}; + CUfunction triton_poi_fused__native_batch_norm_legit_no_training_3{nullptr}; + CUfunction 
+
+
+AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
+                                   std::shared_ptr<std::vector<ConstantHandle>> constants_array,
+                                   const std::string& device_str,
+                                   std::optional<std::string> cubin_dir)
+    : AOTInductorModelBase(1,
+                           1,
+                           262,
+                           device_str,
+                           std::move(cubin_dir),
+                           true) {
+    inputs_info_[0].name = "arg262_1";
+    constants_info_[0].name = "mv2_features_0_0_weight";
+    constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[0].offset = 0;
+    constants_info_[0].data_size = 3456;
+    constants_info_[0].from_folded = false;
+    constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[0].shape = {32, 3, 3, 3};
+    constants_info_[0].stride = {27, 9, 3, 1};
+    constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[0].original_fqn = "mv2.features.0.0.weight";
+    constants_info_[1].name = "mv2_features_0_1_weight";
+    constants_info_[1].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[1].offset = 0;
+    constants_info_[1].data_size = 128;
+    constants_info_[1].from_folded = false;
+    constants_info_[1].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[1].shape = {32};
+    constants_info_[1].stride = {1};
+    constants_info_[1].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[1].original_fqn = "mv2.features.0.1.weight";
+    constants_info_[2].name = "mv2_features_0_1_bias";
+    constants_info_[2].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[2].offset = 0;
+    constants_info_[2].data_size = 128;
+    constants_info_[2].from_folded = false;
+    constants_info_[2].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[2].shape = {32};
+    constants_info_[2].stride = {1};
+    constants_info_[2].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[2].original_fqn = "mv2.features.0.1.bias";
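+    // Editorial note (not generated output): each constants_info_ entry
+    // describes one contiguous float32 parameter, so data_size is
+    // numel * sizeof(float). Checking entry 0 above:
+    //   numel  = 32 * 3 * 3 * 3 = 864
+    //   bytes  = 864 * 4 = 3456                       // matches data_size
+    //   stride = {3*3*3, 3*3, 3, 1} = {27, 9, 3, 1}   // row-major contiguous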
+    constants_info_[3].name = "mv2_features_1_conv_0_0_weight";
+    constants_info_[3].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[3].offset = 0;
+    constants_info_[3].data_size = 1152;
+    constants_info_[3].from_folded = false;
+    constants_info_[3].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[3].shape = {32, 1, 3, 3};
+    constants_info_[3].stride = {9, 9, 3, 1};
+    constants_info_[3].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[3].original_fqn = "mv2.features.1.conv.0.0.weight";
+    constants_info_[4].name = "mv2_features_1_conv_0_1_weight";
+    constants_info_[4].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[4].offset = 0;
+    constants_info_[4].data_size = 128;
+    constants_info_[4].from_folded = false;
+    constants_info_[4].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[4].shape = {32};
+    constants_info_[4].stride = {1};
+    constants_info_[4].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[4].original_fqn = "mv2.features.1.conv.0.1.weight";
+    constants_info_[5].name = "mv2_features_1_conv_0_1_bias";
+    constants_info_[5].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[5].offset = 0;
+    constants_info_[5].data_size = 128;
+    constants_info_[5].from_folded = false;
+    constants_info_[5].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[5].shape = {32};
+    constants_info_[5].stride = {1};
+    constants_info_[5].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[5].original_fqn = "mv2.features.1.conv.0.1.bias";
+    constants_info_[6].name = "mv2_features_1_conv_1_weight";
+    constants_info_[6].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[6].offset = 0;
+    constants_info_[6].data_size = 2048;
+    constants_info_[6].from_folded = false;
+    constants_info_[6].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[6].shape = {16, 32, 1, 1};
+    constants_info_[6].stride = {32, 1, 1, 1};
+    constants_info_[6].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[6].original_fqn = "mv2.features.1.conv.1.weight";
+    constants_info_[7].name = "mv2_features_1_conv_2_weight";
+    constants_info_[7].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[7].offset = 0;
+    constants_info_[7].data_size = 64;
+    constants_info_[7].from_folded = false;
+    constants_info_[7].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[7].shape = {16};
+    constants_info_[7].stride = {1};
+    constants_info_[7].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[7].original_fqn = "mv2.features.1.conv.2.weight";
+    constants_info_[8].name = "mv2_features_1_conv_2_bias";
+    constants_info_[8].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[8].offset = 0;
+    constants_info_[8].data_size = 64;
+    constants_info_[8].from_folded = false;
+    constants_info_[8].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[8].shape = {16};
+    constants_info_[8].stride = {1};
+    constants_info_[8].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[8].original_fqn = "mv2.features.1.conv.2.bias";
+    constants_info_[9].name = "mv2_features_2_conv_0_0_weight";
+    constants_info_[9].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[9].offset = 0;
+    constants_info_[9].data_size = 6144;
+    constants_info_[9].from_folded = false;
+    constants_info_[9].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[9].shape = {96, 16, 1, 1};
+    constants_info_[9].stride = {16, 1, 1, 1};
+    constants_info_[9].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[9].original_fqn = "mv2.features.2.conv.0.0.weight";
+    constants_info_[10].name = "mv2_features_2_conv_0_1_weight";
+    constants_info_[10].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[10].offset = 0;
+    constants_info_[10].data_size = 384;
+    constants_info_[10].from_folded = false;
+    constants_info_[10].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[10].shape = {96};
+    constants_info_[10].stride = {1};
+    constants_info_[10].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[10].original_fqn = "mv2.features.2.conv.0.1.weight";
+    constants_info_[11].name = "mv2_features_2_conv_0_1_bias";
+    constants_info_[11].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[11].offset = 0;
+    constants_info_[11].data_size = 384;
+    constants_info_[11].from_folded = false;
+    constants_info_[11].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[11].shape = {96};
+    constants_info_[11].stride = {1};
+    constants_info_[11].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[11].original_fqn = "mv2.features.2.conv.0.1.bias";
+    constants_info_[12].name = "mv2_features_2_conv_1_0_weight";
+    constants_info_[12].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[12].offset = 0;
+    constants_info_[12].data_size = 3456;
+    constants_info_[12].from_folded = false;
+    constants_info_[12].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[12].shape = {96, 1, 3, 3};
+    constants_info_[12].stride = {9, 9, 3, 1};
+    constants_info_[12].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[12].original_fqn = "mv2.features.2.conv.1.0.weight";
+    constants_info_[13].name = "mv2_features_2_conv_1_1_weight";
+    constants_info_[13].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[13].offset = 0;
+    constants_info_[13].data_size = 384;
+    constants_info_[13].from_folded = false;
+    constants_info_[13].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[13].shape = {96};
+    constants_info_[13].stride = {1};
+    constants_info_[13].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[13].original_fqn = "mv2.features.2.conv.1.1.weight";
+    constants_info_[14].name = "mv2_features_2_conv_1_1_bias";
+    constants_info_[14].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[14].offset = 0;
+    constants_info_[14].data_size = 384;
+    constants_info_[14].from_folded = false;
+    constants_info_[14].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[14].shape = {96};
+    constants_info_[14].stride = {1};
+    constants_info_[14].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[14].original_fqn = "mv2.features.2.conv.1.1.bias";
+    constants_info_[15].name = "mv2_features_2_conv_2_weight";
+    constants_info_[15].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[15].offset = 0;
+    constants_info_[15].data_size = 9216;
+    constants_info_[15].from_folded = false;
+    constants_info_[15].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[15].shape = {24, 96, 1, 1};
+    constants_info_[15].stride = {96, 1, 1, 1};
+    constants_info_[15].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[15].original_fqn = "mv2.features.2.conv.2.weight";
+    constants_info_[16].name = "mv2_features_2_conv_3_weight";
+    constants_info_[16].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[16].offset = 0;
+    constants_info_[16].data_size = 96;
+    constants_info_[16].from_folded = false;
+    constants_info_[16].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[16].shape = {24};
+    constants_info_[16].stride = {1};
+    constants_info_[16].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[16].original_fqn = "mv2.features.2.conv.3.weight";
+    constants_info_[17].name = "mv2_features_2_conv_3_bias";
+    constants_info_[17].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[17].offset = 0;
+    constants_info_[17].data_size = 96;
+    constants_info_[17].from_folded = false;
+    constants_info_[17].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[17].shape = {24};
+    constants_info_[17].stride = {1};
+    constants_info_[17].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[17].original_fqn = "mv2.features.2.conv.3.bias";
+    constants_info_[18].name = "mv2_features_3_conv_0_0_weight";
+    constants_info_[18].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[18].offset = 0;
+    constants_info_[18].data_size = 13824;
+    constants_info_[18].from_folded = false;
+    constants_info_[18].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[18].shape = {144, 24, 1, 1};
+    constants_info_[18].stride = {24, 1, 1, 1};
+    constants_info_[18].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[18].original_fqn = "mv2.features.3.conv.0.0.weight";
+    constants_info_[19].name = "mv2_features_3_conv_0_1_weight";
+    constants_info_[19].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[19].offset = 0;
+    constants_info_[19].data_size = 576;
+    constants_info_[19].from_folded = false;
+    constants_info_[19].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[19].shape = {144};
+    constants_info_[19].stride = {1};
+    constants_info_[19].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[19].original_fqn = "mv2.features.3.conv.0.1.weight";
+    constants_info_[20].name = "mv2_features_3_conv_0_1_bias";
+    constants_info_[20].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[20].offset = 0;
+    constants_info_[20].data_size = 576;
+    constants_info_[20].from_folded = false;
+    constants_info_[20].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[20].shape = {144};
+    constants_info_[20].stride = {1};
+    constants_info_[20].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[20].original_fqn = "mv2.features.3.conv.0.1.bias";
+    constants_info_[21].name = "mv2_features_3_conv_1_0_weight";
+    constants_info_[21].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[21].offset = 0;
+    constants_info_[21].data_size = 5184;
+    constants_info_[21].from_folded = false;
+    constants_info_[21].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[21].shape = {144, 1, 3, 3};
+    constants_info_[21].stride = {9, 9, 3, 1};
+    constants_info_[21].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[21].original_fqn = "mv2.features.3.conv.1.0.weight";
+    constants_info_[22].name = "mv2_features_3_conv_1_1_weight";
+    constants_info_[22].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[22].offset = 0;
+    constants_info_[22].data_size = 576;
+    constants_info_[22].from_folded = false;
+    constants_info_[22].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[22].shape = {144};
+    constants_info_[22].stride = {1};
+    constants_info_[22].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[22].original_fqn = "mv2.features.3.conv.1.1.weight";
+    constants_info_[23].name = "mv2_features_3_conv_1_1_bias";
+    constants_info_[23].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[23].offset = 0;
+    constants_info_[23].data_size = 576;
+    constants_info_[23].from_folded = false;
+    constants_info_[23].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[23].shape = {144};
+    constants_info_[23].stride = {1};
+    constants_info_[23].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[23].original_fqn = "mv2.features.3.conv.1.1.bias";
+    constants_info_[24].name = "mv2_features_3_conv_2_weight";
+    constants_info_[24].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[24].offset = 0;
+    constants_info_[24].data_size = 13824;
+    constants_info_[24].from_folded = false;
+    constants_info_[24].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[24].shape = {24, 144, 1, 1};
+    constants_info_[24].stride = {144, 1, 1, 1};
+    constants_info_[24].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[24].original_fqn = "mv2.features.3.conv.2.weight";
+    constants_info_[25].name = "mv2_features_3_conv_3_weight";
+    constants_info_[25].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[25].offset = 0;
+    constants_info_[25].data_size = 96;
+    constants_info_[25].from_folded = false;
+    constants_info_[25].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[25].shape = {24};
+    constants_info_[25].stride = {1};
+    constants_info_[25].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[25].original_fqn = "mv2.features.3.conv.3.weight";
+    constants_info_[26].name = "mv2_features_3_conv_3_bias";
+    constants_info_[26].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[26].offset = 0;
+    constants_info_[26].data_size = 96;
+    constants_info_[26].from_folded = false;
+    constants_info_[26].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[26].shape = {24};
+    constants_info_[26].stride = {1};
+    constants_info_[26].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[26].original_fqn = "mv2.features.3.conv.3.bias";
+    constants_info_[27].name = "mv2_features_4_conv_0_0_weight";
+    constants_info_[27].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[27].offset = 0;
+    constants_info_[27].data_size = 13824;
+    constants_info_[27].from_folded = false;
+    constants_info_[27].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[27].shape = {144, 24, 1, 1};
+    constants_info_[27].stride = {24, 1, 1, 1};
+    constants_info_[27].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[27].original_fqn = "mv2.features.4.conv.0.0.weight";
+    constants_info_[28].name = "mv2_features_4_conv_0_1_weight";
+    constants_info_[28].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[28].offset = 0;
+    constants_info_[28].data_size = 576;
+    constants_info_[28].from_folded = false;
+    constants_info_[28].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[28].shape = {144};
+    constants_info_[28].stride = {1};
+    constants_info_[28].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[28].original_fqn = "mv2.features.4.conv.0.1.weight";
+    constants_info_[29].name = "mv2_features_4_conv_0_1_bias";
+    constants_info_[29].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[29].offset = 0;
+    constants_info_[29].data_size = 576;
+    constants_info_[29].from_folded = false;
+    constants_info_[29].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[29].shape = {144};
+    constants_info_[29].stride = {1};
+    constants_info_[29].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[29].original_fqn = "mv2.features.4.conv.0.1.bias";
+    constants_info_[30].name = "mv2_features_4_conv_1_0_weight";
+    constants_info_[30].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[30].offset = 0;
+    constants_info_[30].data_size = 5184;
+    constants_info_[30].from_folded = false;
+    constants_info_[30].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[30].shape = {144, 1, 3, 3};
+    constants_info_[30].stride = {9, 9, 3, 1};
+    constants_info_[30].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[30].original_fqn = "mv2.features.4.conv.1.0.weight";
+    constants_info_[31].name = "mv2_features_4_conv_1_1_weight";
+    constants_info_[31].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[31].offset = 0;
+    constants_info_[31].data_size = 576;
+    constants_info_[31].from_folded = false;
+    constants_info_[31].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[31].shape = {144};
+    constants_info_[31].stride = {1};
+    constants_info_[31].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[31].original_fqn = "mv2.features.4.conv.1.1.weight";
+    constants_info_[32].name = "mv2_features_4_conv_1_1_bias";
+    constants_info_[32].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[32].offset = 0;
+    constants_info_[32].data_size = 576;
+    constants_info_[32].from_folded = false;
+    constants_info_[32].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[32].shape = {144};
+    constants_info_[32].stride = {1};
+    constants_info_[32].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[32].original_fqn = "mv2.features.4.conv.1.1.bias";
+    constants_info_[33].name = "mv2_features_4_conv_2_weight";
+    constants_info_[33].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[33].offset = 0;
+    constants_info_[33].data_size = 18432;
+    constants_info_[33].from_folded = false;
+    constants_info_[33].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[33].shape = {32, 144, 1, 1};
+    constants_info_[33].stride = {144, 1, 1, 1};
+    constants_info_[33].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[33].original_fqn = "mv2.features.4.conv.2.weight";
+    constants_info_[34].name = "mv2_features_4_conv_3_weight";
+    constants_info_[34].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[34].offset = 0;
+    constants_info_[34].data_size = 128;
+    constants_info_[34].from_folded = false;
+    constants_info_[34].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[34].shape = {32};
+    constants_info_[34].stride = {1};
+    constants_info_[34].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[34].original_fqn = "mv2.features.4.conv.3.weight";
+    constants_info_[35].name = "mv2_features_4_conv_3_bias";
+    constants_info_[35].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[35].offset = 0;
+    constants_info_[35].data_size = 128;
+    constants_info_[35].from_folded = false;
+    constants_info_[35].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[35].shape = {32};
+    constants_info_[35].stride = {1};
+    constants_info_[35].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[35].original_fqn = "mv2.features.4.conv.3.bias";
+    constants_info_[36].name = "mv2_features_5_conv_0_0_weight";
+    constants_info_[36].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[36].offset = 0;
+    constants_info_[36].data_size = 24576;
+    constants_info_[36].from_folded = false;
+    constants_info_[36].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[36].shape = {192, 32, 1, 1};
+    constants_info_[36].stride = {32, 1, 1, 1};
+    constants_info_[36].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[36].original_fqn = "mv2.features.5.conv.0.0.weight";
+    constants_info_[37].name = "mv2_features_5_conv_0_1_weight";
+    constants_info_[37].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[37].offset = 0;
+    constants_info_[37].data_size = 768;
+    constants_info_[37].from_folded = false;
+    constants_info_[37].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[37].shape = {192};
+    constants_info_[37].stride = {1};
+    constants_info_[37].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[37].original_fqn = "mv2.features.5.conv.0.1.weight";
+    constants_info_[38].name = "mv2_features_5_conv_0_1_bias";
+    constants_info_[38].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[38].offset = 0;
+    constants_info_[38].data_size = 768;
+    constants_info_[38].from_folded = false;
+    constants_info_[38].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[38].shape = {192};
+    constants_info_[38].stride = {1};
+    constants_info_[38].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[38].original_fqn = "mv2.features.5.conv.0.1.bias";
+    constants_info_[39].name = "mv2_features_5_conv_1_0_weight";
+    constants_info_[39].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[39].offset = 0;
+    constants_info_[39].data_size = 6912;
+    constants_info_[39].from_folded = false;
+    constants_info_[39].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[39].shape = {192, 1, 3, 3};
+    constants_info_[39].stride = {9, 9, 3, 1};
+    constants_info_[39].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[39].original_fqn = "mv2.features.5.conv.1.0.weight";
+    constants_info_[40].name = "mv2_features_5_conv_1_1_weight";
+    constants_info_[40].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[40].offset = 0;
+    constants_info_[40].data_size = 768;
+    constants_info_[40].from_folded = false;
+    constants_info_[40].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[40].shape = {192};
+    constants_info_[40].stride = {1};
+    constants_info_[40].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[40].original_fqn = "mv2.features.5.conv.1.1.weight";
+    constants_info_[41].name = "mv2_features_5_conv_1_1_bias";
+    constants_info_[41].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[41].offset = 0;
+    constants_info_[41].data_size = 768;
+    constants_info_[41].from_folded = false;
+    constants_info_[41].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[41].shape = {192};
+    constants_info_[41].stride = {1};
+    constants_info_[41].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[41].original_fqn = "mv2.features.5.conv.1.1.bias";
+    constants_info_[42].name = "mv2_features_5_conv_2_weight";
+    constants_info_[42].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[42].offset = 0;
+    constants_info_[42].data_size = 24576;
+    constants_info_[42].from_folded = false;
+    constants_info_[42].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[42].shape = {32, 192, 1, 1};
+    constants_info_[42].stride = {192, 1, 1, 1};
+    constants_info_[42].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[42].original_fqn = "mv2.features.5.conv.2.weight";
+    constants_info_[43].name = "mv2_features_5_conv_3_weight";
+    constants_info_[43].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[43].offset = 0;
+    constants_info_[43].data_size = 128;
+    constants_info_[43].from_folded = false;
+    constants_info_[43].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[43].shape = {32};
+    constants_info_[43].stride = {1};
+    constants_info_[43].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[43].original_fqn = "mv2.features.5.conv.3.weight";
+    constants_info_[44].name = "mv2_features_5_conv_3_bias";
+    constants_info_[44].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[44].offset = 0;
+    constants_info_[44].data_size = 128;
+    constants_info_[44].from_folded = false;
+    constants_info_[44].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[44].shape = {32};
+    constants_info_[44].stride = {1};
+    constants_info_[44].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[44].original_fqn = "mv2.features.5.conv.3.bias";
+    constants_info_[45].name = "mv2_features_6_conv_0_0_weight";
+    constants_info_[45].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[45].offset = 0;
+    constants_info_[45].data_size = 24576;
+    constants_info_[45].from_folded = false;
+    constants_info_[45].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[45].shape = {192, 32, 1, 1};
+    constants_info_[45].stride = {32, 1, 1, 1};
+    constants_info_[45].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[45].original_fqn = "mv2.features.6.conv.0.0.weight";
+    constants_info_[46].name = "mv2_features_6_conv_0_1_weight";
+    constants_info_[46].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[46].offset = 0;
+    constants_info_[46].data_size = 768;
+    constants_info_[46].from_folded = false;
+    constants_info_[46].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[46].shape = {192};
+    constants_info_[46].stride = {1};
+    constants_info_[46].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[46].original_fqn = "mv2.features.6.conv.0.1.weight";
+    constants_info_[47].name = "mv2_features_6_conv_0_1_bias";
+    constants_info_[47].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[47].offset = 0;
+    constants_info_[47].data_size = 768;
+    constants_info_[47].from_folded = false;
+    constants_info_[47].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[47].shape = {192};
+    constants_info_[47].stride = {1};
+    constants_info_[47].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[47].original_fqn = "mv2.features.6.conv.0.1.bias";
+    constants_info_[48].name = "mv2_features_6_conv_1_0_weight";
+    constants_info_[48].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[48].offset = 0;
+    constants_info_[48].data_size = 6912;
+    constants_info_[48].from_folded = false;
+    constants_info_[48].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[48].shape = {192, 1, 3, 3};
+    constants_info_[48].stride = {9, 9, 3, 1};
+    constants_info_[48].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[48].original_fqn = "mv2.features.6.conv.1.0.weight";
+    constants_info_[49].name = "mv2_features_6_conv_1_1_weight";
+    constants_info_[49].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[49].offset = 0;
+    constants_info_[49].data_size = 768;
+    constants_info_[49].from_folded = false;
+    constants_info_[49].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[49].shape = {192};
+    constants_info_[49].stride = {1};
+    constants_info_[49].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[49].original_fqn = "mv2.features.6.conv.1.1.weight";
+    constants_info_[50].name = "mv2_features_6_conv_1_1_bias";
+    constants_info_[50].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[50].offset = 0;
+    constants_info_[50].data_size = 768;
+    constants_info_[50].from_folded = false;
+    constants_info_[50].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[50].shape = {192};
+    constants_info_[50].stride = {1};
+    constants_info_[50].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[50].original_fqn = "mv2.features.6.conv.1.1.bias";
+    constants_info_[51].name = "mv2_features_6_conv_2_weight";
+    constants_info_[51].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[51].offset = 0;
+    constants_info_[51].data_size = 24576;
+    constants_info_[51].from_folded = false;
+    constants_info_[51].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[51].shape = {32, 192, 1, 1};
+    constants_info_[51].stride = {192, 1, 1, 1};
+    constants_info_[51].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[51].original_fqn = "mv2.features.6.conv.2.weight";
+    constants_info_[52].name = "mv2_features_6_conv_3_weight";
+    constants_info_[52].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[52].offset = 0;
+    constants_info_[52].data_size = 128;
+    constants_info_[52].from_folded = false;
+    constants_info_[52].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[52].shape = {32};
+    constants_info_[52].stride = {1};
+    constants_info_[52].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[52].original_fqn = "mv2.features.6.conv.3.weight";
+    constants_info_[53].name = "mv2_features_6_conv_3_bias";
+    constants_info_[53].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[53].offset = 0;
+    constants_info_[53].data_size = 128;
+    constants_info_[53].from_folded = false;
+    constants_info_[53].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[53].shape = {32};
+    constants_info_[53].stride = {1};
+    constants_info_[53].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[53].original_fqn = "mv2.features.6.conv.3.bias";
+    constants_info_[54].name = "mv2_features_7_conv_0_0_weight";
+    constants_info_[54].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[54].offset = 0;
+    constants_info_[54].data_size = 24576;
+    constants_info_[54].from_folded = false;
+    constants_info_[54].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[54].shape = {192, 32, 1, 1};
+    constants_info_[54].stride = {32, 1, 1, 1};
+    constants_info_[54].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[54].original_fqn = "mv2.features.7.conv.0.0.weight";
+    constants_info_[55].name = "mv2_features_7_conv_0_1_weight";
+    constants_info_[55].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[55].offset = 0;
+    constants_info_[55].data_size = 768;
+    constants_info_[55].from_folded = false;
+    constants_info_[55].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[55].shape = {192};
+    constants_info_[55].stride = {1};
+    constants_info_[55].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[55].original_fqn = "mv2.features.7.conv.0.1.weight";
+    constants_info_[56].name = "mv2_features_7_conv_0_1_bias";
+    constants_info_[56].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[56].offset = 0;
+    constants_info_[56].data_size = 768;
+    constants_info_[56].from_folded = false;
+    constants_info_[56].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[56].shape = {192};
+    constants_info_[56].stride = {1};
+    constants_info_[56].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[56].original_fqn = "mv2.features.7.conv.0.1.bias";
+    constants_info_[57].name = "mv2_features_7_conv_1_0_weight";
+    constants_info_[57].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[57].offset = 0;
+    constants_info_[57].data_size = 6912;
+    constants_info_[57].from_folded = false;
+    constants_info_[57].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[57].shape = {192, 1, 3, 3};
+    constants_info_[57].stride = {9, 9, 3, 1};
+    constants_info_[57].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[57].original_fqn = "mv2.features.7.conv.1.0.weight";
+    constants_info_[58].name = "mv2_features_7_conv_1_1_weight";
+    constants_info_[58].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[58].offset = 0;
+    constants_info_[58].data_size = 768;
+    constants_info_[58].from_folded = false;
+    constants_info_[58].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[58].shape = {192};
+    constants_info_[58].stride = {1};
+    constants_info_[58].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[58].original_fqn = "mv2.features.7.conv.1.1.weight";
+    constants_info_[59].name = "mv2_features_7_conv_1_1_bias";
+    constants_info_[59].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[59].offset = 0;
+    constants_info_[59].data_size = 768;
+    constants_info_[59].from_folded = false;
+    constants_info_[59].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[59].shape = {192};
+    constants_info_[59].stride = {1};
+    constants_info_[59].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[59].original_fqn = "mv2.features.7.conv.1.1.bias";
+    constants_info_[60].name = "mv2_features_7_conv_2_weight";
+    constants_info_[60].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[60].offset = 0;
+    constants_info_[60].data_size = 49152;
+    constants_info_[60].from_folded = false;
+    constants_info_[60].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[60].shape = {64, 192, 1, 1};
+    constants_info_[60].stride = {192, 1, 1, 1};
+    constants_info_[60].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[60].original_fqn = "mv2.features.7.conv.2.weight";
+    constants_info_[61].name = "mv2_features_7_conv_3_weight";
+    constants_info_[61].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[61].offset = 0;
+    constants_info_[61].data_size = 256;
+    constants_info_[61].from_folded = false;
+    constants_info_[61].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[61].shape = {64};
+    constants_info_[61].stride = {1};
+    constants_info_[61].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[61].original_fqn = "mv2.features.7.conv.3.weight";
+    constants_info_[62].name = "mv2_features_7_conv_3_bias";
+    constants_info_[62].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[62].offset = 0;
+    constants_info_[62].data_size = 256;
+    constants_info_[62].from_folded = false;
+    constants_info_[62].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[62].shape = {64};
+    constants_info_[62].stride = {1};
+    constants_info_[62].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[62].original_fqn = "mv2.features.7.conv.3.bias";
+    constants_info_[63].name = "mv2_features_8_conv_0_0_weight";
+    constants_info_[63].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[63].offset = 0;
+    constants_info_[63].data_size = 98304;
+    constants_info_[63].from_folded = false;
+    constants_info_[63].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[63].shape = {384, 64, 1, 1};
+    constants_info_[63].stride = {64, 1, 1, 1};
+    constants_info_[63].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[63].original_fqn = "mv2.features.8.conv.0.0.weight";
+    constants_info_[64].name = "mv2_features_8_conv_0_1_weight";
+    constants_info_[64].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[64].offset = 0;
+    constants_info_[64].data_size = 1536;
+    constants_info_[64].from_folded = false;
+    constants_info_[64].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[64].shape = {384};
+    constants_info_[64].stride = {1};
+    constants_info_[64].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[64].original_fqn = "mv2.features.8.conv.0.1.weight";
+    constants_info_[65].name = "mv2_features_8_conv_0_1_bias";
+    constants_info_[65].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[65].offset = 0;
+    constants_info_[65].data_size = 1536;
+    constants_info_[65].from_folded = false;
+    constants_info_[65].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[65].shape = {384};
+    constants_info_[65].stride = {1};
+    constants_info_[65].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[65].original_fqn = "mv2.features.8.conv.0.1.bias";
+    constants_info_[66].name = "mv2_features_8_conv_1_0_weight";
+    constants_info_[66].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[66].offset = 0;
+    constants_info_[66].data_size = 13824;
+    constants_info_[66].from_folded = false;
+    constants_info_[66].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[66].shape = {384, 1, 3, 3};
+    constants_info_[66].stride = {9, 9, 3, 1};
+    constants_info_[66].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[66].original_fqn = "mv2.features.8.conv.1.0.weight";
+    constants_info_[67].name = "mv2_features_8_conv_1_1_weight";
+    constants_info_[67].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[67].offset = 0;
+    constants_info_[67].data_size = 1536;
+    constants_info_[67].from_folded = false;
+    constants_info_[67].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[67].shape = {384};
+    constants_info_[67].stride = {1};
+    constants_info_[67].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[67].original_fqn = "mv2.features.8.conv.1.1.weight";
+    constants_info_[68].name = "mv2_features_8_conv_1_1_bias";
+    constants_info_[68].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[68].offset = 0;
+    constants_info_[68].data_size = 1536;
+    constants_info_[68].from_folded = false;
+    constants_info_[68].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[68].shape = {384};
+    constants_info_[68].stride = {1};
+    constants_info_[68].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[68].original_fqn = "mv2.features.8.conv.1.1.bias";
+    constants_info_[69].name = "mv2_features_8_conv_2_weight";
+    constants_info_[69].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[69].offset = 0;
+    constants_info_[69].data_size = 98304;
+    constants_info_[69].from_folded = false;
+    constants_info_[69].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[69].shape = {64, 384, 1, 1};
+    constants_info_[69].stride = {384, 1, 1, 1};
+    constants_info_[69].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[69].original_fqn = "mv2.features.8.conv.2.weight";
+    constants_info_[70].name = "mv2_features_8_conv_3_weight";
+    constants_info_[70].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[70].offset = 0;
+    constants_info_[70].data_size = 256;
+    constants_info_[70].from_folded = false;
+    constants_info_[70].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[70].shape = {64};
+    constants_info_[70].stride = {1};
+    constants_info_[70].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[70].original_fqn = "mv2.features.8.conv.3.weight";
+    constants_info_[71].name = "mv2_features_8_conv_3_bias";
+    constants_info_[71].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[71].offset = 0;
+    constants_info_[71].data_size = 256;
+    constants_info_[71].from_folded = false;
+    constants_info_[71].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[71].shape = {64};
+    constants_info_[71].stride = {1};
+    constants_info_[71].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[71].original_fqn = "mv2.features.8.conv.3.bias";
+    constants_info_[72].name = "mv2_features_9_conv_0_0_weight";
+    constants_info_[72].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[72].offset = 0;
+    constants_info_[72].data_size = 98304;
+    constants_info_[72].from_folded = false;
+    constants_info_[72].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[72].shape = {384, 64, 1, 1};
+    constants_info_[72].stride = {64, 1, 1, 1};
+    constants_info_[72].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[72].original_fqn = "mv2.features.9.conv.0.0.weight";
+    constants_info_[73].name = "mv2_features_9_conv_0_1_weight";
+    constants_info_[73].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[73].offset = 0;
+    constants_info_[73].data_size = 1536;
+    constants_info_[73].from_folded = false;
+    constants_info_[73].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[73].shape = {384};
+    constants_info_[73].stride = {1};
+    constants_info_[73].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[73].original_fqn = "mv2.features.9.conv.0.1.weight";
+    constants_info_[74].name = "mv2_features_9_conv_0_1_bias";
+    constants_info_[74].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[74].offset = 0;
+    constants_info_[74].data_size = 1536;
+    constants_info_[74].from_folded = false;
+    constants_info_[74].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[74].shape = {384};
+    constants_info_[74].stride = {1};
+    constants_info_[74].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[74].original_fqn = "mv2.features.9.conv.0.1.bias";
+    constants_info_[75].name = "mv2_features_9_conv_1_0_weight";
+    constants_info_[75].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[75].offset = 0;
+    constants_info_[75].data_size = 13824;
+    constants_info_[75].from_folded = false;
+    constants_info_[75].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[75].shape = {384, 1, 3, 3};
+    constants_info_[75].stride = {9, 9, 3, 1};
+    constants_info_[75].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[75].original_fqn = "mv2.features.9.conv.1.0.weight";
+    constants_info_[76].name = "mv2_features_9_conv_1_1_weight";
+    constants_info_[76].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[76].offset = 0;
+    constants_info_[76].data_size = 1536;
+    constants_info_[76].from_folded = false;
+    constants_info_[76].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[76].shape = {384};
+    constants_info_[76].stride = {1};
+    constants_info_[76].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[76].original_fqn = "mv2.features.9.conv.1.1.weight";
+    constants_info_[77].name = "mv2_features_9_conv_1_1_bias";
+    constants_info_[77].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[77].offset = 0;
+    constants_info_[77].data_size = 1536;
+    constants_info_[77].from_folded = false;
+    constants_info_[77].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[77].shape = {384};
+    constants_info_[77].stride = {1};
+    constants_info_[77].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[77].original_fqn = "mv2.features.9.conv.1.1.bias";
+    constants_info_[78].name = "mv2_features_9_conv_2_weight";
+    constants_info_[78].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[78].offset = 0;
+    constants_info_[78].data_size = 98304;
+    constants_info_[78].from_folded = false;
+    constants_info_[78].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[78].shape = {64, 384, 1, 1};
+    constants_info_[78].stride = {384, 1, 1, 1};
+    constants_info_[78].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[78].original_fqn = "mv2.features.9.conv.2.weight";
+    constants_info_[79].name = "mv2_features_9_conv_3_weight";
+    constants_info_[79].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[79].offset = 0;
+    constants_info_[79].data_size = 256;
+    constants_info_[79].from_folded = false;
+    constants_info_[79].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[79].shape = {64};
+    constants_info_[79].stride = {1};
+    constants_info_[79].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[79].original_fqn = "mv2.features.9.conv.3.weight";
+    constants_info_[80].name = "mv2_features_9_conv_3_bias";
+    constants_info_[80].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[80].offset = 0;
+    constants_info_[80].data_size = 256;
+    constants_info_[80].from_folded = false;
+    constants_info_[80].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[80].shape = {64};
+    constants_info_[80].stride = {1};
+    constants_info_[80].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[80].original_fqn = "mv2.features.9.conv.3.bias";
+    constants_info_[81].name = "mv2_features_10_conv_0_0_weight";
+    constants_info_[81].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[81].offset = 0;
+    constants_info_[81].data_size = 98304;
+    constants_info_[81].from_folded = false;
+    constants_info_[81].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[81].shape = {384, 64, 1, 1};
+    constants_info_[81].stride = {64, 1, 1, 1};
+    constants_info_[81].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[81].original_fqn = "mv2.features.10.conv.0.0.weight";
+    constants_info_[82].name = "mv2_features_10_conv_0_1_weight";
+    constants_info_[82].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[82].offset = 0;
+    constants_info_[82].data_size = 1536;
+    constants_info_[82].from_folded = false;
+    constants_info_[82].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[82].shape = {384};
+    constants_info_[82].stride = {1};
+    constants_info_[82].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[82].original_fqn = "mv2.features.10.conv.0.1.weight";
+    constants_info_[83].name = "mv2_features_10_conv_0_1_bias";
+    constants_info_[83].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[83].offset = 0;
+    constants_info_[83].data_size = 1536;
+    constants_info_[83].from_folded = false;
+    constants_info_[83].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[83].shape = {384};
+    constants_info_[83].stride = {1};
+    constants_info_[83].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[83].original_fqn = "mv2.features.10.conv.0.1.bias";
+    constants_info_[84].name = "mv2_features_10_conv_1_0_weight";
+    constants_info_[84].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[84].offset = 0;
+    constants_info_[84].data_size = 13824;
+    constants_info_[84].from_folded = false;
+    constants_info_[84].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[84].shape = {384, 1, 3, 3};
+    constants_info_[84].stride = {9, 9, 3, 1};
+    constants_info_[84].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[84].original_fqn = "mv2.features.10.conv.1.0.weight";
+    constants_info_[85].name = "mv2_features_10_conv_1_1_weight";
+    constants_info_[85].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[85].offset = 0;
+    constants_info_[85].data_size = 1536;
+    constants_info_[85].from_folded = false;
+    constants_info_[85].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[85].shape = {384};
+    constants_info_[85].stride = {1};
+    constants_info_[85].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[85].original_fqn = "mv2.features.10.conv.1.1.weight";
+    constants_info_[86].name = "mv2_features_10_conv_1_1_bias";
+    constants_info_[86].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[86].offset = 0;
+    constants_info_[86].data_size = 1536;
+    constants_info_[86].from_folded = false;
+    constants_info_[86].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[86].shape = {384};
+    constants_info_[86].stride = {1};
+    constants_info_[86].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[86].original_fqn = "mv2.features.10.conv.1.1.bias";
+    constants_info_[87].name = "mv2_features_10_conv_2_weight";
+    constants_info_[87].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[87].offset = 0;
+    constants_info_[87].data_size = 98304;
+    constants_info_[87].from_folded = false;
+    constants_info_[87].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[87].shape = {64, 384, 1, 1};
+    constants_info_[87].stride = {384, 1, 1, 1};
+    constants_info_[87].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[87].original_fqn = "mv2.features.10.conv.2.weight";
+    constants_info_[88].name = "mv2_features_10_conv_3_weight";
+    constants_info_[88].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[88].offset = 0;
+    constants_info_[88].data_size = 256;
+    constants_info_[88].from_folded = false;
+    constants_info_[88].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[88].shape = {64};
+    constants_info_[88].stride = {1};
+    constants_info_[88].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[88].original_fqn = "mv2.features.10.conv.3.weight";
+    constants_info_[89].name = "mv2_features_10_conv_3_bias";
+    constants_info_[89].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[89].offset = 0;
+    constants_info_[89].data_size = 256;
+    constants_info_[89].from_folded = false;
+    constants_info_[89].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[89].shape = {64};
+    constants_info_[89].stride = {1};
+    constants_info_[89].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[89].original_fqn = "mv2.features.10.conv.3.bias";
+    constants_info_[90].name = "mv2_features_11_conv_0_0_weight";
+    constants_info_[90].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[90].offset = 0;
+    constants_info_[90].data_size = 98304;
+    constants_info_[90].from_folded = false;
+    constants_info_[90].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[90].shape = {384, 64, 1, 1};
+    constants_info_[90].stride = {64, 1, 1, 1};
+    constants_info_[90].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[90].original_fqn = "mv2.features.11.conv.0.0.weight";
+    constants_info_[91].name = "mv2_features_11_conv_0_1_weight";
+    constants_info_[91].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[91].offset = 0;
+    constants_info_[91].data_size = 1536;
+    constants_info_[91].from_folded = false;
+    constants_info_[91].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[91].shape = {384};
+    constants_info_[91].stride = {1};
+    constants_info_[91].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[91].original_fqn = "mv2.features.11.conv.0.1.weight";
+    constants_info_[92].name = "mv2_features_11_conv_0_1_bias";
+    constants_info_[92].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[92].offset = 0;
+    constants_info_[92].data_size = 1536;
+    constants_info_[92].from_folded = false;
+    constants_info_[92].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[92].shape = {384};
+    constants_info_[92].stride = {1};
+    constants_info_[92].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[92].original_fqn = "mv2.features.11.conv.0.1.bias";
+    constants_info_[93].name = "mv2_features_11_conv_1_0_weight";
+    constants_info_[93].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[93].offset = 0;
+    constants_info_[93].data_size = 13824;
+    constants_info_[93].from_folded = false;
+    constants_info_[93].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[93].shape = {384, 1, 3, 3};
+    constants_info_[93].stride = {9, 9, 3, 1};
+    constants_info_[93].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[93].original_fqn = "mv2.features.11.conv.1.0.weight";
+    constants_info_[94].name = "mv2_features_11_conv_1_1_weight";
+    constants_info_[94].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[94].offset = 0;
+    constants_info_[94].data_size = 1536;
+    constants_info_[94].from_folded = false;
+    constants_info_[94].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[94].shape = {384};
+    constants_info_[94].stride = {1};
+    constants_info_[94].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[94].original_fqn = "mv2.features.11.conv.1.1.weight";
+    constants_info_[95].name = "mv2_features_11_conv_1_1_bias";
+    constants_info_[95].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[95].offset = 0;
+    constants_info_[95].data_size = 1536;
+    constants_info_[95].from_folded = false;
+    constants_info_[95].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[95].shape = {384};
+    constants_info_[95].stride = {1};
+    constants_info_[95].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[95].original_fqn = "mv2.features.11.conv.1.1.bias";
+    constants_info_[96].name = "mv2_features_11_conv_2_weight";
+    constants_info_[96].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[96].offset = 0;
+    constants_info_[96].data_size = 147456;
+    constants_info_[96].from_folded = false;
+    constants_info_[96].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[96].shape = {96, 384, 1, 1};
+    constants_info_[96].stride = {384, 1, 1, 1};
+    constants_info_[96].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[96].original_fqn = "mv2.features.11.conv.2.weight";
+    constants_info_[97].name = "mv2_features_11_conv_3_weight";
+    constants_info_[97].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[97].offset = 0;
+    constants_info_[97].data_size = 384;
+    constants_info_[97].from_folded = false;
+    constants_info_[97].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[97].shape = {96};
+    constants_info_[97].stride = {1};
+    constants_info_[97].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[97].original_fqn = "mv2.features.11.conv.3.weight";
+    constants_info_[98].name = "mv2_features_11_conv_3_bias";
+    constants_info_[98].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[98].offset = 0;
+    constants_info_[98].data_size = 384;
+    constants_info_[98].from_folded = false;
+    constants_info_[98].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[98].shape = {96};
+    constants_info_[98].stride = {1};
+    constants_info_[98].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[98].original_fqn = "mv2.features.11.conv.3.bias";
+    constants_info_[99].name = "mv2_features_12_conv_0_0_weight";
+    constants_info_[99].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[99].offset = 0;
+    constants_info_[99].data_size = 221184;
+    constants_info_[99].from_folded = false;
+    constants_info_[99].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[99].shape = {576, 96, 1, 1};
+    constants_info_[99].stride = {96, 1, 1, 1};
+    constants_info_[99].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[99].original_fqn = "mv2.features.12.conv.0.0.weight";
+    constants_info_[100].name = "mv2_features_12_conv_0_1_weight";
+    constants_info_[100].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[100].offset = 0;
+    constants_info_[100].data_size = 2304;
+    constants_info_[100].from_folded = false;
+    constants_info_[100].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[100].shape = {576};
+    constants_info_[100].stride = {1};
+    constants_info_[100].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[100].original_fqn = "mv2.features.12.conv.0.1.weight";
+    constants_info_[101].name = "mv2_features_12_conv_0_1_bias";
+    constants_info_[101].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[101].offset = 0;
+    constants_info_[101].data_size = 2304;
+    constants_info_[101].from_folded = false;
+    constants_info_[101].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[101].shape = {576};
+    constants_info_[101].stride = {1};
+    constants_info_[101].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[101].original_fqn = "mv2.features.12.conv.0.1.bias";
+    constants_info_[102].name = "mv2_features_12_conv_1_0_weight";
+    constants_info_[102].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[102].offset = 0;
+    constants_info_[102].data_size = 20736;
+    constants_info_[102].from_folded = false;
+    constants_info_[102].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[102].shape = {576, 1, 3, 3};
+    constants_info_[102].stride = {9, 9, 3, 1};
+    constants_info_[102].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[102].original_fqn = "mv2.features.12.conv.1.0.weight";
+    constants_info_[103].name = "mv2_features_12_conv_1_1_weight";
+    constants_info_[103].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[103].offset = 0;
+    constants_info_[103].data_size = 2304;
+    constants_info_[103].from_folded = false;
+    constants_info_[103].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[103].shape = {576};
+    constants_info_[103].stride = {1};
+    constants_info_[103].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[103].original_fqn = "mv2.features.12.conv.1.1.weight";
+    constants_info_[104].name = "mv2_features_12_conv_1_1_bias";
+    constants_info_[104].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[104].offset = 0;
+    constants_info_[104].data_size = 2304;
+    constants_info_[104].from_folded = false;
+    constants_info_[104].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[104].shape = {576};
+    constants_info_[104].stride = {1};
+    constants_info_[104].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[104].original_fqn = "mv2.features.12.conv.1.1.bias";
+    constants_info_[105].name = "mv2_features_12_conv_2_weight";
+    constants_info_[105].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[105].offset = 0;
+    constants_info_[105].data_size = 221184;
+    constants_info_[105].from_folded = false;
+    constants_info_[105].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[105].shape = {96, 576, 1, 1};
+    constants_info_[105].stride = {576, 1, 1, 1};
+    constants_info_[105].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[105].original_fqn = "mv2.features.12.conv.2.weight";
+    constants_info_[106].name = "mv2_features_12_conv_3_weight";
+    constants_info_[106].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[106].offset = 0;
+    constants_info_[106].data_size = 384;
+    constants_info_[106].from_folded = false;
+    constants_info_[106].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[106].shape = {96};
+    constants_info_[106].stride = {1};
+    constants_info_[106].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[106].original_fqn = "mv2.features.12.conv.3.weight";
+    constants_info_[107].name = "mv2_features_12_conv_3_bias";
+    constants_info_[107].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[107].offset = 0;
+    constants_info_[107].data_size = 384;
+    constants_info_[107].from_folded = false;
+    constants_info_[107].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[107].shape = {96};
+    constants_info_[107].stride = {1};
+    constants_info_[107].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[107].original_fqn = "mv2.features.12.conv.3.bias";
+    constants_info_[108].name = "mv2_features_13_conv_0_0_weight";
+    constants_info_[108].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[108].offset = 0;
+    constants_info_[108].data_size = 221184;
+    constants_info_[108].from_folded = false;
+    constants_info_[108].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[108].shape = {576, 96, 1, 1};
+    constants_info_[108].stride = {96, 1, 1, 1};
+    constants_info_[108].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[108].original_fqn = "mv2.features.13.conv.0.0.weight";
+    constants_info_[109].name = "mv2_features_13_conv_0_1_weight";
+    constants_info_[109].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[109].offset = 0;
+    constants_info_[109].data_size = 2304;
+    constants_info_[109].from_folded = false;
+    constants_info_[109].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[109].shape = {576};
+    constants_info_[109].stride = {1};
+    constants_info_[109].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[109].original_fqn = "mv2.features.13.conv.0.1.weight";
+    constants_info_[110].name = "mv2_features_13_conv_0_1_bias";
+    constants_info_[110].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[110].offset = 0;
+    constants_info_[110].data_size = 2304;
+    constants_info_[110].from_folded = false;
+    constants_info_[110].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[110].shape = {576};
+    constants_info_[110].stride = {1};
+    constants_info_[110].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[110].original_fqn = "mv2.features.13.conv.0.1.bias";
+    constants_info_[111].name = "mv2_features_13_conv_1_0_weight";
+    constants_info_[111].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[111].offset = 0;
+    constants_info_[111].data_size = 20736;
+    constants_info_[111].from_folded = false;
+    constants_info_[111].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[111].shape = {576, 1, 3, 3};
+    constants_info_[111].stride = {9, 9, 3, 1};
+    constants_info_[111].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[111].original_fqn = "mv2.features.13.conv.1.0.weight";
+    constants_info_[112].name = "mv2_features_13_conv_1_1_weight";
+    constants_info_[112].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[112].offset = 0;
+    constants_info_[112].data_size = 2304;
+    constants_info_[112].from_folded = false;
+    constants_info_[112].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[112].shape = {576};
+    constants_info_[112].stride = {1};
+    constants_info_[112].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[112].original_fqn = "mv2.features.13.conv.1.1.weight";
+    constants_info_[113].name = "mv2_features_13_conv_1_1_bias";
+    constants_info_[113].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[113].offset = 0;
+    constants_info_[113].data_size = 2304;
+    constants_info_[113].from_folded = false;
+    constants_info_[113].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[113].shape = {576};
+    constants_info_[113].stride = {1};
+    constants_info_[113].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[113].original_fqn = "mv2.features.13.conv.1.1.bias";
+    constants_info_[114].name = "mv2_features_13_conv_2_weight";
+    constants_info_[114].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[114].offset = 0;
+    constants_info_[114].data_size = 221184;
+    constants_info_[114].from_folded = false;
+    constants_info_[114].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[114].shape = {96, 576, 1, 1};
+    constants_info_[114].stride = {576, 1, 1, 1};
+    constants_info_[114].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[114].original_fqn = "mv2.features.13.conv.2.weight";
+    constants_info_[115].name = "mv2_features_13_conv_3_weight";
+    constants_info_[115].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[115].offset = 0;
+    constants_info_[115].data_size = 384;
+    constants_info_[115].from_folded = false;
+    constants_info_[115].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[115].shape = {96};
+    constants_info_[115].stride = {1};
+    constants_info_[115].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[115].original_fqn = "mv2.features.13.conv.3.weight";
+    constants_info_[116].name = "mv2_features_13_conv_3_bias";
+    constants_info_[116].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[116].offset = 0;
+    constants_info_[116].data_size = 384;
+    constants_info_[116].from_folded = false;
+    constants_info_[116].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+    constants_info_[116].shape = {96};
+
constants_info_[116].stride = {1}; + constants_info_[116].layout = static_cast(cached_torch_layout_strided); + constants_info_[116].original_fqn = "mv2.features.13.conv.3.bias"; + constants_info_[117].name = "mv2_features_14_conv_0_0_weight"; + constants_info_[117].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[117].offset = 0; + constants_info_[117].data_size = 221184; + constants_info_[117].from_folded = false; + constants_info_[117].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[117].shape = {576, 96, 1, 1}; + constants_info_[117].stride = {96, 1, 1, 1}; + constants_info_[117].layout = static_cast(cached_torch_layout_strided); + constants_info_[117].original_fqn = "mv2.features.14.conv.0.0.weight"; + constants_info_[118].name = "mv2_features_14_conv_0_1_weight"; + constants_info_[118].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[118].offset = 0; + constants_info_[118].data_size = 2304; + constants_info_[118].from_folded = false; + constants_info_[118].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[118].shape = {576}; + constants_info_[118].stride = {1}; + constants_info_[118].layout = static_cast(cached_torch_layout_strided); + constants_info_[118].original_fqn = "mv2.features.14.conv.0.1.weight"; + constants_info_[119].name = "mv2_features_14_conv_0_1_bias"; + constants_info_[119].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[119].offset = 0; + constants_info_[119].data_size = 2304; + constants_info_[119].from_folded = false; + constants_info_[119].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[119].shape = {576}; + constants_info_[119].stride = {1}; + constants_info_[119].layout = static_cast(cached_torch_layout_strided); + constants_info_[119].original_fqn = "mv2.features.14.conv.0.1.bias"; + constants_info_[120].name = "mv2_features_14_conv_1_0_weight"; + constants_info_[120].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[120].offset = 0; + constants_info_[120].data_size = 20736; + constants_info_[120].from_folded = false; + constants_info_[120].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[120].shape = {576, 1, 3, 3}; + constants_info_[120].stride = {9, 9, 3, 1}; + constants_info_[120].layout = static_cast(cached_torch_layout_strided); + constants_info_[120].original_fqn = "mv2.features.14.conv.1.0.weight"; + constants_info_[121].name = "mv2_features_14_conv_1_1_weight"; + constants_info_[121].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[121].offset = 0; + constants_info_[121].data_size = 2304; + constants_info_[121].from_folded = false; + constants_info_[121].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[121].shape = {576}; + constants_info_[121].stride = {1}; + constants_info_[121].layout = static_cast(cached_torch_layout_strided); + constants_info_[121].original_fqn = "mv2.features.14.conv.1.1.weight"; + constants_info_[122].name = "mv2_features_14_conv_1_1_bias"; + constants_info_[122].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[122].offset = 0; + constants_info_[122].data_size = 2304; + constants_info_[122].from_folded = false; + constants_info_[122].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[122].shape = {576}; + constants_info_[122].stride = {1}; + constants_info_[122].layout = static_cast(cached_torch_layout_strided); + 
constants_info_[122].original_fqn = "mv2.features.14.conv.1.1.bias"; + constants_info_[123].name = "mv2_features_14_conv_2_weight"; + constants_info_[123].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[123].offset = 0; + constants_info_[123].data_size = 368640; + constants_info_[123].from_folded = false; + constants_info_[123].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[123].shape = {160, 576, 1, 1}; + constants_info_[123].stride = {576, 1, 1, 1}; + constants_info_[123].layout = static_cast(cached_torch_layout_strided); + constants_info_[123].original_fqn = "mv2.features.14.conv.2.weight"; + constants_info_[124].name = "mv2_features_14_conv_3_weight"; + constants_info_[124].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[124].offset = 0; + constants_info_[124].data_size = 640; + constants_info_[124].from_folded = false; + constants_info_[124].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[124].shape = {160}; + constants_info_[124].stride = {1}; + constants_info_[124].layout = static_cast(cached_torch_layout_strided); + constants_info_[124].original_fqn = "mv2.features.14.conv.3.weight"; + constants_info_[125].name = "mv2_features_14_conv_3_bias"; + constants_info_[125].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[125].offset = 0; + constants_info_[125].data_size = 640; + constants_info_[125].from_folded = false; + constants_info_[125].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[125].shape = {160}; + constants_info_[125].stride = {1}; + constants_info_[125].layout = static_cast(cached_torch_layout_strided); + constants_info_[125].original_fqn = "mv2.features.14.conv.3.bias"; + constants_info_[126].name = "mv2_features_15_conv_0_0_weight"; + constants_info_[126].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[126].offset = 0; + constants_info_[126].data_size = 614400; + constants_info_[126].from_folded = false; + constants_info_[126].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[126].shape = {960, 160, 1, 1}; + constants_info_[126].stride = {160, 1, 1, 1}; + constants_info_[126].layout = static_cast(cached_torch_layout_strided); + constants_info_[126].original_fqn = "mv2.features.15.conv.0.0.weight"; + constants_info_[127].name = "mv2_features_15_conv_0_1_weight"; + constants_info_[127].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[127].offset = 0; + constants_info_[127].data_size = 3840; + constants_info_[127].from_folded = false; + constants_info_[127].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[127].shape = {960}; + constants_info_[127].stride = {1}; + constants_info_[127].layout = static_cast(cached_torch_layout_strided); + constants_info_[127].original_fqn = "mv2.features.15.conv.0.1.weight"; + constants_info_[128].name = "mv2_features_15_conv_0_1_bias"; + constants_info_[128].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[128].offset = 0; + constants_info_[128].data_size = 3840; + constants_info_[128].from_folded = false; + constants_info_[128].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[128].shape = {960}; + constants_info_[128].stride = {1}; + constants_info_[128].layout = static_cast(cached_torch_layout_strided); + constants_info_[128].original_fqn = "mv2.features.15.conv.0.1.bias"; + constants_info_[129].name = 
"mv2_features_15_conv_1_0_weight"; + constants_info_[129].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[129].offset = 0; + constants_info_[129].data_size = 34560; + constants_info_[129].from_folded = false; + constants_info_[129].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[129].shape = {960, 1, 3, 3}; + constants_info_[129].stride = {9, 9, 3, 1}; + constants_info_[129].layout = static_cast(cached_torch_layout_strided); + constants_info_[129].original_fqn = "mv2.features.15.conv.1.0.weight"; + constants_info_[130].name = "mv2_features_15_conv_1_1_weight"; + constants_info_[130].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[130].offset = 0; + constants_info_[130].data_size = 3840; + constants_info_[130].from_folded = false; + constants_info_[130].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[130].shape = {960}; + constants_info_[130].stride = {1}; + constants_info_[130].layout = static_cast(cached_torch_layout_strided); + constants_info_[130].original_fqn = "mv2.features.15.conv.1.1.weight"; + constants_info_[131].name = "mv2_features_15_conv_1_1_bias"; + constants_info_[131].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[131].offset = 0; + constants_info_[131].data_size = 3840; + constants_info_[131].from_folded = false; + constants_info_[131].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[131].shape = {960}; + constants_info_[131].stride = {1}; + constants_info_[131].layout = static_cast(cached_torch_layout_strided); + constants_info_[131].original_fqn = "mv2.features.15.conv.1.1.bias"; + constants_info_[132].name = "mv2_features_15_conv_2_weight"; + constants_info_[132].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[132].offset = 0; + constants_info_[132].data_size = 614400; + constants_info_[132].from_folded = false; + constants_info_[132].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[132].shape = {160, 960, 1, 1}; + constants_info_[132].stride = {960, 1, 1, 1}; + constants_info_[132].layout = static_cast(cached_torch_layout_strided); + constants_info_[132].original_fqn = "mv2.features.15.conv.2.weight"; + constants_info_[133].name = "mv2_features_15_conv_3_weight"; + constants_info_[133].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[133].offset = 0; + constants_info_[133].data_size = 640; + constants_info_[133].from_folded = false; + constants_info_[133].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[133].shape = {160}; + constants_info_[133].stride = {1}; + constants_info_[133].layout = static_cast(cached_torch_layout_strided); + constants_info_[133].original_fqn = "mv2.features.15.conv.3.weight"; + constants_info_[134].name = "mv2_features_15_conv_3_bias"; + constants_info_[134].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[134].offset = 0; + constants_info_[134].data_size = 640; + constants_info_[134].from_folded = false; + constants_info_[134].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[134].shape = {160}; + constants_info_[134].stride = {1}; + constants_info_[134].layout = static_cast(cached_torch_layout_strided); + constants_info_[134].original_fqn = "mv2.features.15.conv.3.bias"; + constants_info_[135].name = "mv2_features_16_conv_0_0_weight"; + constants_info_[135].dtype = static_cast(cached_torch_dtype_float32); + 
constants_info_[135].offset = 0; + constants_info_[135].data_size = 614400; + constants_info_[135].from_folded = false; + constants_info_[135].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[135].shape = {960, 160, 1, 1}; + constants_info_[135].stride = {160, 1, 1, 1}; + constants_info_[135].layout = static_cast(cached_torch_layout_strided); + constants_info_[135].original_fqn = "mv2.features.16.conv.0.0.weight"; + constants_info_[136].name = "mv2_features_16_conv_0_1_weight"; + constants_info_[136].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[136].offset = 0; + constants_info_[136].data_size = 3840; + constants_info_[136].from_folded = false; + constants_info_[136].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[136].shape = {960}; + constants_info_[136].stride = {1}; + constants_info_[136].layout = static_cast(cached_torch_layout_strided); + constants_info_[136].original_fqn = "mv2.features.16.conv.0.1.weight"; + constants_info_[137].name = "mv2_features_16_conv_0_1_bias"; + constants_info_[137].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[137].offset = 0; + constants_info_[137].data_size = 3840; + constants_info_[137].from_folded = false; + constants_info_[137].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[137].shape = {960}; + constants_info_[137].stride = {1}; + constants_info_[137].layout = static_cast(cached_torch_layout_strided); + constants_info_[137].original_fqn = "mv2.features.16.conv.0.1.bias"; + constants_info_[138].name = "mv2_features_16_conv_1_0_weight"; + constants_info_[138].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[138].offset = 0; + constants_info_[138].data_size = 34560; + constants_info_[138].from_folded = false; + constants_info_[138].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[138].shape = {960, 1, 3, 3}; + constants_info_[138].stride = {9, 9, 3, 1}; + constants_info_[138].layout = static_cast(cached_torch_layout_strided); + constants_info_[138].original_fqn = "mv2.features.16.conv.1.0.weight"; + constants_info_[139].name = "mv2_features_16_conv_1_1_weight"; + constants_info_[139].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[139].offset = 0; + constants_info_[139].data_size = 3840; + constants_info_[139].from_folded = false; + constants_info_[139].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[139].shape = {960}; + constants_info_[139].stride = {1}; + constants_info_[139].layout = static_cast(cached_torch_layout_strided); + constants_info_[139].original_fqn = "mv2.features.16.conv.1.1.weight"; + constants_info_[140].name = "mv2_features_16_conv_1_1_bias"; + constants_info_[140].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[140].offset = 0; + constants_info_[140].data_size = 3840; + constants_info_[140].from_folded = false; + constants_info_[140].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[140].shape = {960}; + constants_info_[140].stride = {1}; + constants_info_[140].layout = static_cast(cached_torch_layout_strided); + constants_info_[140].original_fqn = "mv2.features.16.conv.1.1.bias"; + constants_info_[141].name = "mv2_features_16_conv_2_weight"; + constants_info_[141].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[141].offset = 0; + constants_info_[141].data_size = 614400; + constants_info_[141].from_folded = 
false; + constants_info_[141].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[141].shape = {160, 960, 1, 1}; + constants_info_[141].stride = {960, 1, 1, 1}; + constants_info_[141].layout = static_cast(cached_torch_layout_strided); + constants_info_[141].original_fqn = "mv2.features.16.conv.2.weight"; + constants_info_[142].name = "mv2_features_16_conv_3_weight"; + constants_info_[142].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[142].offset = 0; + constants_info_[142].data_size = 640; + constants_info_[142].from_folded = false; + constants_info_[142].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[142].shape = {160}; + constants_info_[142].stride = {1}; + constants_info_[142].layout = static_cast(cached_torch_layout_strided); + constants_info_[142].original_fqn = "mv2.features.16.conv.3.weight"; + constants_info_[143].name = "mv2_features_16_conv_3_bias"; + constants_info_[143].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[143].offset = 0; + constants_info_[143].data_size = 640; + constants_info_[143].from_folded = false; + constants_info_[143].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[143].shape = {160}; + constants_info_[143].stride = {1}; + constants_info_[143].layout = static_cast(cached_torch_layout_strided); + constants_info_[143].original_fqn = "mv2.features.16.conv.3.bias"; + constants_info_[144].name = "mv2_features_17_conv_0_0_weight"; + constants_info_[144].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[144].offset = 0; + constants_info_[144].data_size = 614400; + constants_info_[144].from_folded = false; + constants_info_[144].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[144].shape = {960, 160, 1, 1}; + constants_info_[144].stride = {160, 1, 1, 1}; + constants_info_[144].layout = static_cast(cached_torch_layout_strided); + constants_info_[144].original_fqn = "mv2.features.17.conv.0.0.weight"; + constants_info_[145].name = "mv2_features_17_conv_0_1_weight"; + constants_info_[145].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[145].offset = 0; + constants_info_[145].data_size = 3840; + constants_info_[145].from_folded = false; + constants_info_[145].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[145].shape = {960}; + constants_info_[145].stride = {1}; + constants_info_[145].layout = static_cast(cached_torch_layout_strided); + constants_info_[145].original_fqn = "mv2.features.17.conv.0.1.weight"; + constants_info_[146].name = "mv2_features_17_conv_0_1_bias"; + constants_info_[146].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[146].offset = 0; + constants_info_[146].data_size = 3840; + constants_info_[146].from_folded = false; + constants_info_[146].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[146].shape = {960}; + constants_info_[146].stride = {1}; + constants_info_[146].layout = static_cast(cached_torch_layout_strided); + constants_info_[146].original_fqn = "mv2.features.17.conv.0.1.bias"; + constants_info_[147].name = "mv2_features_17_conv_1_0_weight"; + constants_info_[147].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[147].offset = 0; + constants_info_[147].data_size = 34560; + constants_info_[147].from_folded = false; + constants_info_[147].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + 
constants_info_[147].shape = {960, 1, 3, 3}; + constants_info_[147].stride = {9, 9, 3, 1}; + constants_info_[147].layout = static_cast(cached_torch_layout_strided); + constants_info_[147].original_fqn = "mv2.features.17.conv.1.0.weight"; + constants_info_[148].name = "mv2_features_17_conv_1_1_weight"; + constants_info_[148].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[148].offset = 0; + constants_info_[148].data_size = 3840; + constants_info_[148].from_folded = false; + constants_info_[148].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[148].shape = {960}; + constants_info_[148].stride = {1}; + constants_info_[148].layout = static_cast(cached_torch_layout_strided); + constants_info_[148].original_fqn = "mv2.features.17.conv.1.1.weight"; + constants_info_[149].name = "mv2_features_17_conv_1_1_bias"; + constants_info_[149].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[149].offset = 0; + constants_info_[149].data_size = 3840; + constants_info_[149].from_folded = false; + constants_info_[149].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[149].shape = {960}; + constants_info_[149].stride = {1}; + constants_info_[149].layout = static_cast(cached_torch_layout_strided); + constants_info_[149].original_fqn = "mv2.features.17.conv.1.1.bias"; + constants_info_[150].name = "mv2_features_17_conv_2_weight"; + constants_info_[150].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[150].offset = 0; + constants_info_[150].data_size = 1228800; + constants_info_[150].from_folded = false; + constants_info_[150].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[150].shape = {320, 960, 1, 1}; + constants_info_[150].stride = {960, 1, 1, 1}; + constants_info_[150].layout = static_cast(cached_torch_layout_strided); + constants_info_[150].original_fqn = "mv2.features.17.conv.2.weight"; + constants_info_[151].name = "mv2_features_17_conv_3_weight"; + constants_info_[151].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[151].offset = 0; + constants_info_[151].data_size = 1280; + constants_info_[151].from_folded = false; + constants_info_[151].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[151].shape = {320}; + constants_info_[151].stride = {1}; + constants_info_[151].layout = static_cast(cached_torch_layout_strided); + constants_info_[151].original_fqn = "mv2.features.17.conv.3.weight"; + constants_info_[152].name = "mv2_features_17_conv_3_bias"; + constants_info_[152].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[152].offset = 0; + constants_info_[152].data_size = 1280; + constants_info_[152].from_folded = false; + constants_info_[152].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[152].shape = {320}; + constants_info_[152].stride = {1}; + constants_info_[152].layout = static_cast(cached_torch_layout_strided); + constants_info_[152].original_fqn = "mv2.features.17.conv.3.bias"; + constants_info_[153].name = "mv2_features_18_0_weight"; + constants_info_[153].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[153].offset = 0; + constants_info_[153].data_size = 1638400; + constants_info_[153].from_folded = false; + constants_info_[153].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[153].shape = {1280, 320, 1, 1}; + constants_info_[153].stride = {320, 1, 1, 1}; + constants_info_[153].layout = 
static_cast(cached_torch_layout_strided); + constants_info_[153].original_fqn = "mv2.features.18.0.weight"; + constants_info_[154].name = "mv2_features_18_1_weight"; + constants_info_[154].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[154].offset = 0; + constants_info_[154].data_size = 5120; + constants_info_[154].from_folded = false; + constants_info_[154].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[154].shape = {1280}; + constants_info_[154].stride = {1}; + constants_info_[154].layout = static_cast(cached_torch_layout_strided); + constants_info_[154].original_fqn = "mv2.features.18.1.weight"; + constants_info_[155].name = "mv2_features_18_1_bias"; + constants_info_[155].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[155].offset = 0; + constants_info_[155].data_size = 5120; + constants_info_[155].from_folded = false; + constants_info_[155].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[155].shape = {1280}; + constants_info_[155].stride = {1}; + constants_info_[155].layout = static_cast(cached_torch_layout_strided); + constants_info_[155].original_fqn = "mv2.features.18.1.bias"; + constants_info_[156].name = "mv2_classifier_1_weight"; + constants_info_[156].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[156].offset = 0; + constants_info_[156].data_size = 5120000; + constants_info_[156].from_folded = false; + constants_info_[156].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[156].shape = {1000, 1280}; + constants_info_[156].stride = {1280, 1}; + constants_info_[156].layout = static_cast(cached_torch_layout_strided); + constants_info_[156].original_fqn = "mv2.classifier.1.weight"; + constants_info_[157].name = "mv2_classifier_1_bias"; + constants_info_[157].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[157].offset = 0; + constants_info_[157].data_size = 4000; + constants_info_[157].from_folded = false; + constants_info_[157].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[157].shape = {1000}; + constants_info_[157].stride = {1}; + constants_info_[157].layout = static_cast(cached_torch_layout_strided); + constants_info_[157].original_fqn = "mv2.classifier.1.bias"; + constants_info_[158].name = "mv2_features_0_1_running_mean"; + constants_info_[158].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[158].offset = 0; + constants_info_[158].data_size = 128; + constants_info_[158].from_folded = false; + constants_info_[158].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[158].shape = {32}; + constants_info_[158].stride = {1}; + constants_info_[158].layout = static_cast(cached_torch_layout_strided); + constants_info_[158].original_fqn = "mv2.features.0.1.running_mean"; + constants_info_[159].name = "mv2_features_0_1_running_var"; + constants_info_[159].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[159].offset = 0; + constants_info_[159].data_size = 128; + constants_info_[159].from_folded = false; + constants_info_[159].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[159].shape = {32}; + constants_info_[159].stride = {1}; + constants_info_[159].layout = static_cast(cached_torch_layout_strided); + constants_info_[159].original_fqn = "mv2.features.0.1.running_var"; + constants_info_[160].name = "mv2_features_1_conv_0_1_running_mean"; + constants_info_[160].dtype = 
static_cast(cached_torch_dtype_float32); + constants_info_[160].offset = 0; + constants_info_[160].data_size = 128; + constants_info_[160].from_folded = false; + constants_info_[160].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[160].shape = {32}; + constants_info_[160].stride = {1}; + constants_info_[160].layout = static_cast(cached_torch_layout_strided); + constants_info_[160].original_fqn = "mv2.features.1.conv.0.1.running_mean"; + constants_info_[161].name = "mv2_features_1_conv_0_1_running_var"; + constants_info_[161].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[161].offset = 0; + constants_info_[161].data_size = 128; + constants_info_[161].from_folded = false; + constants_info_[161].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[161].shape = {32}; + constants_info_[161].stride = {1}; + constants_info_[161].layout = static_cast(cached_torch_layout_strided); + constants_info_[161].original_fqn = "mv2.features.1.conv.0.1.running_var"; + constants_info_[162].name = "mv2_features_1_conv_2_running_mean"; + constants_info_[162].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[162].offset = 0; + constants_info_[162].data_size = 64; + constants_info_[162].from_folded = false; + constants_info_[162].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[162].shape = {16}; + constants_info_[162].stride = {1}; + constants_info_[162].layout = static_cast(cached_torch_layout_strided); + constants_info_[162].original_fqn = "mv2.features.1.conv.2.running_mean"; + constants_info_[163].name = "mv2_features_1_conv_2_running_var"; + constants_info_[163].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[163].offset = 0; + constants_info_[163].data_size = 64; + constants_info_[163].from_folded = false; + constants_info_[163].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[163].shape = {16}; + constants_info_[163].stride = {1}; + constants_info_[163].layout = static_cast(cached_torch_layout_strided); + constants_info_[163].original_fqn = "mv2.features.1.conv.2.running_var"; + constants_info_[164].name = "mv2_features_2_conv_0_1_running_mean"; + constants_info_[164].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[164].offset = 0; + constants_info_[164].data_size = 384; + constants_info_[164].from_folded = false; + constants_info_[164].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[164].shape = {96}; + constants_info_[164].stride = {1}; + constants_info_[164].layout = static_cast(cached_torch_layout_strided); + constants_info_[164].original_fqn = "mv2.features.2.conv.0.1.running_mean"; + constants_info_[165].name = "mv2_features_2_conv_0_1_running_var"; + constants_info_[165].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[165].offset = 0; + constants_info_[165].data_size = 384; + constants_info_[165].from_folded = false; + constants_info_[165].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[165].shape = {96}; + constants_info_[165].stride = {1}; + constants_info_[165].layout = static_cast(cached_torch_layout_strided); + constants_info_[165].original_fqn = "mv2.features.2.conv.0.1.running_var"; + constants_info_[166].name = "mv2_features_2_conv_1_1_running_mean"; + constants_info_[166].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[166].offset = 0; + constants_info_[166].data_size = 384; + 
constants_info_[166].from_folded = false; + constants_info_[166].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[166].shape = {96}; + constants_info_[166].stride = {1}; + constants_info_[166].layout = static_cast(cached_torch_layout_strided); + constants_info_[166].original_fqn = "mv2.features.2.conv.1.1.running_mean"; + constants_info_[167].name = "mv2_features_2_conv_1_1_running_var"; + constants_info_[167].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[167].offset = 0; + constants_info_[167].data_size = 384; + constants_info_[167].from_folded = false; + constants_info_[167].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[167].shape = {96}; + constants_info_[167].stride = {1}; + constants_info_[167].layout = static_cast(cached_torch_layout_strided); + constants_info_[167].original_fqn = "mv2.features.2.conv.1.1.running_var"; + constants_info_[168].name = "mv2_features_2_conv_3_running_mean"; + constants_info_[168].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[168].offset = 0; + constants_info_[168].data_size = 96; + constants_info_[168].from_folded = false; + constants_info_[168].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[168].shape = {24}; + constants_info_[168].stride = {1}; + constants_info_[168].layout = static_cast(cached_torch_layout_strided); + constants_info_[168].original_fqn = "mv2.features.2.conv.3.running_mean"; + constants_info_[169].name = "mv2_features_2_conv_3_running_var"; + constants_info_[169].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[169].offset = 0; + constants_info_[169].data_size = 96; + constants_info_[169].from_folded = false; + constants_info_[169].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[169].shape = {24}; + constants_info_[169].stride = {1}; + constants_info_[169].layout = static_cast(cached_torch_layout_strided); + constants_info_[169].original_fqn = "mv2.features.2.conv.3.running_var"; + constants_info_[170].name = "mv2_features_3_conv_0_1_running_mean"; + constants_info_[170].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[170].offset = 0; + constants_info_[170].data_size = 576; + constants_info_[170].from_folded = false; + constants_info_[170].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[170].shape = {144}; + constants_info_[170].stride = {1}; + constants_info_[170].layout = static_cast(cached_torch_layout_strided); + constants_info_[170].original_fqn = "mv2.features.3.conv.0.1.running_mean"; + constants_info_[171].name = "mv2_features_3_conv_0_1_running_var"; + constants_info_[171].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[171].offset = 0; + constants_info_[171].data_size = 576; + constants_info_[171].from_folded = false; + constants_info_[171].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[171].shape = {144}; + constants_info_[171].stride = {1}; + constants_info_[171].layout = static_cast(cached_torch_layout_strided); + constants_info_[171].original_fqn = "mv2.features.3.conv.0.1.running_var"; + constants_info_[172].name = "mv2_features_3_conv_1_1_running_mean"; + constants_info_[172].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[172].offset = 0; + constants_info_[172].data_size = 576; + constants_info_[172].from_folded = false; + constants_info_[172].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + 
constants_info_[172].shape = {144}; + constants_info_[172].stride = {1}; + constants_info_[172].layout = static_cast(cached_torch_layout_strided); + constants_info_[172].original_fqn = "mv2.features.3.conv.1.1.running_mean"; + constants_info_[173].name = "mv2_features_3_conv_1_1_running_var"; + constants_info_[173].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[173].offset = 0; + constants_info_[173].data_size = 576; + constants_info_[173].from_folded = false; + constants_info_[173].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[173].shape = {144}; + constants_info_[173].stride = {1}; + constants_info_[173].layout = static_cast(cached_torch_layout_strided); + constants_info_[173].original_fqn = "mv2.features.3.conv.1.1.running_var"; + constants_info_[174].name = "mv2_features_3_conv_3_running_mean"; + constants_info_[174].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[174].offset = 0; + constants_info_[174].data_size = 96; + constants_info_[174].from_folded = false; + constants_info_[174].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[174].shape = {24}; + constants_info_[174].stride = {1}; + constants_info_[174].layout = static_cast(cached_torch_layout_strided); + constants_info_[174].original_fqn = "mv2.features.3.conv.3.running_mean"; + constants_info_[175].name = "mv2_features_3_conv_3_running_var"; + constants_info_[175].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[175].offset = 0; + constants_info_[175].data_size = 96; + constants_info_[175].from_folded = false; + constants_info_[175].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[175].shape = {24}; + constants_info_[175].stride = {1}; + constants_info_[175].layout = static_cast(cached_torch_layout_strided); + constants_info_[175].original_fqn = "mv2.features.3.conv.3.running_var"; + constants_info_[176].name = "mv2_features_4_conv_0_1_running_mean"; + constants_info_[176].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[176].offset = 0; + constants_info_[176].data_size = 576; + constants_info_[176].from_folded = false; + constants_info_[176].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[176].shape = {144}; + constants_info_[176].stride = {1}; + constants_info_[176].layout = static_cast(cached_torch_layout_strided); + constants_info_[176].original_fqn = "mv2.features.4.conv.0.1.running_mean"; + constants_info_[177].name = "mv2_features_4_conv_0_1_running_var"; + constants_info_[177].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[177].offset = 0; + constants_info_[177].data_size = 576; + constants_info_[177].from_folded = false; + constants_info_[177].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[177].shape = {144}; + constants_info_[177].stride = {1}; + constants_info_[177].layout = static_cast(cached_torch_layout_strided); + constants_info_[177].original_fqn = "mv2.features.4.conv.0.1.running_var"; + constants_info_[178].name = "mv2_features_4_conv_1_1_running_mean"; + constants_info_[178].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[178].offset = 0; + constants_info_[178].data_size = 576; + constants_info_[178].from_folded = false; + constants_info_[178].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[178].shape = {144}; + constants_info_[178].stride = {1}; + constants_info_[178].layout = 
static_cast(cached_torch_layout_strided); + constants_info_[178].original_fqn = "mv2.features.4.conv.1.1.running_mean"; + constants_info_[179].name = "mv2_features_4_conv_1_1_running_var"; + constants_info_[179].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[179].offset = 0; + constants_info_[179].data_size = 576; + constants_info_[179].from_folded = false; + constants_info_[179].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[179].shape = {144}; + constants_info_[179].stride = {1}; + constants_info_[179].layout = static_cast(cached_torch_layout_strided); + constants_info_[179].original_fqn = "mv2.features.4.conv.1.1.running_var"; + constants_info_[180].name = "mv2_features_4_conv_3_running_mean"; + constants_info_[180].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[180].offset = 0; + constants_info_[180].data_size = 128; + constants_info_[180].from_folded = false; + constants_info_[180].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[180].shape = {32}; + constants_info_[180].stride = {1}; + constants_info_[180].layout = static_cast(cached_torch_layout_strided); + constants_info_[180].original_fqn = "mv2.features.4.conv.3.running_mean"; + constants_info_[181].name = "mv2_features_4_conv_3_running_var"; + constants_info_[181].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[181].offset = 0; + constants_info_[181].data_size = 128; + constants_info_[181].from_folded = false; + constants_info_[181].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[181].shape = {32}; + constants_info_[181].stride = {1}; + constants_info_[181].layout = static_cast(cached_torch_layout_strided); + constants_info_[181].original_fqn = "mv2.features.4.conv.3.running_var"; + constants_info_[182].name = "mv2_features_5_conv_0_1_running_mean"; + constants_info_[182].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[182].offset = 0; + constants_info_[182].data_size = 768; + constants_info_[182].from_folded = false; + constants_info_[182].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[182].shape = {192}; + constants_info_[182].stride = {1}; + constants_info_[182].layout = static_cast(cached_torch_layout_strided); + constants_info_[182].original_fqn = "mv2.features.5.conv.0.1.running_mean"; + constants_info_[183].name = "mv2_features_5_conv_0_1_running_var"; + constants_info_[183].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[183].offset = 0; + constants_info_[183].data_size = 768; + constants_info_[183].from_folded = false; + constants_info_[183].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[183].shape = {192}; + constants_info_[183].stride = {1}; + constants_info_[183].layout = static_cast(cached_torch_layout_strided); + constants_info_[183].original_fqn = "mv2.features.5.conv.0.1.running_var"; + constants_info_[184].name = "mv2_features_5_conv_1_1_running_mean"; + constants_info_[184].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[184].offset = 0; + constants_info_[184].data_size = 768; + constants_info_[184].from_folded = false; + constants_info_[184].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[184].shape = {192}; + constants_info_[184].stride = {1}; + constants_info_[184].layout = static_cast(cached_torch_layout_strided); + constants_info_[184].original_fqn = "mv2.features.5.conv.1.1.running_mean"; + 
constants_info_[185].name = "mv2_features_5_conv_1_1_running_var"; + constants_info_[185].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[185].offset = 0; + constants_info_[185].data_size = 768; + constants_info_[185].from_folded = false; + constants_info_[185].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[185].shape = {192}; + constants_info_[185].stride = {1}; + constants_info_[185].layout = static_cast(cached_torch_layout_strided); + constants_info_[185].original_fqn = "mv2.features.5.conv.1.1.running_var"; + constants_info_[186].name = "mv2_features_5_conv_3_running_mean"; + constants_info_[186].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[186].offset = 0; + constants_info_[186].data_size = 128; + constants_info_[186].from_folded = false; + constants_info_[186].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[186].shape = {32}; + constants_info_[186].stride = {1}; + constants_info_[186].layout = static_cast(cached_torch_layout_strided); + constants_info_[186].original_fqn = "mv2.features.5.conv.3.running_mean"; + constants_info_[187].name = "mv2_features_5_conv_3_running_var"; + constants_info_[187].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[187].offset = 0; + constants_info_[187].data_size = 128; + constants_info_[187].from_folded = false; + constants_info_[187].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[187].shape = {32}; + constants_info_[187].stride = {1}; + constants_info_[187].layout = static_cast(cached_torch_layout_strided); + constants_info_[187].original_fqn = "mv2.features.5.conv.3.running_var"; + constants_info_[188].name = "mv2_features_6_conv_0_1_running_mean"; + constants_info_[188].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[188].offset = 0; + constants_info_[188].data_size = 768; + constants_info_[188].from_folded = false; + constants_info_[188].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[188].shape = {192}; + constants_info_[188].stride = {1}; + constants_info_[188].layout = static_cast(cached_torch_layout_strided); + constants_info_[188].original_fqn = "mv2.features.6.conv.0.1.running_mean"; + constants_info_[189].name = "mv2_features_6_conv_0_1_running_var"; + constants_info_[189].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[189].offset = 0; + constants_info_[189].data_size = 768; + constants_info_[189].from_folded = false; + constants_info_[189].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[189].shape = {192}; + constants_info_[189].stride = {1}; + constants_info_[189].layout = static_cast(cached_torch_layout_strided); + constants_info_[189].original_fqn = "mv2.features.6.conv.0.1.running_var"; + constants_info_[190].name = "mv2_features_6_conv_1_1_running_mean"; + constants_info_[190].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[190].offset = 0; + constants_info_[190].data_size = 768; + constants_info_[190].from_folded = false; + constants_info_[190].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[190].shape = {192}; + constants_info_[190].stride = {1}; + constants_info_[190].layout = static_cast(cached_torch_layout_strided); + constants_info_[190].original_fqn = "mv2.features.6.conv.1.1.running_mean"; + constants_info_[191].name = "mv2_features_6_conv_1_1_running_var"; + constants_info_[191].dtype = 
static_cast(cached_torch_dtype_float32); + constants_info_[191].offset = 0; + constants_info_[191].data_size = 768; + constants_info_[191].from_folded = false; + constants_info_[191].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[191].shape = {192}; + constants_info_[191].stride = {1}; + constants_info_[191].layout = static_cast(cached_torch_layout_strided); + constants_info_[191].original_fqn = "mv2.features.6.conv.1.1.running_var"; + constants_info_[192].name = "mv2_features_6_conv_3_running_mean"; + constants_info_[192].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[192].offset = 0; + constants_info_[192].data_size = 128; + constants_info_[192].from_folded = false; + constants_info_[192].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[192].shape = {32}; + constants_info_[192].stride = {1}; + constants_info_[192].layout = static_cast(cached_torch_layout_strided); + constants_info_[192].original_fqn = "mv2.features.6.conv.3.running_mean"; + constants_info_[193].name = "mv2_features_6_conv_3_running_var"; + constants_info_[193].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[193].offset = 0; + constants_info_[193].data_size = 128; + constants_info_[193].from_folded = false; + constants_info_[193].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[193].shape = {32}; + constants_info_[193].stride = {1}; + constants_info_[193].layout = static_cast(cached_torch_layout_strided); + constants_info_[193].original_fqn = "mv2.features.6.conv.3.running_var"; + constants_info_[194].name = "mv2_features_7_conv_0_1_running_mean"; + constants_info_[194].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[194].offset = 0; + constants_info_[194].data_size = 768; + constants_info_[194].from_folded = false; + constants_info_[194].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[194].shape = {192}; + constants_info_[194].stride = {1}; + constants_info_[194].layout = static_cast(cached_torch_layout_strided); + constants_info_[194].original_fqn = "mv2.features.7.conv.0.1.running_mean"; + constants_info_[195].name = "mv2_features_7_conv_0_1_running_var"; + constants_info_[195].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[195].offset = 0; + constants_info_[195].data_size = 768; + constants_info_[195].from_folded = false; + constants_info_[195].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[195].shape = {192}; + constants_info_[195].stride = {1}; + constants_info_[195].layout = static_cast(cached_torch_layout_strided); + constants_info_[195].original_fqn = "mv2.features.7.conv.0.1.running_var"; + constants_info_[196].name = "mv2_features_7_conv_1_1_running_mean"; + constants_info_[196].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[196].offset = 0; + constants_info_[196].data_size = 768; + constants_info_[196].from_folded = false; + constants_info_[196].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[196].shape = {192}; + constants_info_[196].stride = {1}; + constants_info_[196].layout = static_cast(cached_torch_layout_strided); + constants_info_[196].original_fqn = "mv2.features.7.conv.1.1.running_mean"; + constants_info_[197].name = "mv2_features_7_conv_1_1_running_var"; + constants_info_[197].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[197].offset = 0; + constants_info_[197].data_size = 768; + 
constants_info_[197].from_folded = false; + constants_info_[197].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[197].shape = {192}; + constants_info_[197].stride = {1}; + constants_info_[197].layout = static_cast(cached_torch_layout_strided); + constants_info_[197].original_fqn = "mv2.features.7.conv.1.1.running_var"; + constants_info_[198].name = "mv2_features_7_conv_3_running_mean"; + constants_info_[198].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[198].offset = 0; + constants_info_[198].data_size = 256; + constants_info_[198].from_folded = false; + constants_info_[198].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[198].shape = {64}; + constants_info_[198].stride = {1}; + constants_info_[198].layout = static_cast(cached_torch_layout_strided); + constants_info_[198].original_fqn = "mv2.features.7.conv.3.running_mean"; + constants_info_[199].name = "mv2_features_7_conv_3_running_var"; + constants_info_[199].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[199].offset = 0; + constants_info_[199].data_size = 256; + constants_info_[199].from_folded = false; + constants_info_[199].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[199].shape = {64}; + constants_info_[199].stride = {1}; + constants_info_[199].layout = static_cast(cached_torch_layout_strided); + constants_info_[199].original_fqn = "mv2.features.7.conv.3.running_var"; + constants_info_[200].name = "mv2_features_8_conv_0_1_running_mean"; + constants_info_[200].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[200].offset = 0; + constants_info_[200].data_size = 1536; + constants_info_[200].from_folded = false; + constants_info_[200].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[200].shape = {384}; + constants_info_[200].stride = {1}; + constants_info_[200].layout = static_cast(cached_torch_layout_strided); + constants_info_[200].original_fqn = "mv2.features.8.conv.0.1.running_mean"; + constants_info_[201].name = "mv2_features_8_conv_0_1_running_var"; + constants_info_[201].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[201].offset = 0; + constants_info_[201].data_size = 1536; + constants_info_[201].from_folded = false; + constants_info_[201].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[201].shape = {384}; + constants_info_[201].stride = {1}; + constants_info_[201].layout = static_cast(cached_torch_layout_strided); + constants_info_[201].original_fqn = "mv2.features.8.conv.0.1.running_var"; + constants_info_[202].name = "mv2_features_8_conv_1_1_running_mean"; + constants_info_[202].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[202].offset = 0; + constants_info_[202].data_size = 1536; + constants_info_[202].from_folded = false; + constants_info_[202].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[202].shape = {384}; + constants_info_[202].stride = {1}; + constants_info_[202].layout = static_cast(cached_torch_layout_strided); + constants_info_[202].original_fqn = "mv2.features.8.conv.1.1.running_mean"; + constants_info_[203].name = "mv2_features_8_conv_1_1_running_var"; + constants_info_[203].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[203].offset = 0; + constants_info_[203].data_size = 1536; + constants_info_[203].from_folded = false; + constants_info_[203].type = 
static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[203].shape = {384}; + constants_info_[203].stride = {1}; + constants_info_[203].layout = static_cast(cached_torch_layout_strided); + constants_info_[203].original_fqn = "mv2.features.8.conv.1.1.running_var"; + constants_info_[204].name = "mv2_features_8_conv_3_running_mean"; + constants_info_[204].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[204].offset = 0; + constants_info_[204].data_size = 256; + constants_info_[204].from_folded = false; + constants_info_[204].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[204].shape = {64}; + constants_info_[204].stride = {1}; + constants_info_[204].layout = static_cast(cached_torch_layout_strided); + constants_info_[204].original_fqn = "mv2.features.8.conv.3.running_mean"; + constants_info_[205].name = "mv2_features_8_conv_3_running_var"; + constants_info_[205].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[205].offset = 0; + constants_info_[205].data_size = 256; + constants_info_[205].from_folded = false; + constants_info_[205].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[205].shape = {64}; + constants_info_[205].stride = {1}; + constants_info_[205].layout = static_cast(cached_torch_layout_strided); + constants_info_[205].original_fqn = "mv2.features.8.conv.3.running_var"; + constants_info_[206].name = "mv2_features_9_conv_0_1_running_mean"; + constants_info_[206].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[206].offset = 0; + constants_info_[206].data_size = 1536; + constants_info_[206].from_folded = false; + constants_info_[206].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[206].shape = {384}; + constants_info_[206].stride = {1}; + constants_info_[206].layout = static_cast(cached_torch_layout_strided); + constants_info_[206].original_fqn = "mv2.features.9.conv.0.1.running_mean"; + constants_info_[207].name = "mv2_features_9_conv_0_1_running_var"; + constants_info_[207].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[207].offset = 0; + constants_info_[207].data_size = 1536; + constants_info_[207].from_folded = false; + constants_info_[207].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[207].shape = {384}; + constants_info_[207].stride = {1}; + constants_info_[207].layout = static_cast(cached_torch_layout_strided); + constants_info_[207].original_fqn = "mv2.features.9.conv.0.1.running_var"; + constants_info_[208].name = "mv2_features_9_conv_1_1_running_mean"; + constants_info_[208].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[208].offset = 0; + constants_info_[208].data_size = 1536; + constants_info_[208].from_folded = false; + constants_info_[208].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[208].shape = {384}; + constants_info_[208].stride = {1}; + constants_info_[208].layout = static_cast(cached_torch_layout_strided); + constants_info_[208].original_fqn = "mv2.features.9.conv.1.1.running_mean"; + constants_info_[209].name = "mv2_features_9_conv_1_1_running_var"; + constants_info_[209].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[209].offset = 0; + constants_info_[209].data_size = 1536; + constants_info_[209].from_folded = false; + constants_info_[209].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[209].shape = {384}; + 
constants_info_[209].stride = {1}; + constants_info_[209].layout = static_cast(cached_torch_layout_strided); + constants_info_[209].original_fqn = "mv2.features.9.conv.1.1.running_var"; + constants_info_[210].name = "mv2_features_9_conv_3_running_mean"; + constants_info_[210].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[210].offset = 0; + constants_info_[210].data_size = 256; + constants_info_[210].from_folded = false; + constants_info_[210].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[210].shape = {64}; + constants_info_[210].stride = {1}; + constants_info_[210].layout = static_cast(cached_torch_layout_strided); + constants_info_[210].original_fqn = "mv2.features.9.conv.3.running_mean"; + constants_info_[211].name = "mv2_features_9_conv_3_running_var"; + constants_info_[211].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[211].offset = 0; + constants_info_[211].data_size = 256; + constants_info_[211].from_folded = false; + constants_info_[211].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[211].shape = {64}; + constants_info_[211].stride = {1}; + constants_info_[211].layout = static_cast(cached_torch_layout_strided); + constants_info_[211].original_fqn = "mv2.features.9.conv.3.running_var"; + constants_info_[212].name = "mv2_features_10_conv_0_1_running_mean"; + constants_info_[212].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[212].offset = 0; + constants_info_[212].data_size = 1536; + constants_info_[212].from_folded = false; + constants_info_[212].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[212].shape = {384}; + constants_info_[212].stride = {1}; + constants_info_[212].layout = static_cast(cached_torch_layout_strided); + constants_info_[212].original_fqn = "mv2.features.10.conv.0.1.running_mean"; + constants_info_[213].name = "mv2_features_10_conv_0_1_running_var"; + constants_info_[213].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[213].offset = 0; + constants_info_[213].data_size = 1536; + constants_info_[213].from_folded = false; + constants_info_[213].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[213].shape = {384}; + constants_info_[213].stride = {1}; + constants_info_[213].layout = static_cast(cached_torch_layout_strided); + constants_info_[213].original_fqn = "mv2.features.10.conv.0.1.running_var"; + constants_info_[214].name = "mv2_features_10_conv_1_1_running_mean"; + constants_info_[214].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[214].offset = 0; + constants_info_[214].data_size = 1536; + constants_info_[214].from_folded = false; + constants_info_[214].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[214].shape = {384}; + constants_info_[214].stride = {1}; + constants_info_[214].layout = static_cast(cached_torch_layout_strided); + constants_info_[214].original_fqn = "mv2.features.10.conv.1.1.running_mean"; + constants_info_[215].name = "mv2_features_10_conv_1_1_running_var"; + constants_info_[215].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[215].offset = 0; + constants_info_[215].data_size = 1536; + constants_info_[215].from_folded = false; + constants_info_[215].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[215].shape = {384}; + constants_info_[215].stride = {1}; + constants_info_[215].layout = static_cast(cached_torch_layout_strided); + 
constants_info_[215].original_fqn = "mv2.features.10.conv.1.1.running_var"; + constants_info_[216].name = "mv2_features_10_conv_3_running_mean"; + constants_info_[216].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[216].offset = 0; + constants_info_[216].data_size = 256; + constants_info_[216].from_folded = false; + constants_info_[216].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[216].shape = {64}; + constants_info_[216].stride = {1}; + constants_info_[216].layout = static_cast(cached_torch_layout_strided); + constants_info_[216].original_fqn = "mv2.features.10.conv.3.running_mean"; + constants_info_[217].name = "mv2_features_10_conv_3_running_var"; + constants_info_[217].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[217].offset = 0; + constants_info_[217].data_size = 256; + constants_info_[217].from_folded = false; + constants_info_[217].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[217].shape = {64}; + constants_info_[217].stride = {1}; + constants_info_[217].layout = static_cast(cached_torch_layout_strided); + constants_info_[217].original_fqn = "mv2.features.10.conv.3.running_var"; + constants_info_[218].name = "mv2_features_11_conv_0_1_running_mean"; + constants_info_[218].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[218].offset = 0; + constants_info_[218].data_size = 1536; + constants_info_[218].from_folded = false; + constants_info_[218].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[218].shape = {384}; + constants_info_[218].stride = {1}; + constants_info_[218].layout = static_cast(cached_torch_layout_strided); + constants_info_[218].original_fqn = "mv2.features.11.conv.0.1.running_mean"; + constants_info_[219].name = "mv2_features_11_conv_0_1_running_var"; + constants_info_[219].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[219].offset = 0; + constants_info_[219].data_size = 1536; + constants_info_[219].from_folded = false; + constants_info_[219].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[219].shape = {384}; + constants_info_[219].stride = {1}; + constants_info_[219].layout = static_cast(cached_torch_layout_strided); + constants_info_[219].original_fqn = "mv2.features.11.conv.0.1.running_var"; + constants_info_[220].name = "mv2_features_11_conv_1_1_running_mean"; + constants_info_[220].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[220].offset = 0; + constants_info_[220].data_size = 1536; + constants_info_[220].from_folded = false; + constants_info_[220].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[220].shape = {384}; + constants_info_[220].stride = {1}; + constants_info_[220].layout = static_cast(cached_torch_layout_strided); + constants_info_[220].original_fqn = "mv2.features.11.conv.1.1.running_mean"; + constants_info_[221].name = "mv2_features_11_conv_1_1_running_var"; + constants_info_[221].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[221].offset = 0; + constants_info_[221].data_size = 1536; + constants_info_[221].from_folded = false; + constants_info_[221].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[221].shape = {384}; + constants_info_[221].stride = {1}; + constants_info_[221].layout = static_cast(cached_torch_layout_strided); + constants_info_[221].original_fqn = "mv2.features.11.conv.1.1.running_var"; + constants_info_[222].name = 
"mv2_features_11_conv_3_running_mean"; + constants_info_[222].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[222].offset = 0; + constants_info_[222].data_size = 384; + constants_info_[222].from_folded = false; + constants_info_[222].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[222].shape = {96}; + constants_info_[222].stride = {1}; + constants_info_[222].layout = static_cast(cached_torch_layout_strided); + constants_info_[222].original_fqn = "mv2.features.11.conv.3.running_mean"; + constants_info_[223].name = "mv2_features_11_conv_3_running_var"; + constants_info_[223].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[223].offset = 0; + constants_info_[223].data_size = 384; + constants_info_[223].from_folded = false; + constants_info_[223].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[223].shape = {96}; + constants_info_[223].stride = {1}; + constants_info_[223].layout = static_cast(cached_torch_layout_strided); + constants_info_[223].original_fqn = "mv2.features.11.conv.3.running_var"; + constants_info_[224].name = "mv2_features_12_conv_0_1_running_mean"; + constants_info_[224].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[224].offset = 0; + constants_info_[224].data_size = 2304; + constants_info_[224].from_folded = false; + constants_info_[224].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[224].shape = {576}; + constants_info_[224].stride = {1}; + constants_info_[224].layout = static_cast(cached_torch_layout_strided); + constants_info_[224].original_fqn = "mv2.features.12.conv.0.1.running_mean"; + constants_info_[225].name = "mv2_features_12_conv_0_1_running_var"; + constants_info_[225].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[225].offset = 0; + constants_info_[225].data_size = 2304; + constants_info_[225].from_folded = false; + constants_info_[225].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[225].shape = {576}; + constants_info_[225].stride = {1}; + constants_info_[225].layout = static_cast(cached_torch_layout_strided); + constants_info_[225].original_fqn = "mv2.features.12.conv.0.1.running_var"; + constants_info_[226].name = "mv2_features_12_conv_1_1_running_mean"; + constants_info_[226].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[226].offset = 0; + constants_info_[226].data_size = 2304; + constants_info_[226].from_folded = false; + constants_info_[226].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[226].shape = {576}; + constants_info_[226].stride = {1}; + constants_info_[226].layout = static_cast(cached_torch_layout_strided); + constants_info_[226].original_fqn = "mv2.features.12.conv.1.1.running_mean"; + constants_info_[227].name = "mv2_features_12_conv_1_1_running_var"; + constants_info_[227].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[227].offset = 0; + constants_info_[227].data_size = 2304; + constants_info_[227].from_folded = false; + constants_info_[227].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[227].shape = {576}; + constants_info_[227].stride = {1}; + constants_info_[227].layout = static_cast(cached_torch_layout_strided); + constants_info_[227].original_fqn = "mv2.features.12.conv.1.1.running_var"; + constants_info_[228].name = "mv2_features_12_conv_3_running_mean"; + constants_info_[228].dtype = static_cast(cached_torch_dtype_float32); 
+ constants_info_[228].offset = 0; + constants_info_[228].data_size = 384; + constants_info_[228].from_folded = false; + constants_info_[228].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[228].shape = {96}; + constants_info_[228].stride = {1}; + constants_info_[228].layout = static_cast(cached_torch_layout_strided); + constants_info_[228].original_fqn = "mv2.features.12.conv.3.running_mean"; + constants_info_[229].name = "mv2_features_12_conv_3_running_var"; + constants_info_[229].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[229].offset = 0; + constants_info_[229].data_size = 384; + constants_info_[229].from_folded = false; + constants_info_[229].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[229].shape = {96}; + constants_info_[229].stride = {1}; + constants_info_[229].layout = static_cast(cached_torch_layout_strided); + constants_info_[229].original_fqn = "mv2.features.12.conv.3.running_var"; + constants_info_[230].name = "mv2_features_13_conv_0_1_running_mean"; + constants_info_[230].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[230].offset = 0; + constants_info_[230].data_size = 2304; + constants_info_[230].from_folded = false; + constants_info_[230].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[230].shape = {576}; + constants_info_[230].stride = {1}; + constants_info_[230].layout = static_cast(cached_torch_layout_strided); + constants_info_[230].original_fqn = "mv2.features.13.conv.0.1.running_mean"; + constants_info_[231].name = "mv2_features_13_conv_0_1_running_var"; + constants_info_[231].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[231].offset = 0; + constants_info_[231].data_size = 2304; + constants_info_[231].from_folded = false; + constants_info_[231].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[231].shape = {576}; + constants_info_[231].stride = {1}; + constants_info_[231].layout = static_cast(cached_torch_layout_strided); + constants_info_[231].original_fqn = "mv2.features.13.conv.0.1.running_var"; + constants_info_[232].name = "mv2_features_13_conv_1_1_running_mean"; + constants_info_[232].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[232].offset = 0; + constants_info_[232].data_size = 2304; + constants_info_[232].from_folded = false; + constants_info_[232].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[232].shape = {576}; + constants_info_[232].stride = {1}; + constants_info_[232].layout = static_cast(cached_torch_layout_strided); + constants_info_[232].original_fqn = "mv2.features.13.conv.1.1.running_mean"; + constants_info_[233].name = "mv2_features_13_conv_1_1_running_var"; + constants_info_[233].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[233].offset = 0; + constants_info_[233].data_size = 2304; + constants_info_[233].from_folded = false; + constants_info_[233].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[233].shape = {576}; + constants_info_[233].stride = {1}; + constants_info_[233].layout = static_cast(cached_torch_layout_strided); + constants_info_[233].original_fqn = "mv2.features.13.conv.1.1.running_var"; + constants_info_[234].name = "mv2_features_13_conv_3_running_mean"; + constants_info_[234].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[234].offset = 0; + constants_info_[234].data_size = 384; + constants_info_[234].from_folded 
= false; + constants_info_[234].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[234].shape = {96}; + constants_info_[234].stride = {1}; + constants_info_[234].layout = static_cast(cached_torch_layout_strided); + constants_info_[234].original_fqn = "mv2.features.13.conv.3.running_mean"; + constants_info_[235].name = "mv2_features_13_conv_3_running_var"; + constants_info_[235].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[235].offset = 0; + constants_info_[235].data_size = 384; + constants_info_[235].from_folded = false; + constants_info_[235].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[235].shape = {96}; + constants_info_[235].stride = {1}; + constants_info_[235].layout = static_cast(cached_torch_layout_strided); + constants_info_[235].original_fqn = "mv2.features.13.conv.3.running_var"; + constants_info_[236].name = "mv2_features_14_conv_0_1_running_mean"; + constants_info_[236].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[236].offset = 0; + constants_info_[236].data_size = 2304; + constants_info_[236].from_folded = false; + constants_info_[236].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[236].shape = {576}; + constants_info_[236].stride = {1}; + constants_info_[236].layout = static_cast(cached_torch_layout_strided); + constants_info_[236].original_fqn = "mv2.features.14.conv.0.1.running_mean"; + constants_info_[237].name = "mv2_features_14_conv_0_1_running_var"; + constants_info_[237].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[237].offset = 0; + constants_info_[237].data_size = 2304; + constants_info_[237].from_folded = false; + constants_info_[237].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[237].shape = {576}; + constants_info_[237].stride = {1}; + constants_info_[237].layout = static_cast(cached_torch_layout_strided); + constants_info_[237].original_fqn = "mv2.features.14.conv.0.1.running_var"; + constants_info_[238].name = "mv2_features_14_conv_1_1_running_mean"; + constants_info_[238].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[238].offset = 0; + constants_info_[238].data_size = 2304; + constants_info_[238].from_folded = false; + constants_info_[238].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[238].shape = {576}; + constants_info_[238].stride = {1}; + constants_info_[238].layout = static_cast(cached_torch_layout_strided); + constants_info_[238].original_fqn = "mv2.features.14.conv.1.1.running_mean"; + constants_info_[239].name = "mv2_features_14_conv_1_1_running_var"; + constants_info_[239].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[239].offset = 0; + constants_info_[239].data_size = 2304; + constants_info_[239].from_folded = false; + constants_info_[239].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[239].shape = {576}; + constants_info_[239].stride = {1}; + constants_info_[239].layout = static_cast(cached_torch_layout_strided); + constants_info_[239].original_fqn = "mv2.features.14.conv.1.1.running_var"; + constants_info_[240].name = "mv2_features_14_conv_3_running_mean"; + constants_info_[240].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[240].offset = 0; + constants_info_[240].data_size = 640; + constants_info_[240].from_folded = false; + constants_info_[240].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + 
constants_info_[240].shape = {160}; + constants_info_[240].stride = {1}; + constants_info_[240].layout = static_cast(cached_torch_layout_strided); + constants_info_[240].original_fqn = "mv2.features.14.conv.3.running_mean"; + constants_info_[241].name = "mv2_features_14_conv_3_running_var"; + constants_info_[241].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[241].offset = 0; + constants_info_[241].data_size = 640; + constants_info_[241].from_folded = false; + constants_info_[241].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[241].shape = {160}; + constants_info_[241].stride = {1}; + constants_info_[241].layout = static_cast(cached_torch_layout_strided); + constants_info_[241].original_fqn = "mv2.features.14.conv.3.running_var"; + constants_info_[242].name = "mv2_features_15_conv_0_1_running_mean"; + constants_info_[242].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[242].offset = 0; + constants_info_[242].data_size = 3840; + constants_info_[242].from_folded = false; + constants_info_[242].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[242].shape = {960}; + constants_info_[242].stride = {1}; + constants_info_[242].layout = static_cast(cached_torch_layout_strided); + constants_info_[242].original_fqn = "mv2.features.15.conv.0.1.running_mean"; + constants_info_[243].name = "mv2_features_15_conv_0_1_running_var"; + constants_info_[243].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[243].offset = 0; + constants_info_[243].data_size = 3840; + constants_info_[243].from_folded = false; + constants_info_[243].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[243].shape = {960}; + constants_info_[243].stride = {1}; + constants_info_[243].layout = static_cast(cached_torch_layout_strided); + constants_info_[243].original_fqn = "mv2.features.15.conv.0.1.running_var"; + constants_info_[244].name = "mv2_features_15_conv_1_1_running_mean"; + constants_info_[244].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[244].offset = 0; + constants_info_[244].data_size = 3840; + constants_info_[244].from_folded = false; + constants_info_[244].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[244].shape = {960}; + constants_info_[244].stride = {1}; + constants_info_[244].layout = static_cast(cached_torch_layout_strided); + constants_info_[244].original_fqn = "mv2.features.15.conv.1.1.running_mean"; + constants_info_[245].name = "mv2_features_15_conv_1_1_running_var"; + constants_info_[245].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[245].offset = 0; + constants_info_[245].data_size = 3840; + constants_info_[245].from_folded = false; + constants_info_[245].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[245].shape = {960}; + constants_info_[245].stride = {1}; + constants_info_[245].layout = static_cast(cached_torch_layout_strided); + constants_info_[245].original_fqn = "mv2.features.15.conv.1.1.running_var"; + constants_info_[246].name = "mv2_features_15_conv_3_running_mean"; + constants_info_[246].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[246].offset = 0; + constants_info_[246].data_size = 640; + constants_info_[246].from_folded = false; + constants_info_[246].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[246].shape = {160}; + constants_info_[246].stride = {1}; + constants_info_[246].layout = 
static_cast(cached_torch_layout_strided); + constants_info_[246].original_fqn = "mv2.features.15.conv.3.running_mean"; + constants_info_[247].name = "mv2_features_15_conv_3_running_var"; + constants_info_[247].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[247].offset = 0; + constants_info_[247].data_size = 640; + constants_info_[247].from_folded = false; + constants_info_[247].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[247].shape = {160}; + constants_info_[247].stride = {1}; + constants_info_[247].layout = static_cast(cached_torch_layout_strided); + constants_info_[247].original_fqn = "mv2.features.15.conv.3.running_var"; + constants_info_[248].name = "mv2_features_16_conv_0_1_running_mean"; + constants_info_[248].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[248].offset = 0; + constants_info_[248].data_size = 3840; + constants_info_[248].from_folded = false; + constants_info_[248].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[248].shape = {960}; + constants_info_[248].stride = {1}; + constants_info_[248].layout = static_cast(cached_torch_layout_strided); + constants_info_[248].original_fqn = "mv2.features.16.conv.0.1.running_mean"; + constants_info_[249].name = "mv2_features_16_conv_0_1_running_var"; + constants_info_[249].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[249].offset = 0; + constants_info_[249].data_size = 3840; + constants_info_[249].from_folded = false; + constants_info_[249].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[249].shape = {960}; + constants_info_[249].stride = {1}; + constants_info_[249].layout = static_cast(cached_torch_layout_strided); + constants_info_[249].original_fqn = "mv2.features.16.conv.0.1.running_var"; + constants_info_[250].name = "mv2_features_16_conv_1_1_running_mean"; + constants_info_[250].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[250].offset = 0; + constants_info_[250].data_size = 3840; + constants_info_[250].from_folded = false; + constants_info_[250].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[250].shape = {960}; + constants_info_[250].stride = {1}; + constants_info_[250].layout = static_cast(cached_torch_layout_strided); + constants_info_[250].original_fqn = "mv2.features.16.conv.1.1.running_mean"; + constants_info_[251].name = "mv2_features_16_conv_1_1_running_var"; + constants_info_[251].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[251].offset = 0; + constants_info_[251].data_size = 3840; + constants_info_[251].from_folded = false; + constants_info_[251].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[251].shape = {960}; + constants_info_[251].stride = {1}; + constants_info_[251].layout = static_cast(cached_torch_layout_strided); + constants_info_[251].original_fqn = "mv2.features.16.conv.1.1.running_var"; + constants_info_[252].name = "mv2_features_16_conv_3_running_mean"; + constants_info_[252].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[252].offset = 0; + constants_info_[252].data_size = 640; + constants_info_[252].from_folded = false; + constants_info_[252].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[252].shape = {160}; + constants_info_[252].stride = {1}; + constants_info_[252].layout = static_cast(cached_torch_layout_strided); + constants_info_[252].original_fqn = 
"mv2.features.16.conv.3.running_mean"; + constants_info_[253].name = "mv2_features_16_conv_3_running_var"; + constants_info_[253].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[253].offset = 0; + constants_info_[253].data_size = 640; + constants_info_[253].from_folded = false; + constants_info_[253].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[253].shape = {160}; + constants_info_[253].stride = {1}; + constants_info_[253].layout = static_cast(cached_torch_layout_strided); + constants_info_[253].original_fqn = "mv2.features.16.conv.3.running_var"; + constants_info_[254].name = "mv2_features_17_conv_0_1_running_mean"; + constants_info_[254].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[254].offset = 0; + constants_info_[254].data_size = 3840; + constants_info_[254].from_folded = false; + constants_info_[254].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[254].shape = {960}; + constants_info_[254].stride = {1}; + constants_info_[254].layout = static_cast(cached_torch_layout_strided); + constants_info_[254].original_fqn = "mv2.features.17.conv.0.1.running_mean"; + constants_info_[255].name = "mv2_features_17_conv_0_1_running_var"; + constants_info_[255].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[255].offset = 0; + constants_info_[255].data_size = 3840; + constants_info_[255].from_folded = false; + constants_info_[255].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[255].shape = {960}; + constants_info_[255].stride = {1}; + constants_info_[255].layout = static_cast(cached_torch_layout_strided); + constants_info_[255].original_fqn = "mv2.features.17.conv.0.1.running_var"; + constants_info_[256].name = "mv2_features_17_conv_1_1_running_mean"; + constants_info_[256].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[256].offset = 0; + constants_info_[256].data_size = 3840; + constants_info_[256].from_folded = false; + constants_info_[256].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[256].shape = {960}; + constants_info_[256].stride = {1}; + constants_info_[256].layout = static_cast(cached_torch_layout_strided); + constants_info_[256].original_fqn = "mv2.features.17.conv.1.1.running_mean"; + constants_info_[257].name = "mv2_features_17_conv_1_1_running_var"; + constants_info_[257].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[257].offset = 0; + constants_info_[257].data_size = 3840; + constants_info_[257].from_folded = false; + constants_info_[257].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[257].shape = {960}; + constants_info_[257].stride = {1}; + constants_info_[257].layout = static_cast(cached_torch_layout_strided); + constants_info_[257].original_fqn = "mv2.features.17.conv.1.1.running_var"; + constants_info_[258].name = "mv2_features_17_conv_3_running_mean"; + constants_info_[258].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[258].offset = 0; + constants_info_[258].data_size = 1280; + constants_info_[258].from_folded = false; + constants_info_[258].type = static_cast(torch::aot_inductor::ConstantType::Buffer); + constants_info_[258].shape = {320}; + constants_info_[258].stride = {1}; + constants_info_[258].layout = static_cast(cached_torch_layout_strided); + constants_info_[258].original_fqn = "mv2.features.17.conv.3.running_mean"; + constants_info_[259].name = "mv2_features_17_conv_3_running_var"; 
+    constants_info_[259].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[259].offset = 0;
+    constants_info_[259].data_size = 1280;
+    constants_info_[259].from_folded = false;
+    constants_info_[259].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
+    constants_info_[259].shape = {320};
+    constants_info_[259].stride = {1};
+    constants_info_[259].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[259].original_fqn = "mv2.features.17.conv.3.running_var";
+    constants_info_[260].name = "mv2_features_18_1_running_mean";
+    constants_info_[260].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[260].offset = 0;
+    constants_info_[260].data_size = 5120;
+    constants_info_[260].from_folded = false;
+    constants_info_[260].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
+    constants_info_[260].shape = {1280};
+    constants_info_[260].stride = {1};
+    constants_info_[260].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[260].original_fqn = "mv2.features.18.1.running_mean";
+    constants_info_[261].name = "mv2_features_18_1_running_var";
+    constants_info_[261].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+    constants_info_[261].offset = 0;
+    constants_info_[261].data_size = 5120;
+    constants_info_[261].from_folded = false;
+    constants_info_[261].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
+    constants_info_[261].shape = {1280};
+    constants_info_[261].stride = {1};
+    constants_info_[261].layout = static_cast<int32_t>(cached_torch_layout_strided);
+    constants_info_[261].original_fqn = "mv2.features.18.1.running_var";
+    update_constants_map(std::move(constants_map));
+    update_constants_array(std::move(constants_array));
+    in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
+    out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])";
+    outputs_info_[0].name = "output0";
+    this->kernels_ = std::make_unique<AOTInductorModelKernels>();
+}
+
+std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
+    DeviceStreamType stream,
+    AOTIProxyExecutorHandle proxy_executor,
+    bool initialization
+) {
+
+    if (!initialization) {
+        std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
+                  << "aot_inductor.use_runtime_constant_folding=False\n";
+    }
+    return {};
+}
+} // namespace torch::aot_inductor
+using namespace torch::aot_inductor;
+
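Reviewer note: the `in_spec_`/`out_spec_` strings above are JSON-serialized pytree treespecs; they tell the AOTI runtime how to flatten the caller's `((x,), {})` arguments into the flat tensor list the compiled graph consumes. A minimal sketch of that correspondence, assuming `torch.utils._pytree`'s serialization helpers (which is what produced these strings):

```python
import torch
from torch.utils import _pytree as pytree

# One positional tensor, no kwargs -- the ((args,), kwargs) convention above.
example = ((torch.randn(1, 3, 224, 224),), {})
flat, spec = pytree.tree_flatten(example)

print(len(flat))                    # 1 flattened input tensor
print(pytree.treespec_dumps(spec))  # JSON shaped like the in_spec_ string
```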
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_0(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_0', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 4, 'x': 65536}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 451584, 'x': 602112}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 3
+        xnumel = 50176
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x1 = xindex
+        y0 = yindex
+        tmp0 = tl.load(in_ptr0 + (x1 + 50176*y0), xmask & ymask, eviction_policy='evict_last')
+        tl.store(out_ptr0 + (y0 + 3*x1), tmp0, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = ((ynumel + (4 - 1)) / (4));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_0 == nullptr) {
+        kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cxzopurug2u2kff3zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin", "triton_poi_fused_convolution_0", 4160, cubin_dir_);
+    }
+    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_2 = ynumel;
+    int var_3 = xnumel;
+    CUdeviceptr global_scratch_4 = 0;
+    void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4};
+    launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4160, kernel_args_, stream_);
+}
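The launch arithmetic in these wrappers is plain ceil-division of the problem size by block sizes frozen into the cubin at compile time (XBLOCK=256 and YBLOCK=4 here are read off the generated grid expressions, matching the `launchKernel(..., 4, 4160, ...)` warp/shared-memory constants). A sketch of the same computation:

```python
# Assumed tile sizes, taken from the generated grid_0/grid_1 expressions above.
def cdiv(a: int, b: int) -> int:
    return (a + b - 1) // b

xnumel, ynumel = 50176, 3            # 224*224 pixels, 3 channels
grid = (cdiv(xnumel, 256), cdiv(ynumel, 4), 1)
print(grid)                           # (196, 1, 1) -> grid_0, grid_1, grid_2
```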
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_1(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_1', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 128, 'x': 16}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6912, 'x': 3456}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 96
+        xnumel = 9
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x2 = xindex
+        y3 = yindex
+        y0 = (yindex % 3)
+        y1 = yindex // 3
+        tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last')
+        tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (16 - 1)) / (16));
+    uint32_t grid_1 = ((ynumel + (64 - 1)) / (64));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_1 == nullptr) {
+        kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cwvumepeeo7fjwjgwncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin", "triton_poi_fused_convolution_1", 4352, cubin_dir_);
+    }
+    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_6 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_7 = ynumel;
+    int var_8 = xnumel;
+    CUdeviceptr global_scratch_9 = 0;
+    void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9};
+    launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_);
+}
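Despite their names, neither `convolution` kernel convolves anything: both are layout shuffles that move channels innermost so the real convolutions can run on channels-last data. The index math above (`out[y0 + 3*x1] = in[x1 + 50176*y0]` for the 1x3x224x224 activation, `out[y0 + 3*x2 + 27*y1] = in[x2 + 9*y3]` for the 32x3x3x3 weight) matches PyTorch's `channels_last` conversion; a sketch of the equivalent transformation:

```python
import torch

x = torch.randn(1, 3, 224, 224)   # activation handled by ..._convolution_0
w = torch.randn(32, 3, 3, 3)      # first conv weight handled by ..._convolution_1

x_cl = x.to(memory_format=torch.channels_last)
w_cl = w.to(memory_format=torch.channels_last)

# Channel stride becomes 1, i.e. channels end up innermost in memory.
print(x_cl.stride())  # (150528, 1, 672, 3)
print(w_cl.stride())  # (27, 1, 9, 3)
```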
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 524288},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 4817408}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 401408
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = tl.full([XBLOCK], True, tl.int1)
+        x2 = xindex
+        x0 = (xindex % 32)
+        tmp0 = tl.load(in_out_ptr0 + (x2), None)
+        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, None)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2 = loadKernel("/home/gasoonjia/executorch/c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2", 0, cubin_dir_);
+    }
+    CUdeviceptr var_10 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_11 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_12 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_13 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_14 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_15 = xnumel;
+    CUdeviceptr global_scratch_16 = 0;
+    void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &var_14, &var_15, &global_scratch_16};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
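The tmp0..tmp19 chain above is inference-mode batch norm followed by ReLU6 (hardtanh clamped to [0, 6]), applied per channel `x0 = xindex % 32`. Judging by the constant FQNs earlier in the file, in_ptr0/in_ptr1 carry running_mean/running_var and in_ptr2/in_ptr3 the affine weight/bias (an inference, not stated in the generated code). Equivalent eager-mode math:

```python
import torch

def bn_hardtanh(x, mean, var, gamma, beta, eps=1e-5):
    y = (x - mean) / torch.sqrt(var + eps) * gamma + beta  # tmp2 .. tmp15
    return torch.clamp(y, 0.0, 6.0)                        # tmp16 .. tmp19

C = 32
bn = torch.nn.BatchNorm2d(C).eval()
x = torch.randn(1, C, 112, 112)
view = lambda t: t.view(1, C, 1, 1)
out = bn_hardtanh(x, view(bn.running_mean), view(bn.running_var),
                  view(bn.weight), view(bn.bias))
ref = torch.nn.functional.hardtanh(bn(x), 0.0, 6.0)
print(torch.allclose(out, ref, atol=1e-6))  # True
```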
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_3(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_3', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 262144},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_3', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 2408704}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_3(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 200704
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = tl.full([XBLOCK], True, tl.int1)
+        x2 = xindex
+        x0 = (xindex % 16)
+        tmp0 = tl.load(in_out_ptr0 + (x2), None)
+        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, None)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3 = loadKernel("/home/gasoonjia/executorch/cgpouheql4rpwtcaretoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin",
"triton_poi_fused__native_batch_norm_legit_no_training_3", 0, cubin_dir_); + } + CUdeviceptr var_17 = reinterpret_cast(in_out_ptr0.data_ptr()); + CUdeviceptr var_18 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_19 = reinterpret_cast(in_ptr1.data_ptr()); + CUdeviceptr var_20 = reinterpret_cast(in_ptr2.data_ptr()); + CUdeviceptr var_21 = reinterpret_cast(in_ptr3.data_ptr()); + int var_22 = xnumel; + CUdeviceptr global_scratch_23 = 0; + void* kernel_args_[] = {&var_17, &var_18, &var_19, &var_20, &var_21, &var_22, &global_scratch_23}; + launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4( + const in_out_ptr0_type_& in_out_ptr0, + const in_ptr0_type_& in_ptr0, + const in_ptr1_type_& in_ptr1, + const in_ptr2_type_& in_ptr2, + const in_ptr3_type_& in_ptr3, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'x': 2097152}, + filename=__file__, + triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 14452224}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): + xnumel = 1204224 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = tl.full([XBLOCK], True, tl.int1) + x2 = xindex + x0 = (xindex % 96) + tmp0 = tl.load(in_out_ptr0 + (x2), None) + tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last') + tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last') + tmp12 = tl.load(in_ptr2 + (x0), None, 
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 2097152},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 14452224}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 1204224
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = tl.full([XBLOCK], True, tl.int1)
+        x2 = xindex
+        x0 = (xindex % 96)
+        tmp0 = tl.load(in_out_ptr0 + (x2), None)
+        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, None)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4 = loadKernel("/home/gasoonjia/executorch/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4", 0, cubin_dir_);
+    }
+    CUdeviceptr var_24 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_25 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_26 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_27 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_28 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_29 = xnumel;
+    CUdeviceptr global_scratch_30 = 0;
+    void* kernel_args_[] = {&var_24, &var_25, &var_26, &var_27, &var_28, &var_29, &global_scratch_30};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
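Note that `..._hardtanh_2` and `..._hardtanh_4` drop the bounds check entirely (`xmask = tl.full([XBLOCK], True, tl.int1)`), while `..._hardtanh_5` below keeps a real `xmask`. The pattern is consistent with Inductor eliding the mask only when it can prove xnumel divisible by every block size the autotuner might pick; the divisibility it appears to rely on here (an inference from the emitted kernels, not a documented contract):

```python
# (xnumel, kernel keeps a bounds mask?) read off the generated kernels above/below.
for xnumel, masked in [(401408, False), (1204224, False), (301056, True)]:
    print(xnumel, "divisible by 8192:", xnumel % 8192 == 0,
          "-> mask" if masked else "-> no mask")
```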
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 524288},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 3614208}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 301056
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 96)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5 = loadKernel("/home/gasoonjia/executorch/c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5", 0, cubin_dir_);
+    }
+    CUdeviceptr var_31 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_32 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_33 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_34 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_35 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_36 = xnumel;
+    CUdeviceptr global_scratch_37 = 0;
+    void* kernel_args_[] = {&var_31, &var_32, &var_33, &var_34, &var_35, &var_36, &global_scratch_37};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
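Each hard-coded `xnumel` is simply N*C*H*W of the activation being normalized, so the kernel sequence doubles as a trace of MobileNetV2's feature-map shapes (shapes below assume the canonical 1x3x224x224 input):

```python
shapes = {
    "hardtanh_4":    (1, 96, 112, 112),  # xnumel = 1204224 (expansion, before stride-2)
    "hardtanh_5":    (1, 96, 56, 56),    # xnumel = 301056  (after depthwise stride 2)
    "no_training_6": (1, 24, 56, 56),    # xnumel = 75264   (linear projection output)
    "hardtanh_7":    (1, 144, 56, 56),   # xnumel = 451584  (next block's expansion)
}
for name, (n, c, h, w) in shapes.items():
    print(name, n * c * h * w)
```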
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_6(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_6', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_6', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 903552}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_6(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 75264
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 24)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6 = loadKernel("/home/gasoonjia/executorch/ckneyyhrfy6dkwkb6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_6", 0, cubin_dir_);
+    }
+    CUdeviceptr var_38 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_39 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_40 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_41 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_42 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_43 = xnumel;
+    CUdeviceptr global_scratch_44 = 0;
+    void* kernel_args_[] = {&var_38, &var_39, &var_40, &var_41, &var_42, &var_43, &global_scratch_44};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
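`'mutated_arg_names': ['in_out_ptr0']` in all of these batch-norm kernels means the convolution's output buffer is renormalized in place rather than copied into a fresh allocation. Roughly the difference between the two variants below (illustrative only):

```python
import torch

x = torch.randn(1, 24, 56, 56)
mean, var = torch.zeros(24), torch.ones(24)
v = lambda t: t.view(1, -1, 1, 1)

y = (x - v(mean)) / torch.sqrt(v(var) + 1e-5)    # out-of-place: fresh output buffer
x.sub_(v(mean)).div_(torch.sqrt(v(var) + 1e-5))  # in place, like in_out_ptr0
print(torch.allclose(x, y))                      # True
```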
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 524288},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 5421312}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 451584
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 144)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7 = loadKernel("/home/gasoonjia/executorch/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7", 0, cubin_dir_);
+    }
+    CUdeviceptr var_45 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_46 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_47 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_48 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_49 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_50 = xnumel;
+    CUdeviceptr global_scratch_51 = 0;
+    void* kernel_args_[] = {&var_45, &var_46, &var_47, &var_48, &var_49, &var_50, &global_scratch_51};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_8(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    const in_ptr4_type_& in_ptr4,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_8', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_8', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1204608}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_add_8(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 75264
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 24)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x2), xmask)
+        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tmp1 - tmp2
+        tmp5 = 1e-05
+        tmp6 = tmp4 + tmp5
+        tmp7 = libdevice.sqrt(tmp6)
+        tmp8 = tl.full([1], 1, tl.int32)
+        tmp9 = (tmp8 / tmp7)
+        tmp10 = 1.0
+        tmp11 = tmp9 * tmp10
+        tmp12 = tmp3 * tmp11
+        tmp14 = tmp12 * tmp13
+        tmp16 = tmp14 + tmp15
+        tmp17 = tmp0 + tmp16
+        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8 = loadKernel("/home/gasoonjia/executorch/cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_8", 0, cubin_dir_);
+    }
+    CUdeviceptr var_52 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_53 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_54 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_55 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_56 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    CUdeviceptr var_57 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+    int var_58 = xnumel;
+    CUdeviceptr global_scratch_59 = 0;
+    void* kernel_args_[] = {&var_52, &var_53, &var_54, &var_55, &var_56, &var_57, &var_58, &global_scratch_59};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1357056}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 112896
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 144)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9 = loadKernel("/home/gasoonjia/executorch/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9", 0, cubin_dir_);
+    }
+    CUdeviceptr var_60 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_61 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_62 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_63 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_64 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_65 = xnumel;
+    CUdeviceptr global_scratch_66 = 0;
+    void* kernel_args_[] = {&var_60, &var_61, &var_62, &var_63, &var_64, &var_65, &global_scratch_66};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_10(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_10', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 32768},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 301568}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 25088
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 32)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 = loadKernel("/home/gasoonjia/executorch/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_10", 0, cubin_dir_);
+    }
+    CUdeviceptr var_67 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_68 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_69 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_70 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_71 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_72 = xnumel;
+    CUdeviceptr global_scratch_73 = 0;
+    void* kernel_args_[] = {&var_67, &var_68, &var_69, &var_70, &var_71, &var_72, &global_scratch_73};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 262144},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1809408}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 150528
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 192)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
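+    // The Triton source above implements inference-mode batch norm,
+    // y = (x - mean) * 1/sqrt(var + 1e-05) * gamma + beta; the *_hardtanh_*
+    // variants additionally clamp the result to [0.0, 6.0] (ReLU6), and the
+    // *_add_* variants fuse a residual add instead. The kernels differ only
+    // in xnumel and the channel modulus (here 192).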
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11 = loadKernel("/home/gasoonjia/executorch/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11", 0, cubin_dir_);
+    }
+    CUdeviceptr var_74 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_75 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_76 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_77 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_78 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_79 = xnumel;
+    CUdeviceptr global_scratch_80 = 0;
+    void* kernel_args_[] = {&var_74, &var_75, &var_76, &var_77, &var_78, &var_79, &global_scratch_80};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    const in_ptr4_type_& in_ptr4,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_12', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 32768},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_12', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 401920}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_add_12(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 25088
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 32)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x2), xmask)
+        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tmp1 - tmp2
+        tmp5 = 1e-05
+        tmp6 = tmp4 + tmp5
+        tmp7 = libdevice.sqrt(tmp6)
+        tmp8 = tl.full([1], 1, tl.int32)
+        tmp9 = (tmp8 / tmp7)
+        tmp10 = 1.0
+        tmp11 = tmp9 * tmp10
+        tmp12 = tmp3 * tmp11
+        tmp14 = tmp12 * tmp13
+        tmp16 = tmp14 + tmp15
+        tmp17 = tmp0 + tmp16
+        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12 = loadKernel("/home/gasoonjia/executorch/c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_12", 0, cubin_dir_);
+    }
+    CUdeviceptr var_81 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_82 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_83 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_84 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_85 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    CUdeviceptr var_86 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+    int var_87 = xnumel;
+    CUdeviceptr global_scratch_88 = 0;
+    void* kernel_args_[] = {&var_81, &var_82, &var_83, &var_84, &var_85, &var_86, &var_87, &global_scratch_88};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 65536},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 454656}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 37632
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 192)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13 = loadKernel("/home/gasoonjia/executorch/cxn357cdpjzfyhgfzkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13", 0, cubin_dir_);
+    }
+    CUdeviceptr var_89 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_90 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_91 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_92 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_93 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_94 = xnumel;
+    CUdeviceptr global_scratch_95 = 0;
+    void* kernel_args_[] = {&var_89, &var_90, &var_91, &var_92, &var_93, &var_94, &global_scratch_95};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_14(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_14', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 16384},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_14', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 151552}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_14(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 12544
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 64)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14 = loadKernel("/home/gasoonjia/executorch/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_14", 0, cubin_dir_);
+    }
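+    // Kernel handles start out nullptr and are filled in on first call. The
+    // absolute /home/gasoonjia/executorch/*.cubin paths were recorded at
+    // export time (store_cubin=True above); cubin_dir_ presumably lets a
+    // caller redirect the lookup when the artifacts are relocated.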
+    CUdeviceptr var_96 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_97 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_98 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_99 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_100 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_101 = xnumel;
+    CUdeviceptr global_scratch_102 = 0;
+    void* kernel_args_[] = {&var_96, &var_97, &var_98, &var_99, &var_100, &var_101, &global_scratch_102};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 909312}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 75264
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 384)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15 = loadKernel("/home/gasoonjia/executorch/caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15", 0, cubin_dir_);
+    }
+    CUdeviceptr var_103 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_104 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_105 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_106 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_107 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_108 = xnumel;
+    CUdeviceptr global_scratch_109 = 0;
+    void* kernel_args_[] = {&var_103, &var_104, &var_105, &var_106, &var_107, &var_108, &global_scratch_109};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    const in_ptr4_type_& in_ptr4,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_16', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 16384},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_16', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 201728}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_add_16(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 12544
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 64)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x2), xmask)
+        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tmp1 - tmp2
+        tmp5 = 1e-05
+        tmp6 = tmp4 + tmp5
+        tmp7 = libdevice.sqrt(tmp6)
+        tmp8 = tl.full([1], 1, tl.int32)
+        tmp9 = (tmp8 / tmp7)
+        tmp10 = 1.0
+        tmp11 = tmp9 * tmp10
+        tmp12 = tmp3 * tmp11
+        tmp14 = tmp12 * tmp13
+        tmp16 = tmp14 + tmp15
+        tmp17 = tmp0 + tmp16
+        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16 = loadKernel("/home/gasoonjia/executorch/cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_16", 0, cubin_dir_);
+    }
+    CUdeviceptr var_110 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_111 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_112 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_113 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_114 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    CUdeviceptr var_115 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+    int var_116 = xnumel;
+    CUdeviceptr global_scratch_117 = 0;
+    void* kernel_args_[] = {&var_110, &var_111, &var_112, &var_113, &var_114, &var_115, &var_116, &global_scratch_117};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_17(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_17', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 32768},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_17', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 227328}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_17(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 18816
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 96)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17 = loadKernel("/home/gasoonjia/executorch/ctc4njxfwewhkkjkreaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_17", 0, cubin_dir_);
+    }
+    CUdeviceptr var_118 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_119 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_120 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_121 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_122 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    int var_123 = xnumel;
+    CUdeviceptr global_scratch_124 = 0;
+    void* kernel_args_[] = {&var_118, &var_119, &var_120, &var_121, &var_122, &var_123, &global_scratch_124};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 131072},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1363968}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+        xnumel = 112896
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x2 = xindex
+        x0 = (xindex % 576)
+        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp2 = tmp0 - tmp1
+        tmp4 = 1e-05
+        tmp5 = tmp3 + tmp4
+        tmp6 = libdevice.sqrt(tmp5)
+        tmp7 = tl.full([1], 1, tl.int32)
+        tmp8 = (tmp7 / tmp6)
+        tmp9 = 1.0
+        tmp10 = tmp8 * tmp9
+        tmp11 = tmp2 * tmp10
+        tmp13 = tmp11 * tmp12
+        tmp15 = tmp13 + tmp14
+        tmp16 = 0.0
+        tmp17 = triton_helpers.maximum(tmp15, tmp16)
+        tmp18 = 6.0
+        tmp19 = triton_helpers.minimum(tmp17, tmp18)
+        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18 == nullptr) {
loadKernel("/home/gasoonjia/executorch/cklg2ezqvtkbhlekhvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18", 0, cubin_dir_); + } + CUdeviceptr var_125 = reinterpret_cast(in_out_ptr0.data_ptr()); + CUdeviceptr var_126 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_127 = reinterpret_cast(in_ptr1.data_ptr()); + CUdeviceptr var_128 = reinterpret_cast(in_ptr2.data_ptr()); + CUdeviceptr var_129 = reinterpret_cast(in_ptr3.data_ptr()); + int var_130 = xnumel; + CUdeviceptr global_scratch_131 = 0; + void* kernel_args_[] = {&var_125, &var_126, &var_127, &var_128, &var_129, &var_130, &global_scratch_131}; + launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_19( + const in_out_ptr0_type_& in_out_ptr0, + const in_ptr0_type_& in_ptr0, + const in_ptr1_type_& in_ptr1, + const in_ptr2_type_& in_ptr2, + const in_ptr3_type_& in_ptr3, + const in_ptr4_type_& in_ptr4, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_19', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'x': 32768}, + filename=__file__, + triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_19', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 302592}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused__native_batch_norm_legit_no_training_add_19(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr): + xnumel = 18816 + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = xindex < xnumel + x2 = xindex + x0 = (xindex % 96) + tmp0 = tl.load(in_out_ptr0 + (x2), xmask) 
+        tmp1 = tl.load(in_ptr0 + (x2), xmask)
+        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+        tmp3 = tmp1 - tmp2
+        tmp5 = 1e-05
+        tmp6 = tmp4 + tmp5
+        tmp7 = libdevice.sqrt(tmp6)
+        tmp8 = tl.full([1], 1, tl.int32)
+        tmp9 = (tmp8 / tmp7)
+        tmp10 = 1.0
+        tmp11 = tmp9 * tmp10
+        tmp12 = tmp3 * tmp11
+        tmp14 = tmp12 * tmp13
+        tmp16 = tmp14 + tmp15
+        tmp17 = tmp0 + tmp16
+        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+    uint32_t grid_1 = 1;
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19 == nullptr) {
+        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19 = loadKernel("/home/gasoonjia/executorch/c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_19", 0, cubin_dir_);
+    }
+    CUdeviceptr var_132 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+    CUdeviceptr var_133 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_134 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+    CUdeviceptr var_135 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+    CUdeviceptr var_136 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+    CUdeviceptr var_137 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+    int var_138 = xnumel;
+    CUdeviceptr global_scratch_139 = 0;
+    void* kernel_args_[] = {&var_132, &var_133, &var_134, &var_135, &var_136, &var_137, &var_138, &global_scratch_139};
+    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(
+    const in_out_ptr0_type_& in_out_ptr0,
+    const in_ptr0_type_& in_ptr0,
+    const in_ptr1_type_& in_ptr1,
+    const in_ptr2_type_& in_ptr2,
+    const in_ptr3_type_& in_ptr3,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'x': 32768},
+        filename=__file__,
+        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 28224
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 576)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 - tmp1
+ tmp4 = 1e-05
+ tmp5 = tmp3 + tmp4
+ tmp6 = libdevice.sqrt(tmp5)
+ tmp7 = tl.full([1], 1, tl.int32)
+ tmp8 = (tmp7 / tmp6)
+ tmp9 = 1.0
+ tmp10 = tmp8 * tmp9
+ tmp11 = tmp2 * tmp10
+ tmp13 = tmp11 * tmp12
+ tmp15 = tmp13 + tmp14
+ tmp16 = 0.0
+ tmp17 = triton_helpers.maximum(tmp15, tmp16)
+ tmp18 = 6.0
+ tmp19 = triton_helpers.minimum(tmp17, tmp18)
+ tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20 = loadKernel("/home/gasoonjia/executorch/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20", 0, cubin_dir_);
+ }
+ CUdeviceptr var_140 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_141 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_142 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_143 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_144 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ int var_145 = xnumel;
+ CUdeviceptr global_scratch_146 = 0;
+ void* kernel_args_[] = {&var_140, &var_141, &var_142, &var_143, &var_144, &var_145, &global_scratch_146};
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_21(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_21', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 8192},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_21', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 96640}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_21(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 7840
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 160)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 - tmp1
+ tmp4 = 1e-05
+ tmp5 = tmp3 + tmp4
+ tmp6 = libdevice.sqrt(tmp5)
+ tmp7 = tl.full([1], 1, tl.int32)
+ tmp8 = (tmp7 / tmp6)
+ tmp9 = 1.0
+ tmp10 = tmp8 * tmp9
+ tmp11 = tmp2 * tmp10
+ tmp13 = tmp11 * tmp12
+ tmp15 = tmp13 + tmp14
+ tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21 = loadKernel("/home/gasoonjia/executorch/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_21", 0, cubin_dir_);
+ }
+ CUdeviceptr var_147 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_148 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_149 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_150 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_151 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ int var_152 = xnumel;
+ CUdeviceptr global_scratch_153 = 0;
+ void* kernel_args_[] = {&var_147, &var_148, &var_149, &var_150, &var_151, &var_152, &global_scratch_153};
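+ // Arguments reach the CUDA driver as an array of pointers to each value; the
+ // trailing global-scratch pointer is unused (0) for this kernel. launchKernel
+ // then takes the 1-D grid computed above plus the warp count and dynamic
+ // shared-memory size recorded at compile time (4 warps, 0 bytes here).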
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 65536},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 579840}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 47040
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 960)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 - tmp1
+ tmp4 = 1e-05
+ tmp5 = tmp3 + tmp4
+ tmp6 = libdevice.sqrt(tmp5)
+ tmp7 = tl.full([1], 1, tl.int32)
+ tmp8 = (tmp7 / tmp6)
+ tmp9 = 1.0
+ tmp10 = tmp8 * tmp9
+ tmp11 = tmp2 * tmp10
+ tmp13 = tmp11 * tmp12
+ tmp15 = tmp13 + tmp14
+ tmp16 = 0.0
+ tmp17 = triton_helpers.maximum(tmp15, tmp16)
+ tmp18 = 6.0
+ tmp19 = triton_helpers.minimum(tmp17, tmp18)
+ tl.store(in_out_ptr0 + (x2), tmp19, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
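+ // Ceil-division grid: one Triton program per XBLOCK-sized slice of the
+ // flattened iteration space (a block size of 512 was baked in for this kernel
+ // at compile time).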
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22 = loadKernel("/home/gasoonjia/executorch/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22", 0, cubin_dir_);
+ }
+ CUdeviceptr var_154 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_155 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_156 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_157 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_158 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ int var_159 = xnumel;
+ CUdeviceptr global_scratch_160 = 0;
+ void* kernel_args_[] = {&var_154, &var_155, &var_156, &var_157, &var_158, &var_159, &global_scratch_160};
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ const in_ptr4_type_& in_ptr4,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_23', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 8192},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_23', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 128000}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_add_23(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 7840
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 160)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x2), xmask)
+ tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tmp1 - tmp2
+ tmp5 = 1e-05
+ tmp6 = tmp4 + tmp5
+ tmp7 = libdevice.sqrt(tmp6)
+ tmp8 = tl.full([1], 1, tl.int32)
+ tmp9 = (tmp8 / tmp7)
+ tmp10 = 1.0
+ tmp11 = tmp9 * tmp10
+ tmp12 = tmp3 * tmp11
+ tmp14 = tmp12 * tmp13
+ tmp16 = tmp14 + tmp15
+ tmp17 = tmp0 + tmp16
+ tl.store(in_out_ptr0 + (x2), tmp17, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23 = loadKernel("/home/gasoonjia/executorch/c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_23", 0, cubin_dir_);
+ }
+ CUdeviceptr var_161 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_162 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_163 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_164 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_165 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ CUdeviceptr var_166 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+ int var_167 = xnumel;
+ CUdeviceptr global_scratch_168 = 0;
+ void* kernel_args_[] = {&var_161, &var_162, &var_163, &var_164, &var_165, &var_166, &var_167, &global_scratch_168};
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
+static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_24(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_24', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 16384},
+ filename=__file__,
+ triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_24', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 193280}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused__native_batch_norm_legit_no_training_24(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 15680
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x2 = xindex
+ x0 = (xindex % 320)
+ tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
+ tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
+ tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
+ tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
+ tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
+ tmp2 = tmp0 - tmp1
+ tmp4 = 1e-05
+ tmp5 = tmp3 + tmp4
+ tmp6 = libdevice.sqrt(tmp5)
+ tmp7 = tl.full([1], 1, tl.int32)
+ tmp8 = (tmp7 / tmp6)
+ tmp9 = 1.0
+ tmp10 = tmp8 * tmp9
+ tmp11 = tmp2 * tmp10
+ tmp13 = tmp11 * tmp12
+ tmp15 = tmp13 + tmp14
+ tl.store(in_out_ptr0 + (x2), tmp15, xmask)
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24 == nullptr) {
+ kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24 = loadKernel("/home/gasoonjia/executorch/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_24", 0, cubin_dir_);
+ }
+ CUdeviceptr var_169 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_170 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_171 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_172 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_173 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ int var_174 = xnumel;
+ CUdeviceptr global_scratch_175 = 0;
+ void* kernel_args_[] = {&var_169, &var_170, &var_171, &var_172, &var_173, &var_174, &global_scratch_175};
+ launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
+static inline void call_triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(
+ const in_out_ptr0_type_& in_out_ptr0,
+ const in_ptr0_type_& in_ptr0,
+ const in_ptr1_type_& in_ptr1,
+ const in_ptr2_type_& in_ptr2,
+ const in_ptr3_type_& in_ptr3,
+ const in_ptr4_type_& in_ptr4,
+ int64_t xnumel,
+ int64_t r0_numel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
async_compile.triton('triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.persistent_reduction( + size_hints={'x': 2048, 'r0_': 64}, + reduction_hint=ReductionHint.OUTER, + filename=__file__, + triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 1, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 281600, 'r0_': 0}} + ) + @triton.jit + def triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr): + xnumel = 1280 + r0_numel = 49 + R0_BLOCK: tl.constexpr = 64 + rnumel = r0_numel + RBLOCK: tl.constexpr = R0_BLOCK + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:, None] + xmask = xindex < xnumel + r0_index = tl.arange(0, R0_BLOCK)[None, :] + r0_offset = 0 + r0_mask = r0_index < r0_numel + roffset = r0_offset + rindex = r0_index + r0_1 = r0_index + x0 = xindex + tmp0 = tl.load(in_ptr0 + (x0 + 1280*r0_1), r0_mask & xmask, other=0.0) + tmp1 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') + tmp3 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') + tmp12 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') + tmp14 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last') + tmp2 = tmp0 - tmp1 + tmp4 = 1e-05 + tmp5 = tmp3 + tmp4 + tmp6 = libdevice.sqrt(tmp5) + tmp7 = tl.full([1, 1], 1, tl.int32) + tmp8 = (tmp7 / tmp6) + tmp9 = 1.0 + tmp10 = tmp8 * tmp9 + tmp11 = tmp2 * tmp10 + tmp13 = tmp11 * tmp12 + tmp15 = tmp13 + tmp14 + tmp16 = 0.0 + tmp17 = triton_helpers.maximum(tmp15, tmp16) + tmp18 = 6.0 + tmp19 = triton_helpers.minimum(tmp17, tmp18) + tmp20 = tl.broadcast_to(tmp19, [XBLOCK, R0_BLOCK]) + tmp22 = tl.where(r0_mask & xmask, tmp20, 0) + tmp23 = tl.sum(tmp22, 1)[:, None] + tmp24 = 49.0 + tmp25 = (tmp23 / tmp24) + tl.debug_barrier() + tl.store(in_out_ptr0 + (x0), tmp25, xmask) + 
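+ # In one pass this persistent reduction applies the inference-mode batch-norm
+ # affine transform, clamps to [0, 6] (hardtanh, i.e. ReLU6), and averages over
+ # the 49 (= 7x7) spatial positions of each of the 1280 channels: the model's
+ # global average pool.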
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25 == nullptr) {
+ kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25 = loadKernel("/home/gasoonjia/executorch/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin", "triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25", 1024, cubin_dir_);
+ }
+ CUdeviceptr var_176 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
+ CUdeviceptr var_177 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_178 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
+ CUdeviceptr var_179 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
+ CUdeviceptr var_180 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
+ CUdeviceptr var_181 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
+ int var_182 = xnumel;
+ int var_183 = r0_numel;
+ CUdeviceptr global_scratch_184 = 0;
+ void* kernel_args_[] = {&var_176, &var_177, &var_178, &var_179, &var_180, &var_181, &var_182, &var_183, &global_scratch_184};
+ launchKernel(kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25, grid_0, grid_1, grid_2, 8, 1024, kernel_args_, stream_);
+}
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_permute_copy_26(
+ const in_ptr0_type_& in_ptr0,
+ const out_ptr0_type_& out_ptr0,
+ int64_t xnumel,
+ int32_t device_idx_,
+ cudaStream_t stream_,
+ kernels_type_& kernels_,
+ const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+ /*
+ async_compile.triton('triton_poi_fused_permute_copy_26', '''
+ import triton
+ import triton.language as tl
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+ triton_helpers.set_driver_to_gpu()
+
+ @triton_heuristics.pointwise(
+ size_hints={'x': 2097152},
+ filename=__file__,
+ triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
+ inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_permute_copy_26', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 15360000}},
+ min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_poi_fused_permute_copy_26(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+ xnumel = 1280000
+ xoffset = tl.program_id(0) * XBLOCK
+ xindex = xoffset + tl.arange(0, XBLOCK)[:]
+ xmask = xindex < xnumel
+ x0 = xindex
+ tmp0 = tl.load(in_ptr0 + (x0), xmask)
+ tl.store(out_ptr0 + (x0), tmp0, xmask)
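+ # Plain elementwise copy that materializes a permuted weight into a contiguous
+ # buffer; 1280000 elements = 1000 x 1280, presumably the mv2 classifier weight.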
+ ''', device_str='cuda')
+ */
+ uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
+ uint32_t grid_1 = 1;
+ uint32_t grid_2 = 1;
+ if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+ if (kernels_.triton_poi_fused_permute_copy_26 == nullptr) {
+ kernels_.triton_poi_fused_permute_copy_26 = loadKernel("/home/gasoonjia/executorch/czj7vvfy745m4rwqvkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin", "triton_poi_fused_permute_copy_26", 0, cubin_dir_);
+ }
+ CUdeviceptr var_185 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+ CUdeviceptr var_186 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+ int var_187 = xnumel;
+ CUdeviceptr global_scratch_188 = 0;
+ void* kernel_args_[] = {&var_185, &var_186, &var_187, &global_scratch_188};
+ launchKernel(kernels_.triton_poi_fused_permute_copy_26, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
+}
+
+namespace torch::aot_inductor {
+
+void AOTInductorModel::_const_run_impl(
+ std::vector<AtenTensorHandle>& output_handles,
+ DeviceStreamType stream,
+ AOTIProxyExecutorHandle proxy_executor
+) {}
+
+AOTI_NOINLINE static void check_input_0(
+ AtenTensorHandle* input_handles
+) {
+ ConstantHandle arg262_1 = ConstantHandle(input_handles[0]);
+ int32_t arg262_1_dtype;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg262_1, &arg262_1_dtype));
+
+ int32_t arg262_1_expected_dtype = aoti_torch_dtype_float32();
+ if (arg262_1_expected_dtype != arg262_1_dtype) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dtype, "
+ << "expected: " << arg262_1_expected_dtype << "(at::kFloat), "
+ << "but got: " << arg262_1_dtype << "\n";
+ throw std::runtime_error(ss.str());
+ }
+ auto arg262_1_size = arg262_1.sizes();
+
+ if (1 != arg262_1_size[0]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dim value at 0, "
+ << "expected: 1, " << "but got: " << arg262_1_size[0]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (3 != arg262_1_size[1]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dim value at 1, "
+ << "expected: 3, " << "but got: " << arg262_1_size[1]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (224 != arg262_1_size[2]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dim value at 2, "
+ << "expected: 224, " << "but got: " << arg262_1_size[2]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (224 != arg262_1_size[3]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched dim value at 3, "
+ << "expected: 224, " << "but got: " << arg262_1_size[3]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+ auto arg262_1_stride = arg262_1.strides();
+
+ if (150528 != arg262_1_stride[0]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched stride value at 0, "
+ << "expected: 150528, " << "but got: " << arg262_1_stride[0]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (50176 != arg262_1_stride[1]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched stride value at 1, "
+ << "expected: 50176, " << "but got: " << arg262_1_stride[1]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (224 != arg262_1_stride[2]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched stride value at 2, "
+ << "expected: 224, " << "but got: " << arg262_1_stride[2]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+
+ if (1 != arg262_1_stride[3]) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched stride value at 3, "
+ << "expected: 1, " << "but got: " << arg262_1_stride[3]
+ << "\n";
+ throw std::runtime_error(ss.str());
+ }
+ int32_t arg262_1_device_type;
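+ // Device-type check: 1 is the device-type code the generated code expects for
+ // CUDA (cf. the "(cuda)" note in the message below); a CPU tensor fails here
+ // with a clear error instead of crashing in a kernel launch.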
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg262_1, &arg262_1_device_type));
+
+ int32_t arg262_1_expected_device_type = 1;
+ if (arg262_1_expected_device_type != arg262_1_device_type) {
+ std::stringstream ss;
+ ss << "input_handles[0]: unmatched device type, "
+ << "expected: " << arg262_1_expected_device_type << " (cuda), "
+ << "but got: " << arg262_1_device_type << "\n";
+ throw std::runtime_error(ss.str());
+ }
+}
+
+static bool _check_aoti_runtime_check_inputs_env() {
+ const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS");
+ const static bool result = env_var_value != nullptr && env_var_value[0] != '0';
+ return result;
+}
+
+AOTI_NOINLINE static void __check_inputs_outputs(
+ AtenTensorHandle* input_handles,
+ AtenTensorHandle* output_handles) {
+ if (!_check_aoti_runtime_check_inputs_env()){
+ return;
+ }
+ check_input_0(input_handles);
+}
+
+void AOTInductorModel::run_impl(
+ AtenTensorHandle*
+ input_handles, // array of input AtenTensorHandle; handles
+ // are stolen; the array itself is borrowed
+ AtenTensorHandle*
+ output_handles, // array for writing output AtenTensorHandle; handles
+ // will be stolen by the caller; the array itself is
+ // borrowed
+ DeviceStreamType stream,
+ AOTIProxyExecutorHandle proxy_executor
+) {
+ __check_inputs_outputs(input_handles, output_handles);
+
+ auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1);
+ auto arg262_1 = std::move(inputs[0]);
+ [[maybe_unused]] auto& mv2_features_0_0_weight = constants_->at(0);
+ [[maybe_unused]] auto& mv2_features_0_1_weight = constants_->at(1);
+ [[maybe_unused]] auto& mv2_features_0_1_bias = constants_->at(2);
+ [[maybe_unused]] auto& mv2_features_1_conv_0_0_weight = constants_->at(3);
+ [[maybe_unused]] auto& mv2_features_1_conv_0_1_weight = constants_->at(4);
+ [[maybe_unused]] auto& mv2_features_1_conv_0_1_bias = constants_->at(5);
+ [[maybe_unused]] auto& mv2_features_1_conv_1_weight = constants_->at(6);
+ [[maybe_unused]] auto& mv2_features_1_conv_2_weight = constants_->at(7);
+ [[maybe_unused]] auto& mv2_features_1_conv_2_bias = constants_->at(8);
+ [[maybe_unused]] auto& mv2_features_2_conv_0_0_weight = constants_->at(9);
+ [[maybe_unused]] auto& mv2_features_2_conv_0_1_weight = constants_->at(10);
+ [[maybe_unused]] auto& mv2_features_2_conv_0_1_bias = constants_->at(11);
+ [[maybe_unused]] auto& mv2_features_2_conv_1_0_weight = constants_->at(12);
+ [[maybe_unused]] auto& mv2_features_2_conv_1_1_weight = constants_->at(13);
+ [[maybe_unused]] auto& mv2_features_2_conv_1_1_bias = constants_->at(14);
+ [[maybe_unused]] auto& mv2_features_2_conv_2_weight = constants_->at(15);
+ [[maybe_unused]] auto& mv2_features_2_conv_3_weight = constants_->at(16);
+ [[maybe_unused]] auto& mv2_features_2_conv_3_bias = constants_->at(17);
+ [[maybe_unused]] auto& mv2_features_3_conv_0_0_weight = constants_->at(18);
+ [[maybe_unused]] auto& mv2_features_3_conv_0_1_weight = constants_->at(19);
+ [[maybe_unused]] auto& mv2_features_3_conv_0_1_bias = constants_->at(20);
+ [[maybe_unused]] auto& mv2_features_3_conv_1_0_weight = constants_->at(21);
+ [[maybe_unused]] auto& mv2_features_3_conv_1_1_weight = constants_->at(22);
+ [[maybe_unused]] auto& mv2_features_3_conv_1_1_bias = constants_->at(23);
+ [[maybe_unused]] auto& mv2_features_3_conv_2_weight = constants_->at(24);
+ [[maybe_unused]] auto& mv2_features_3_conv_3_weight = constants_->at(25);
+ [[maybe_unused]] auto& mv2_features_3_conv_3_bias = constants_->at(26);
+ [[maybe_unused]] auto&
mv2_features_4_conv_0_0_weight = constants_->at(27); + [[maybe_unused]] auto& mv2_features_4_conv_0_1_weight = constants_->at(28); + [[maybe_unused]] auto& mv2_features_4_conv_0_1_bias = constants_->at(29); + [[maybe_unused]] auto& mv2_features_4_conv_1_0_weight = constants_->at(30); + [[maybe_unused]] auto& mv2_features_4_conv_1_1_weight = constants_->at(31); + [[maybe_unused]] auto& mv2_features_4_conv_1_1_bias = constants_->at(32); + [[maybe_unused]] auto& mv2_features_4_conv_2_weight = constants_->at(33); + [[maybe_unused]] auto& mv2_features_4_conv_3_weight = constants_->at(34); + [[maybe_unused]] auto& mv2_features_4_conv_3_bias = constants_->at(35); + [[maybe_unused]] auto& mv2_features_5_conv_0_0_weight = constants_->at(36); + [[maybe_unused]] auto& mv2_features_5_conv_0_1_weight = constants_->at(37); + [[maybe_unused]] auto& mv2_features_5_conv_0_1_bias = constants_->at(38); + [[maybe_unused]] auto& mv2_features_5_conv_1_0_weight = constants_->at(39); + [[maybe_unused]] auto& mv2_features_5_conv_1_1_weight = constants_->at(40); + [[maybe_unused]] auto& mv2_features_5_conv_1_1_bias = constants_->at(41); + [[maybe_unused]] auto& mv2_features_5_conv_2_weight = constants_->at(42); + [[maybe_unused]] auto& mv2_features_5_conv_3_weight = constants_->at(43); + [[maybe_unused]] auto& mv2_features_5_conv_3_bias = constants_->at(44); + [[maybe_unused]] auto& mv2_features_6_conv_0_0_weight = constants_->at(45); + [[maybe_unused]] auto& mv2_features_6_conv_0_1_weight = constants_->at(46); + [[maybe_unused]] auto& mv2_features_6_conv_0_1_bias = constants_->at(47); + [[maybe_unused]] auto& mv2_features_6_conv_1_0_weight = constants_->at(48); + [[maybe_unused]] auto& mv2_features_6_conv_1_1_weight = constants_->at(49); + [[maybe_unused]] auto& mv2_features_6_conv_1_1_bias = constants_->at(50); + [[maybe_unused]] auto& mv2_features_6_conv_2_weight = constants_->at(51); + [[maybe_unused]] auto& mv2_features_6_conv_3_weight = constants_->at(52); + [[maybe_unused]] auto& mv2_features_6_conv_3_bias = constants_->at(53); + [[maybe_unused]] auto& mv2_features_7_conv_0_0_weight = constants_->at(54); + [[maybe_unused]] auto& mv2_features_7_conv_0_1_weight = constants_->at(55); + [[maybe_unused]] auto& mv2_features_7_conv_0_1_bias = constants_->at(56); + [[maybe_unused]] auto& mv2_features_7_conv_1_0_weight = constants_->at(57); + [[maybe_unused]] auto& mv2_features_7_conv_1_1_weight = constants_->at(58); + [[maybe_unused]] auto& mv2_features_7_conv_1_1_bias = constants_->at(59); + [[maybe_unused]] auto& mv2_features_7_conv_2_weight = constants_->at(60); + [[maybe_unused]] auto& mv2_features_7_conv_3_weight = constants_->at(61); + [[maybe_unused]] auto& mv2_features_7_conv_3_bias = constants_->at(62); + [[maybe_unused]] auto& mv2_features_8_conv_0_0_weight = constants_->at(63); + [[maybe_unused]] auto& mv2_features_8_conv_0_1_weight = constants_->at(64); + [[maybe_unused]] auto& mv2_features_8_conv_0_1_bias = constants_->at(65); + [[maybe_unused]] auto& mv2_features_8_conv_1_0_weight = constants_->at(66); + [[maybe_unused]] auto& mv2_features_8_conv_1_1_weight = constants_->at(67); + [[maybe_unused]] auto& mv2_features_8_conv_1_1_bias = constants_->at(68); + [[maybe_unused]] auto& mv2_features_8_conv_2_weight = constants_->at(69); + [[maybe_unused]] auto& mv2_features_8_conv_3_weight = constants_->at(70); + [[maybe_unused]] auto& mv2_features_8_conv_3_bias = constants_->at(71); + [[maybe_unused]] auto& mv2_features_9_conv_0_0_weight = constants_->at(72); + [[maybe_unused]] auto& 
mv2_features_9_conv_0_1_weight = constants_->at(73); + [[maybe_unused]] auto& mv2_features_9_conv_0_1_bias = constants_->at(74); + [[maybe_unused]] auto& mv2_features_9_conv_1_0_weight = constants_->at(75); + [[maybe_unused]] auto& mv2_features_9_conv_1_1_weight = constants_->at(76); + [[maybe_unused]] auto& mv2_features_9_conv_1_1_bias = constants_->at(77); + [[maybe_unused]] auto& mv2_features_9_conv_2_weight = constants_->at(78); + [[maybe_unused]] auto& mv2_features_9_conv_3_weight = constants_->at(79); + [[maybe_unused]] auto& mv2_features_9_conv_3_bias = constants_->at(80); + [[maybe_unused]] auto& mv2_features_10_conv_0_0_weight = constants_->at(81); + [[maybe_unused]] auto& mv2_features_10_conv_0_1_weight = constants_->at(82); + [[maybe_unused]] auto& mv2_features_10_conv_0_1_bias = constants_->at(83); + [[maybe_unused]] auto& mv2_features_10_conv_1_0_weight = constants_->at(84); + [[maybe_unused]] auto& mv2_features_10_conv_1_1_weight = constants_->at(85); + [[maybe_unused]] auto& mv2_features_10_conv_1_1_bias = constants_->at(86); + [[maybe_unused]] auto& mv2_features_10_conv_2_weight = constants_->at(87); + [[maybe_unused]] auto& mv2_features_10_conv_3_weight = constants_->at(88); + [[maybe_unused]] auto& mv2_features_10_conv_3_bias = constants_->at(89); + [[maybe_unused]] auto& mv2_features_11_conv_0_0_weight = constants_->at(90); + [[maybe_unused]] auto& mv2_features_11_conv_0_1_weight = constants_->at(91); + [[maybe_unused]] auto& mv2_features_11_conv_0_1_bias = constants_->at(92); + [[maybe_unused]] auto& mv2_features_11_conv_1_0_weight = constants_->at(93); + [[maybe_unused]] auto& mv2_features_11_conv_1_1_weight = constants_->at(94); + [[maybe_unused]] auto& mv2_features_11_conv_1_1_bias = constants_->at(95); + [[maybe_unused]] auto& mv2_features_11_conv_2_weight = constants_->at(96); + [[maybe_unused]] auto& mv2_features_11_conv_3_weight = constants_->at(97); + [[maybe_unused]] auto& mv2_features_11_conv_3_bias = constants_->at(98); + [[maybe_unused]] auto& mv2_features_12_conv_0_0_weight = constants_->at(99); + [[maybe_unused]] auto& mv2_features_12_conv_0_1_weight = constants_->at(100); + [[maybe_unused]] auto& mv2_features_12_conv_0_1_bias = constants_->at(101); + [[maybe_unused]] auto& mv2_features_12_conv_1_0_weight = constants_->at(102); + [[maybe_unused]] auto& mv2_features_12_conv_1_1_weight = constants_->at(103); + [[maybe_unused]] auto& mv2_features_12_conv_1_1_bias = constants_->at(104); + [[maybe_unused]] auto& mv2_features_12_conv_2_weight = constants_->at(105); + [[maybe_unused]] auto& mv2_features_12_conv_3_weight = constants_->at(106); + [[maybe_unused]] auto& mv2_features_12_conv_3_bias = constants_->at(107); + [[maybe_unused]] auto& mv2_features_13_conv_0_0_weight = constants_->at(108); + [[maybe_unused]] auto& mv2_features_13_conv_0_1_weight = constants_->at(109); + [[maybe_unused]] auto& mv2_features_13_conv_0_1_bias = constants_->at(110); + [[maybe_unused]] auto& mv2_features_13_conv_1_0_weight = constants_->at(111); + [[maybe_unused]] auto& mv2_features_13_conv_1_1_weight = constants_->at(112); + [[maybe_unused]] auto& mv2_features_13_conv_1_1_bias = constants_->at(113); + [[maybe_unused]] auto& mv2_features_13_conv_2_weight = constants_->at(114); + [[maybe_unused]] auto& mv2_features_13_conv_3_weight = constants_->at(115); + [[maybe_unused]] auto& mv2_features_13_conv_3_bias = constants_->at(116); + [[maybe_unused]] auto& mv2_features_14_conv_0_0_weight = constants_->at(117); + [[maybe_unused]] auto& mv2_features_14_conv_0_1_weight = 
constants_->at(118); + [[maybe_unused]] auto& mv2_features_14_conv_0_1_bias = constants_->at(119); + [[maybe_unused]] auto& mv2_features_14_conv_1_0_weight = constants_->at(120); + [[maybe_unused]] auto& mv2_features_14_conv_1_1_weight = constants_->at(121); + [[maybe_unused]] auto& mv2_features_14_conv_1_1_bias = constants_->at(122); + [[maybe_unused]] auto& mv2_features_14_conv_2_weight = constants_->at(123); + [[maybe_unused]] auto& mv2_features_14_conv_3_weight = constants_->at(124); + [[maybe_unused]] auto& mv2_features_14_conv_3_bias = constants_->at(125); + [[maybe_unused]] auto& mv2_features_15_conv_0_0_weight = constants_->at(126); + [[maybe_unused]] auto& mv2_features_15_conv_0_1_weight = constants_->at(127); + [[maybe_unused]] auto& mv2_features_15_conv_0_1_bias = constants_->at(128); + [[maybe_unused]] auto& mv2_features_15_conv_1_0_weight = constants_->at(129); + [[maybe_unused]] auto& mv2_features_15_conv_1_1_weight = constants_->at(130); + [[maybe_unused]] auto& mv2_features_15_conv_1_1_bias = constants_->at(131); + [[maybe_unused]] auto& mv2_features_15_conv_2_weight = constants_->at(132); + [[maybe_unused]] auto& mv2_features_15_conv_3_weight = constants_->at(133); + [[maybe_unused]] auto& mv2_features_15_conv_3_bias = constants_->at(134); + [[maybe_unused]] auto& mv2_features_16_conv_0_0_weight = constants_->at(135); + [[maybe_unused]] auto& mv2_features_16_conv_0_1_weight = constants_->at(136); + [[maybe_unused]] auto& mv2_features_16_conv_0_1_bias = constants_->at(137); + [[maybe_unused]] auto& mv2_features_16_conv_1_0_weight = constants_->at(138); + [[maybe_unused]] auto& mv2_features_16_conv_1_1_weight = constants_->at(139); + [[maybe_unused]] auto& mv2_features_16_conv_1_1_bias = constants_->at(140); + [[maybe_unused]] auto& mv2_features_16_conv_2_weight = constants_->at(141); + [[maybe_unused]] auto& mv2_features_16_conv_3_weight = constants_->at(142); + [[maybe_unused]] auto& mv2_features_16_conv_3_bias = constants_->at(143); + [[maybe_unused]] auto& mv2_features_17_conv_0_0_weight = constants_->at(144); + [[maybe_unused]] auto& mv2_features_17_conv_0_1_weight = constants_->at(145); + [[maybe_unused]] auto& mv2_features_17_conv_0_1_bias = constants_->at(146); + [[maybe_unused]] auto& mv2_features_17_conv_1_0_weight = constants_->at(147); + [[maybe_unused]] auto& mv2_features_17_conv_1_1_weight = constants_->at(148); + [[maybe_unused]] auto& mv2_features_17_conv_1_1_bias = constants_->at(149); + [[maybe_unused]] auto& mv2_features_17_conv_2_weight = constants_->at(150); + [[maybe_unused]] auto& mv2_features_17_conv_3_weight = constants_->at(151); + [[maybe_unused]] auto& mv2_features_17_conv_3_bias = constants_->at(152); + [[maybe_unused]] auto& mv2_features_18_0_weight = constants_->at(153); + [[maybe_unused]] auto& mv2_features_18_1_weight = constants_->at(154); + [[maybe_unused]] auto& mv2_features_18_1_bias = constants_->at(155); + [[maybe_unused]] auto& mv2_classifier_1_weight = constants_->at(156); + [[maybe_unused]] auto& mv2_classifier_1_bias = constants_->at(157); + [[maybe_unused]] auto& mv2_features_0_1_running_mean = constants_->at(158); + [[maybe_unused]] auto& mv2_features_0_1_running_var = constants_->at(159); + [[maybe_unused]] auto& mv2_features_1_conv_0_1_running_mean = constants_->at(160); + [[maybe_unused]] auto& mv2_features_1_conv_0_1_running_var = constants_->at(161); + [[maybe_unused]] auto& mv2_features_1_conv_2_running_mean = constants_->at(162); + [[maybe_unused]] auto& mv2_features_1_conv_2_running_var = constants_->at(163); + 
[[maybe_unused]] auto& mv2_features_2_conv_0_1_running_mean = constants_->at(164); + [[maybe_unused]] auto& mv2_features_2_conv_0_1_running_var = constants_->at(165); + [[maybe_unused]] auto& mv2_features_2_conv_1_1_running_mean = constants_->at(166); + [[maybe_unused]] auto& mv2_features_2_conv_1_1_running_var = constants_->at(167); + [[maybe_unused]] auto& mv2_features_2_conv_3_running_mean = constants_->at(168); + [[maybe_unused]] auto& mv2_features_2_conv_3_running_var = constants_->at(169); + [[maybe_unused]] auto& mv2_features_3_conv_0_1_running_mean = constants_->at(170); + [[maybe_unused]] auto& mv2_features_3_conv_0_1_running_var = constants_->at(171); + [[maybe_unused]] auto& mv2_features_3_conv_1_1_running_mean = constants_->at(172); + [[maybe_unused]] auto& mv2_features_3_conv_1_1_running_var = constants_->at(173); + [[maybe_unused]] auto& mv2_features_3_conv_3_running_mean = constants_->at(174); + [[maybe_unused]] auto& mv2_features_3_conv_3_running_var = constants_->at(175); + [[maybe_unused]] auto& mv2_features_4_conv_0_1_running_mean = constants_->at(176); + [[maybe_unused]] auto& mv2_features_4_conv_0_1_running_var = constants_->at(177); + [[maybe_unused]] auto& mv2_features_4_conv_1_1_running_mean = constants_->at(178); + [[maybe_unused]] auto& mv2_features_4_conv_1_1_running_var = constants_->at(179); + [[maybe_unused]] auto& mv2_features_4_conv_3_running_mean = constants_->at(180); + [[maybe_unused]] auto& mv2_features_4_conv_3_running_var = constants_->at(181); + [[maybe_unused]] auto& mv2_features_5_conv_0_1_running_mean = constants_->at(182); + [[maybe_unused]] auto& mv2_features_5_conv_0_1_running_var = constants_->at(183); + [[maybe_unused]] auto& mv2_features_5_conv_1_1_running_mean = constants_->at(184); + [[maybe_unused]] auto& mv2_features_5_conv_1_1_running_var = constants_->at(185); + [[maybe_unused]] auto& mv2_features_5_conv_3_running_mean = constants_->at(186); + [[maybe_unused]] auto& mv2_features_5_conv_3_running_var = constants_->at(187); + [[maybe_unused]] auto& mv2_features_6_conv_0_1_running_mean = constants_->at(188); + [[maybe_unused]] auto& mv2_features_6_conv_0_1_running_var = constants_->at(189); + [[maybe_unused]] auto& mv2_features_6_conv_1_1_running_mean = constants_->at(190); + [[maybe_unused]] auto& mv2_features_6_conv_1_1_running_var = constants_->at(191); + [[maybe_unused]] auto& mv2_features_6_conv_3_running_mean = constants_->at(192); + [[maybe_unused]] auto& mv2_features_6_conv_3_running_var = constants_->at(193); + [[maybe_unused]] auto& mv2_features_7_conv_0_1_running_mean = constants_->at(194); + [[maybe_unused]] auto& mv2_features_7_conv_0_1_running_var = constants_->at(195); + [[maybe_unused]] auto& mv2_features_7_conv_1_1_running_mean = constants_->at(196); + [[maybe_unused]] auto& mv2_features_7_conv_1_1_running_var = constants_->at(197); + [[maybe_unused]] auto& mv2_features_7_conv_3_running_mean = constants_->at(198); + [[maybe_unused]] auto& mv2_features_7_conv_3_running_var = constants_->at(199); + [[maybe_unused]] auto& mv2_features_8_conv_0_1_running_mean = constants_->at(200); + [[maybe_unused]] auto& mv2_features_8_conv_0_1_running_var = constants_->at(201); + [[maybe_unused]] auto& mv2_features_8_conv_1_1_running_mean = constants_->at(202); + [[maybe_unused]] auto& mv2_features_8_conv_1_1_running_var = constants_->at(203); + [[maybe_unused]] auto& mv2_features_8_conv_3_running_mean = constants_->at(204); + [[maybe_unused]] auto& mv2_features_8_conv_3_running_var = constants_->at(205); + [[maybe_unused]] auto& 
mv2_features_9_conv_0_1_running_mean = constants_->at(206); + [[maybe_unused]] auto& mv2_features_9_conv_0_1_running_var = constants_->at(207); + [[maybe_unused]] auto& mv2_features_9_conv_1_1_running_mean = constants_->at(208); + [[maybe_unused]] auto& mv2_features_9_conv_1_1_running_var = constants_->at(209); + [[maybe_unused]] auto& mv2_features_9_conv_3_running_mean = constants_->at(210); + [[maybe_unused]] auto& mv2_features_9_conv_3_running_var = constants_->at(211); + [[maybe_unused]] auto& mv2_features_10_conv_0_1_running_mean = constants_->at(212); + [[maybe_unused]] auto& mv2_features_10_conv_0_1_running_var = constants_->at(213); + [[maybe_unused]] auto& mv2_features_10_conv_1_1_running_mean = constants_->at(214); + [[maybe_unused]] auto& mv2_features_10_conv_1_1_running_var = constants_->at(215); + [[maybe_unused]] auto& mv2_features_10_conv_3_running_mean = constants_->at(216); + [[maybe_unused]] auto& mv2_features_10_conv_3_running_var = constants_->at(217); + [[maybe_unused]] auto& mv2_features_11_conv_0_1_running_mean = constants_->at(218); + [[maybe_unused]] auto& mv2_features_11_conv_0_1_running_var = constants_->at(219); + [[maybe_unused]] auto& mv2_features_11_conv_1_1_running_mean = constants_->at(220); + [[maybe_unused]] auto& mv2_features_11_conv_1_1_running_var = constants_->at(221); + [[maybe_unused]] auto& mv2_features_11_conv_3_running_mean = constants_->at(222); + [[maybe_unused]] auto& mv2_features_11_conv_3_running_var = constants_->at(223); + [[maybe_unused]] auto& mv2_features_12_conv_0_1_running_mean = constants_->at(224); + [[maybe_unused]] auto& mv2_features_12_conv_0_1_running_var = constants_->at(225); + [[maybe_unused]] auto& mv2_features_12_conv_1_1_running_mean = constants_->at(226); + [[maybe_unused]] auto& mv2_features_12_conv_1_1_running_var = constants_->at(227); + [[maybe_unused]] auto& mv2_features_12_conv_3_running_mean = constants_->at(228); + [[maybe_unused]] auto& mv2_features_12_conv_3_running_var = constants_->at(229); + [[maybe_unused]] auto& mv2_features_13_conv_0_1_running_mean = constants_->at(230); + [[maybe_unused]] auto& mv2_features_13_conv_0_1_running_var = constants_->at(231); + [[maybe_unused]] auto& mv2_features_13_conv_1_1_running_mean = constants_->at(232); + [[maybe_unused]] auto& mv2_features_13_conv_1_1_running_var = constants_->at(233); + [[maybe_unused]] auto& mv2_features_13_conv_3_running_mean = constants_->at(234); + [[maybe_unused]] auto& mv2_features_13_conv_3_running_var = constants_->at(235); + [[maybe_unused]] auto& mv2_features_14_conv_0_1_running_mean = constants_->at(236); + [[maybe_unused]] auto& mv2_features_14_conv_0_1_running_var = constants_->at(237); + [[maybe_unused]] auto& mv2_features_14_conv_1_1_running_mean = constants_->at(238); + [[maybe_unused]] auto& mv2_features_14_conv_1_1_running_var = constants_->at(239); + [[maybe_unused]] auto& mv2_features_14_conv_3_running_mean = constants_->at(240); + [[maybe_unused]] auto& mv2_features_14_conv_3_running_var = constants_->at(241); + [[maybe_unused]] auto& mv2_features_15_conv_0_1_running_mean = constants_->at(242); + [[maybe_unused]] auto& mv2_features_15_conv_0_1_running_var = constants_->at(243); + [[maybe_unused]] auto& mv2_features_15_conv_1_1_running_mean = constants_->at(244); + [[maybe_unused]] auto& mv2_features_15_conv_1_1_running_var = constants_->at(245); + [[maybe_unused]] auto& mv2_features_15_conv_3_running_mean = constants_->at(246); + [[maybe_unused]] auto& mv2_features_15_conv_3_running_var = constants_->at(247); + [[maybe_unused]] 
auto& mv2_features_16_conv_0_1_running_mean = constants_->at(248);
+ [[maybe_unused]] auto& mv2_features_16_conv_0_1_running_var = constants_->at(249);
+ [[maybe_unused]] auto& mv2_features_16_conv_1_1_running_mean = constants_->at(250);
+ [[maybe_unused]] auto& mv2_features_16_conv_1_1_running_var = constants_->at(251);
+ [[maybe_unused]] auto& mv2_features_16_conv_3_running_mean = constants_->at(252);
+ [[maybe_unused]] auto& mv2_features_16_conv_3_running_var = constants_->at(253);
+ [[maybe_unused]] auto& mv2_features_17_conv_0_1_running_mean = constants_->at(254);
+ [[maybe_unused]] auto& mv2_features_17_conv_0_1_running_var = constants_->at(255);
+ [[maybe_unused]] auto& mv2_features_17_conv_1_1_running_mean = constants_->at(256);
+ [[maybe_unused]] auto& mv2_features_17_conv_1_1_running_var = constants_->at(257);
+ [[maybe_unused]] auto& mv2_features_17_conv_3_running_mean = constants_->at(258);
+ [[maybe_unused]] auto& mv2_features_17_conv_3_running_var = constants_->at(259);
+ [[maybe_unused]] auto& mv2_features_18_1_running_mean = constants_->at(260);
+ [[maybe_unused]] auto& mv2_features_18_1_running_var = constants_->at(261);
+
+ if ((long(arg262_1.data_ptr()) & (16 - 1)) != 0) {
+ AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit.");
+ AtenTensorHandle arg262_1_aligned;
+ aoti_torch_clone_preserve_strides(arg262_1, &arg262_1_aligned);
+ arg262_1 = std::move(RAIIAtenTensorHandle(arg262_1_aligned));
+ }
+ inputs.clear();
+ [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
+
+ AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
+ static constexpr int64_t int_array_0[] = {1L, 3L, 224L, 224L};
+ static constexpr int64_t int_array_1[] = {150528L, 1L, 672L, 3L};
+ AtenTensorHandle buf0_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
+ RAIIAtenTensorHandle buf0(buf0_handle);
+ // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+ call_triton_poi_fused_convolution_0(arg262_1, buf0, 3L, 50176L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ arg262_1.reset();
+ static constexpr int64_t int_array_2[] = {32L, 3L, 3L, 3L};
+ static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L};
+ AtenTensorHandle buf1_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle));
+ RAIIAtenTensorHandle buf1(buf1_handle);
+ // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+ call_triton_poi_fused_convolution_1(mv2_features_0_0_weight, buf1, 96L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+ AtenTensorHandle buf2_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array<int64_t, 2>{2L, 2L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf2_handle));
+ RAIIAtenTensorHandle buf2(buf2_handle);
+ buf0.reset();
+ buf1.reset();
+ auto buf3 = std::move(buf2); // reuse
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default, aten_hardtanh_default], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
+ call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(buf3, mv2_features_0_1_running_mean, mv2_features_0_1_running_var, mv2_features_0_1_weight, mv2_features_0_1_bias, 401408L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default, aten_hardtanh_default, aten_convolution_default_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
+ AtenTensorHandle buf4_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf3, mv2_features_1_conv_0_0_weight, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 32L, &buf4_handle));
+ RAIIAtenTensorHandle buf4(buf4_handle);
+ buf3.reset();
+ auto buf5 = std::move(buf4); // reuse
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_1, aten_hardtanh_default_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
+ call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(buf5, mv2_features_1_conv_0_1_running_mean, mv2_features_1_conv_0_1_running_var, mv2_features_1_conv_0_1_weight, mv2_features_1_conv_0_1_bias, 401408L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_1, aten_hardtanh_default_1, aten_convolution_default_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
+ AtenTensorHandle buf6_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf5, mv2_features_1_conv_1_weight, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf6_handle));
+ RAIIAtenTensorHandle buf6(buf6_handle);
+ buf5.reset();
+ auto buf7 = std::move(buf6); // reuse
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_2], Original ATen: [aten._native_batch_norm_legit_no_training]
+ call_triton_poi_fused__native_batch_norm_legit_no_training_3(buf7, mv2_features_1_conv_2_running_mean, mv2_features_1_conv_2_running_var, mv2_features_1_conv_2_weight, mv2_features_1_conv_2_bias, 200704L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_2, aten_convolution_default_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.convolution]
+ AtenTensorHandle buf8_handle;
+ AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf7, mv2_features_2_conv_0_0_weight, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf8_handle));
+ RAIIAtenTensorHandle buf8(buf8_handle);
+ buf7.reset();
+ auto buf9 = std::move(buf8); // reuse
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
+ call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(buf9, mv2_features_2_conv_0_1_running_mean, mv2_features_2_conv_0_1_running_var, mv2_features_2_conv_0_1_weight, mv2_features_2_conv_0_1_bias, 1204224L, this->device_idx_, stream, kernels, this->cubin_dir_);
+ // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2, aten_convolution_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2, aten_convolution_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf10_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf9, mv2_features_2_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 96L, &buf10_handle)); + RAIIAtenTensorHandle buf10(buf10_handle); + buf9.reset(); + auto buf11 = std::move(buf10); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_4, aten_hardtanh_default_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(buf11, mv2_features_2_conv_1_1_running_mean, mv2_features_2_conv_1_1_running_var, mv2_features_2_conv_1_1_weight, mv2_features_2_conv_1_1_bias, 301056L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_4, aten_hardtanh_default_3, aten_convolution_default_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf12_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf11, mv2_features_2_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf12_handle)); + RAIIAtenTensorHandle buf12(buf12_handle); + buf11.reset(); + auto buf13 = std::move(buf12); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_5], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_6(buf13, mv2_features_2_conv_3_running_mean, mv2_features_2_conv_3_running_var, mv2_features_2_conv_3_weight, mv2_features_2_conv_3_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_6], Original ATen: [aten.convolution] + AtenTensorHandle buf14_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf13, mv2_features_3_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf14_handle)); + RAIIAtenTensorHandle buf14(buf14_handle); + auto buf15 = std::move(buf14); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_6, aten_hardtanh_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf15, mv2_features_3_conv_0_1_running_mean, mv2_features_3_conv_0_1_running_var, mv2_features_3_conv_0_1_weight, mv2_features_3_conv_0_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_6, aten_hardtanh_default_4, aten_convolution_default_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf16_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf15, mv2_features_3_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, 
std::array{0L, 0L}.cbegin(), 2, 144L, &buf16_handle)); + RAIIAtenTensorHandle buf16(buf16_handle); + buf15.reset(); + auto buf17 = std::move(buf16); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_7, aten_hardtanh_default_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf17, mv2_features_3_conv_1_1_running_mean, mv2_features_3_conv_1_1_running_var, mv2_features_3_conv_1_1_weight, mv2_features_3_conv_1_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_7, aten_hardtanh_default_5, aten_convolution_default_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf18_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf17, mv2_features_3_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf18_handle)); + RAIIAtenTensorHandle buf18(buf18_handle); + buf17.reset(); + auto buf19 = std::move(buf13); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_8, aten_add_tensor], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_8(buf19, buf18, mv2_features_3_conv_3_running_mean, mv2_features_3_conv_3_running_var, mv2_features_3_conv_3_weight, mv2_features_3_conv_3_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf18.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_8, aten_add_tensor, aten_convolution_default_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf20_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf19, mv2_features_4_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf20_handle)); + RAIIAtenTensorHandle buf20(buf20_handle); + buf19.reset(); + auto buf21 = std::move(buf20); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_9, aten_hardtanh_default_6], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf21, mv2_features_4_conv_0_1_running_mean, mv2_features_4_conv_0_1_running_var, mv2_features_4_conv_0_1_weight, mv2_features_4_conv_0_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_9, aten_hardtanh_default_6, aten_convolution_default_10], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf22_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf21, mv2_features_4_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 144L, &buf22_handle)); + RAIIAtenTensorHandle buf22(buf22_handle); + buf21.reset(); + auto buf23 = std::move(buf22); // reuse + // Topologically Sorted Source Nodes: 
[aten__native_batch_norm_legit_no_training_default_10, aten_hardtanh_default_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(buf23, mv2_features_4_conv_1_1_running_mean, mv2_features_4_conv_1_1_running_var, mv2_features_4_conv_1_1_weight, mv2_features_4_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_10, aten_hardtanh_default_7, aten_convolution_default_11], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf24_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf23, mv2_features_4_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf24_handle)); + RAIIAtenTensorHandle buf24(buf24_handle); + buf23.reset(); + auto buf25 = std::move(buf24); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_11], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_10(buf25, mv2_features_4_conv_3_running_mean, mv2_features_4_conv_3_running_var, mv2_features_4_conv_3_weight, mv2_features_4_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_12], Original ATen: [aten.convolution] + AtenTensorHandle buf26_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf25, mv2_features_5_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf26_handle)); + RAIIAtenTensorHandle buf26(buf26_handle); + auto buf27 = std::move(buf26); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_12, aten_hardtanh_default_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf27, mv2_features_5_conv_0_1_running_mean, mv2_features_5_conv_0_1_running_var, mv2_features_5_conv_0_1_weight, mv2_features_5_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_12, aten_hardtanh_default_8, aten_convolution_default_13], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf28_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf27, mv2_features_5_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf28_handle)); + RAIIAtenTensorHandle buf28(buf28_handle); + buf27.reset(); + auto buf29 = std::move(buf28); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_13, aten_hardtanh_default_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf29, mv2_features_5_conv_1_1_running_mean, mv2_features_5_conv_1_1_running_var, mv2_features_5_conv_1_1_weight, mv2_features_5_conv_1_1_bias, 150528L, this->device_idx_, stream, kernels, 
this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_13, aten_hardtanh_default_9, aten_convolution_default_14], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf30_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf29, mv2_features_5_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf30_handle)); + RAIIAtenTensorHandle buf30(buf30_handle); + buf29.reset(); + auto buf31 = std::move(buf25); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_14, aten_add_tensor_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(buf31, buf30, mv2_features_5_conv_3_running_mean, mv2_features_5_conv_3_running_var, mv2_features_5_conv_3_weight, mv2_features_5_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf30.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_15], Original ATen: [aten.convolution] + AtenTensorHandle buf32_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf31, mv2_features_6_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf32_handle)); + RAIIAtenTensorHandle buf32(buf32_handle); + auto buf33 = std::move(buf32); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_15, aten_hardtanh_default_10], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf33, mv2_features_6_conv_0_1_running_mean, mv2_features_6_conv_0_1_running_var, mv2_features_6_conv_0_1_weight, mv2_features_6_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_15, aten_hardtanh_default_10, aten_convolution_default_16], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf34_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf33, mv2_features_6_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf34_handle)); + RAIIAtenTensorHandle buf34(buf34_handle); + buf33.reset(); + auto buf35 = std::move(buf34); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_16, aten_hardtanh_default_11], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf35, mv2_features_6_conv_1_1_running_mean, mv2_features_6_conv_1_1_running_var, mv2_features_6_conv_1_1_weight, mv2_features_6_conv_1_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_16, aten_hardtanh_default_11, aten_convolution_default_17], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf36_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf35, 
mv2_features_6_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf36_handle)); + RAIIAtenTensorHandle buf36(buf36_handle); + buf35.reset(); + auto buf37 = std::move(buf31); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_17, aten_add_tensor_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(buf37, buf36, mv2_features_6_conv_3_running_mean, mv2_features_6_conv_3_running_var, mv2_features_6_conv_3_weight, mv2_features_6_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf36.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_17, aten_add_tensor_2, aten_convolution_default_18], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf38_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf37, mv2_features_7_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf38_handle)); + RAIIAtenTensorHandle buf38(buf38_handle); + buf37.reset(); + auto buf39 = std::move(buf38); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_18, aten_hardtanh_default_12], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf39, mv2_features_7_conv_0_1_running_mean, mv2_features_7_conv_0_1_running_var, mv2_features_7_conv_0_1_weight, mv2_features_7_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_18, aten_hardtanh_default_12, aten_convolution_default_19], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf40_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf39, mv2_features_7_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf40_handle)); + RAIIAtenTensorHandle buf40(buf40_handle); + buf39.reset(); + auto buf41 = std::move(buf40); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_19, aten_hardtanh_default_13], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(buf41, mv2_features_7_conv_1_1_running_mean, mv2_features_7_conv_1_1_running_var, mv2_features_7_conv_1_1_weight, mv2_features_7_conv_1_1_bias, 37632L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_19, aten_hardtanh_default_13, aten_convolution_default_20], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf42_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf41, mv2_features_7_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf42_handle)); + RAIIAtenTensorHandle 
buf42(buf42_handle); + buf41.reset(); + auto buf43 = std::move(buf42); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_20], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_14(buf43, mv2_features_7_conv_3_running_mean, mv2_features_7_conv_3_running_var, mv2_features_7_conv_3_weight, mv2_features_7_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_21], Original ATen: [aten.convolution] + AtenTensorHandle buf44_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf43, mv2_features_8_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf44_handle)); + RAIIAtenTensorHandle buf44(buf44_handle); + auto buf45 = std::move(buf44); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_21, aten_hardtanh_default_14], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf45, mv2_features_8_conv_0_1_running_mean, mv2_features_8_conv_0_1_running_var, mv2_features_8_conv_0_1_weight, mv2_features_8_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_21, aten_hardtanh_default_14, aten_convolution_default_22], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf46_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf45, mv2_features_8_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf46_handle)); + RAIIAtenTensorHandle buf46(buf46_handle); + buf45.reset(); + auto buf47 = std::move(buf46); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_22, aten_hardtanh_default_15], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf47, mv2_features_8_conv_1_1_running_mean, mv2_features_8_conv_1_1_running_var, mv2_features_8_conv_1_1_weight, mv2_features_8_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_22, aten_hardtanh_default_15, aten_convolution_default_23], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf48_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf47, mv2_features_8_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf48_handle)); + RAIIAtenTensorHandle buf48(buf48_handle); + buf47.reset(); + auto buf49 = std::move(buf43); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_23, aten_add_tensor_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf49, buf48, mv2_features_8_conv_3_running_mean, mv2_features_8_conv_3_running_var, 
mv2_features_8_conv_3_weight, mv2_features_8_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf48.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_24], Original ATen: [aten.convolution] + AtenTensorHandle buf50_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf49, mv2_features_9_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf50_handle)); + RAIIAtenTensorHandle buf50(buf50_handle); + auto buf51 = std::move(buf50); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_24, aten_hardtanh_default_16], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf51, mv2_features_9_conv_0_1_running_mean, mv2_features_9_conv_0_1_running_var, mv2_features_9_conv_0_1_weight, mv2_features_9_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_24, aten_hardtanh_default_16, aten_convolution_default_25], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf52_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf51, mv2_features_9_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf52_handle)); + RAIIAtenTensorHandle buf52(buf52_handle); + buf51.reset(); + auto buf53 = std::move(buf52); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_25, aten_hardtanh_default_17], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf53, mv2_features_9_conv_1_1_running_mean, mv2_features_9_conv_1_1_running_var, mv2_features_9_conv_1_1_weight, mv2_features_9_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_25, aten_hardtanh_default_17, aten_convolution_default_26], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf54_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf53, mv2_features_9_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf54_handle)); + RAIIAtenTensorHandle buf54(buf54_handle); + buf53.reset(); + auto buf55 = std::move(buf49); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_26, aten_add_tensor_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf55, buf54, mv2_features_9_conv_3_running_mean, mv2_features_9_conv_3_running_var, mv2_features_9_conv_3_weight, mv2_features_9_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf54.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_27], Original ATen: [aten.convolution] + AtenTensorHandle buf56_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf55, 
mv2_features_10_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf56_handle)); + RAIIAtenTensorHandle buf56(buf56_handle); + auto buf57 = std::move(buf56); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_27, aten_hardtanh_default_18], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf57, mv2_features_10_conv_0_1_running_mean, mv2_features_10_conv_0_1_running_var, mv2_features_10_conv_0_1_weight, mv2_features_10_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_27, aten_hardtanh_default_18, aten_convolution_default_28], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf58_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf57, mv2_features_10_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf58_handle)); + RAIIAtenTensorHandle buf58(buf58_handle); + buf57.reset(); + auto buf59 = std::move(buf58); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_28, aten_hardtanh_default_19], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf59, mv2_features_10_conv_1_1_running_mean, mv2_features_10_conv_1_1_running_var, mv2_features_10_conv_1_1_weight, mv2_features_10_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_28, aten_hardtanh_default_19, aten_convolution_default_29], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf60_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf59, mv2_features_10_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf60_handle)); + RAIIAtenTensorHandle buf60(buf60_handle); + buf59.reset(); + auto buf61 = std::move(buf55); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_29, aten_add_tensor_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf61, buf60, mv2_features_10_conv_3_running_mean, mv2_features_10_conv_3_running_var, mv2_features_10_conv_3_weight, mv2_features_10_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf60.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_29, aten_add_tensor_5, aten_convolution_default_30], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf62_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf61, mv2_features_11_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf62_handle)); + RAIIAtenTensorHandle 
buf62(buf62_handle); + buf61.reset(); + auto buf63 = std::move(buf62); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_30, aten_hardtanh_default_20], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf63, mv2_features_11_conv_0_1_running_mean, mv2_features_11_conv_0_1_running_var, mv2_features_11_conv_0_1_weight, mv2_features_11_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_30, aten_hardtanh_default_20, aten_convolution_default_31], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf64_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf63, mv2_features_11_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf64_handle)); + RAIIAtenTensorHandle buf64(buf64_handle); + buf63.reset(); + auto buf65 = std::move(buf64); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_31, aten_hardtanh_default_21], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf65, mv2_features_11_conv_1_1_running_mean, mv2_features_11_conv_1_1_running_var, mv2_features_11_conv_1_1_weight, mv2_features_11_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_31, aten_hardtanh_default_21, aten_convolution_default_32], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf66_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf65, mv2_features_11_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf66_handle)); + RAIIAtenTensorHandle buf66(buf66_handle); + buf65.reset(); + auto buf67 = std::move(buf66); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_32], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_17(buf67, mv2_features_11_conv_3_running_mean, mv2_features_11_conv_3_running_var, mv2_features_11_conv_3_weight, mv2_features_11_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_33], Original ATen: [aten.convolution] + AtenTensorHandle buf68_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf67, mv2_features_12_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf68_handle)); + RAIIAtenTensorHandle buf68(buf68_handle); + auto buf69 = std::move(buf68); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_33, aten_hardtanh_default_22], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf69, mv2_features_12_conv_0_1_running_mean, 
mv2_features_12_conv_0_1_running_var, mv2_features_12_conv_0_1_weight, mv2_features_12_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_33, aten_hardtanh_default_22, aten_convolution_default_34], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf70_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf69, mv2_features_12_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf70_handle)); + RAIIAtenTensorHandle buf70(buf70_handle); + buf69.reset(); + auto buf71 = std::move(buf70); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_34, aten_hardtanh_default_23], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf71, mv2_features_12_conv_1_1_running_mean, mv2_features_12_conv_1_1_running_var, mv2_features_12_conv_1_1_weight, mv2_features_12_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_34, aten_hardtanh_default_23, aten_convolution_default_35], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf72_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf71, mv2_features_12_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf72_handle)); + RAIIAtenTensorHandle buf72(buf72_handle); + buf71.reset(); + auto buf73 = std::move(buf67); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_35, aten_add_tensor_6], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_19(buf73, buf72, mv2_features_12_conv_3_running_mean, mv2_features_12_conv_3_running_var, mv2_features_12_conv_3_weight, mv2_features_12_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf72.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_36], Original ATen: [aten.convolution] + AtenTensorHandle buf74_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf73, mv2_features_13_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf74_handle)); + RAIIAtenTensorHandle buf74(buf74_handle); + auto buf75 = std::move(buf74); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_36, aten_hardtanh_default_24], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf75, mv2_features_13_conv_0_1_running_mean, mv2_features_13_conv_0_1_running_var, mv2_features_13_conv_0_1_weight, mv2_features_13_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_36, aten_hardtanh_default_24, aten_convolution_default_37], Original ATen: 
[aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf76_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf75, mv2_features_13_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf76_handle)); + RAIIAtenTensorHandle buf76(buf76_handle); + buf75.reset(); + auto buf77 = std::move(buf76); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_37, aten_hardtanh_default_25], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf77, mv2_features_13_conv_1_1_running_mean, mv2_features_13_conv_1_1_running_var, mv2_features_13_conv_1_1_weight, mv2_features_13_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_37, aten_hardtanh_default_25, aten_convolution_default_38], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf78_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf77, mv2_features_13_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf78_handle)); + RAIIAtenTensorHandle buf78(buf78_handle); + buf77.reset(); + auto buf79 = std::move(buf73); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_38, aten_add_tensor_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_19(buf79, buf78, mv2_features_13_conv_3_running_mean, mv2_features_13_conv_3_running_var, mv2_features_13_conv_3_weight, mv2_features_13_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf78.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_38, aten_add_tensor_7, aten_convolution_default_39], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf80_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf79, mv2_features_14_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf80_handle)); + RAIIAtenTensorHandle buf80(buf80_handle); + buf79.reset(); + auto buf81 = std::move(buf80); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_39, aten_hardtanh_default_26], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf81, mv2_features_14_conv_0_1_running_mean, mv2_features_14_conv_0_1_running_var, mv2_features_14_conv_0_1_weight, mv2_features_14_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_39, aten_hardtanh_default_26, aten_convolution_default_40], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf82_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf81, mv2_features_14_conv_1_0_weight, 
nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf82_handle)); + RAIIAtenTensorHandle buf82(buf82_handle); + buf81.reset(); + auto buf83 = std::move(buf82); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_40, aten_hardtanh_default_27], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(buf83, mv2_features_14_conv_1_1_running_mean, mv2_features_14_conv_1_1_running_var, mv2_features_14_conv_1_1_weight, mv2_features_14_conv_1_1_bias, 28224L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_40, aten_hardtanh_default_27, aten_convolution_default_41], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf84_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf83, mv2_features_14_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf84_handle)); + RAIIAtenTensorHandle buf84(buf84_handle); + buf83.reset(); + auto buf85 = std::move(buf84); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_41], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_21(buf85, mv2_features_14_conv_3_running_mean, mv2_features_14_conv_3_running_var, mv2_features_14_conv_3_weight, mv2_features_14_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default_42], Original ATen: [aten.convolution] + AtenTensorHandle buf86_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf85, mv2_features_15_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf86_handle)); + RAIIAtenTensorHandle buf86(buf86_handle); + auto buf87 = std::move(buf86); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_42, aten_hardtanh_default_28], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf87, mv2_features_15_conv_0_1_running_mean, mv2_features_15_conv_0_1_running_var, mv2_features_15_conv_0_1_weight, mv2_features_15_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_42, aten_hardtanh_default_28, aten_convolution_default_43], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf88_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf87, mv2_features_15_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf88_handle)); + RAIIAtenTensorHandle buf88(buf88_handle); + buf87.reset(); + auto buf89 = std::move(buf88); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_43, aten_hardtanh_default_29], Original ATen: 
[aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf89, mv2_features_15_conv_1_1_running_mean, mv2_features_15_conv_1_1_running_var, mv2_features_15_conv_1_1_weight, mv2_features_15_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_43, aten_hardtanh_default_29, aten_convolution_default_44], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf90_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf89, mv2_features_15_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf90_handle)); + RAIIAtenTensorHandle buf90(buf90_handle); + buf89.reset(); + auto buf91 = std::move(buf85); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_44, aten_add_tensor_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(buf91, buf90, mv2_features_15_conv_3_running_mean, mv2_features_15_conv_3_running_var, mv2_features_15_conv_3_weight, mv2_features_15_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf90.reset(); + // Topologically Sorted Source Nodes: [aten_convolution_default_45], Original ATen: [aten.convolution] + AtenTensorHandle buf92_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf91, mv2_features_16_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf92_handle)); + RAIIAtenTensorHandle buf92(buf92_handle); + auto buf93 = std::move(buf92); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_45, aten_hardtanh_default_30], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf93, mv2_features_16_conv_0_1_running_mean, mv2_features_16_conv_0_1_running_var, mv2_features_16_conv_0_1_weight, mv2_features_16_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_45, aten_hardtanh_default_30, aten_convolution_default_46], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf94_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf93, mv2_features_16_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf94_handle)); + RAIIAtenTensorHandle buf94(buf94_handle); + buf93.reset(); + auto buf95 = std::move(buf94); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_46, aten_hardtanh_default_31], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf95, mv2_features_16_conv_1_1_running_mean, mv2_features_16_conv_1_1_running_var, mv2_features_16_conv_1_1_weight, mv2_features_16_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // 
Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_46, aten_hardtanh_default_31, aten_convolution_default_47], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf96_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf95, mv2_features_16_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf96_handle)); + RAIIAtenTensorHandle buf96(buf96_handle); + buf95.reset(); + auto buf97 = std::move(buf91); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_47, aten_add_tensor_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] + call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(buf97, buf96, mv2_features_16_conv_3_running_mean, mv2_features_16_conv_3_running_var, mv2_features_16_conv_3_weight, mv2_features_16_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf96.reset(); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_47, aten_add_tensor_9, aten_convolution_default_48], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] + AtenTensorHandle buf98_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf97, mv2_features_17_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf98_handle)); + RAIIAtenTensorHandle buf98(buf98_handle); + buf97.reset(); + auto buf99 = std::move(buf98); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_48, aten_hardtanh_default_32], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf99, mv2_features_17_conv_0_1_running_mean, mv2_features_17_conv_0_1_running_var, mv2_features_17_conv_0_1_weight, mv2_features_17_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_48, aten_hardtanh_default_32, aten_convolution_default_49], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] + AtenTensorHandle buf100_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf99, mv2_features_17_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf100_handle)); + RAIIAtenTensorHandle buf100(buf100_handle); + buf99.reset(); + auto buf101 = std::move(buf100); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] + call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf101, mv2_features_17_conv_1_1_running_mean, mv2_features_17_conv_1_1_running_var, mv2_features_17_conv_1_1_weight, mv2_features_17_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33, aten_convolution_default_50], Original ATen: [aten._native_batch_norm_legit_no_training, 
aten.hardtanh, aten.convolution] + AtenTensorHandle buf102_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf101, mv2_features_17_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf102_handle)); + RAIIAtenTensorHandle buf102(buf102_handle); + buf101.reset(); + auto buf103 = std::move(buf102); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_50], Original ATen: [aten._native_batch_norm_legit_no_training] + call_triton_poi_fused__native_batch_norm_legit_no_training_24(buf103, mv2_features_17_conv_3_running_mean, mv2_features_17_conv_3_running_var, mv2_features_17_conv_3_weight, mv2_features_17_conv_3_bias, 15680L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_50, aten_convolution_default_51], Original ATen: [aten._native_batch_norm_legit_no_training, aten.convolution] + AtenTensorHandle buf104_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf103, mv2_features_18_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf104_handle)); + RAIIAtenTensorHandle buf104(buf104_handle); + buf103.reset(); + static constexpr int64_t int_array_4[] = {1L, 1280L, 1L, 1L}; + static constexpr int64_t int_array_5[] = {1280L, 1L, 1280L, 1280L}; + AtenTensorHandle buf105_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf105_handle)); + RAIIAtenTensorHandle buf105(buf105_handle); + static constexpr int64_t int_array_6[] = {1280L, 1L, 1L, 1L}; + auto buf106 = wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(buf105, 4, int_array_4, int_array_6, 0L)); buf105.reset(); // reuse + // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_51, aten_hardtanh_default_34, aten_mean_dim], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.mean] + call_triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(buf106, buf104, mv2_features_18_1_running_mean, mv2_features_18_1_running_var, mv2_features_18_1_weight, mv2_features_18_1_bias, 1280L, 49L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf104.reset(); + static constexpr int64_t int_array_7[] = {1280L, 1000L}; + static constexpr int64_t int_array_8[] = {1L, 1280L}; + AtenTensorHandle buf107_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_7, int_array_8, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf107_handle)); + RAIIAtenTensorHandle buf107(buf107_handle); + // Topologically Sorted Source Nodes: [aten_permute_copy_default], Original ATen: [aten.permute_copy] + call_triton_poi_fused_permute_copy_26(mv2_classifier_1_weight, buf107, 1280000L, this->device_idx_, stream, kernels, this->cubin_dir_); + static constexpr int64_t int_array_9[] = {1L, 1000L}; + static constexpr int64_t int_array_10[] = {1000L, 1L}; + AtenTensorHandle buf108_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_9, int_array_10, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf108_handle)); + RAIIAtenTensorHandle buf108(buf108_handle); + // Topologically Sorted Source 
Nodes: [aten__native_batch_norm_legit_no_training_default_51, aten_hardtanh_default_34, aten_mean_dim, aten_view_copy_default, aten_permute_copy_default, aten_addmm_default], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.mean, aten.view_copy, aten.permute_copy, aten.addmm] + static constexpr int64_t int_array_11[] = {0L, 1L}; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_addmm_out(buf108, mv2_classifier_1_bias, wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(buf106, 2, int_array_8, int_array_11, 0L)), buf107, 1L, 1L)); + buf106.reset(); + buf107.reset(); + output_handles[0] = buf108.release(); +} // AOTInductorModel::run_impl +} // namespace torch::aot_inductor + + + + +// Compile cmd +// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o +// Link cmd +// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7/clbguuj2vb7nlf7qm72hrkynyiorwc3udkaj656f3v5xcdaoib67.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json new file mode 100644 index 00000000000..bd5d2c60334 --- /dev/null +++ 
b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin b/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..5098c505ebb138fa361a0971a8c7d89086e12f3b
GIT binary patch
literal 11320
[11320-byte base85-encoded binary payload omitted -- compiled Triton kernel cubin]
diff --git a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
new file mode 100644
index 00000000000..90c865f5f5e
--- /dev/null
+++ b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
@@ -0,0 +1,965 @@
+
+#include <torch/csrc/inductor/aoti_runtime/interface.h>
+// Definition of AOTI runtime interface functions
+
+#include <torch/csrc/inductor/aoti_runtime/model_container.h>
+#include <torch/csrc/inductor/aoti_runtime/scalar_to_tensor.h>
+
+#include <iostream>
+#include <vector>
+
+#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)      \
+  try {                                           \
+    __VA_ARGS__                                   \
+  } catch (const std::exception& e) {             \
+    std::cerr << "Error: " << e.what() << '\n';   \
+    return AOTI_RUNTIME_FAILURE;                  \
+  } catch (...) {                                 \
+    std::cerr << "Unknown exception occurred.\n"; \
+    return AOTI_RUNTIME_FAILURE;                  \
+  }                                               \
+  return AOTI_RUNTIME_SUCCESS;
+
+#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name)  \
+  do {                                                            \
+    AOTI_RUNTIME_CHECK(                                           \
+        actual_size == expected_size,                             \
+        "expected " + std::string(name) + " vector size to be " + \
+            std::to_string(expected_size) + ", but got " +        \
+            std::to_string(actual_size));                         \
+  } while (0)
+
+// AOTInductor uses at::addmm_out, which doesn't support
+// arguments that require gradient. For this reason, we
+// enforce no_grad context for run APIs.
+//
+// A RAII, thread local (!) guard that enables or disables grad mode upon
+// construction, and sets it back to the original value upon destruction.
+struct AOTINoGradGuard {
+  AOTINoGradGuard() {
+    aoti_torch_grad_mode_set_enabled(false);
+  }
+  AOTINoGradGuard(const AOTINoGradGuard&) = delete;
+  AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete;
+  ~AOTINoGradGuard() {
+    aoti_torch_grad_mode_set_enabled(prev_mode);
+  }
+  AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete;
+  AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete;
+  bool prev_mode{aoti_torch_grad_mode_is_enabled()};
+};
+
+extern "C" {
+
+AOTIRuntimeError AOTInductorModelContainerCreate(
+    AOTInductorModelContainerHandle* container_handle,
+    size_t num_models,
+    bool is_cpu,
+    const char* cubin_dir) {
+  return AOTInductorModelContainerCreateWithDevice(
+      container_handle,
+      num_models,
+      is_cpu ? "cpu" : "cuda",
"cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0\n"; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_single_threaded( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** 
original_fqn) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *original_fqn = container->constant_original_fqn(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( + AOTInductorModelContainerHandle container_handle, + size_t idx, + bool* from_folded) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantType( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* type) { + auto* container = + reinterpret_cast(container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( + AOTInductorModelContainerHandle container_handle, + size_t idx, + int32_t* dtype) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *dtype = container->constant_dtype(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( + AOTInductorModelContainerHandle container_handle, + size_t idx, + size_t* data_size) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *data_size = container->constant_data_size(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive) { + auto* container = + reinterpret_cast( + container_handle); + auto constants_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { const auto ret = container->extract_constants_map(use_inactive); + for (const auto& pair: ret) { + constants_map->emplace(pair.first, pair.second); + } + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update, /* user_managed = */ true); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle, + bool use_inactive, + bool validate_full_update) { + auto* container = + reinterpret_cast( + container_handle); + auto input_map = reinterpret_cast*>(constant_map_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->update_constant_buffer( + *input_map, use_inactive, validate_full_update); + }) +} + +AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle, + AOTInductorConstantMapHandle constant_map_handle) { + return AOTInductorModelContainerUpdateConstantBuffer(container_handle, + constant_map_handle, + /*use_inactive*/ true, + /*validate_full_update*/ true); +} + +AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->free_inactive_constant_buffer(); + }) +} + +AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( + AOTInductorModelContainerHandle container_handle, + bool use_inactive, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_const_fold(use_inactive, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( + AOTInductorModelContainerHandle container_handle) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + container->swap_constant_buffer(); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumInputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_inputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_inputs = container->num_inputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetInputName( + AOTInductorModelContainerHandle container_handle, + size_t input_idx, + const char** ret_input_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_input_names = container->input_name(input_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( + AOTInductorModelContainerHandle container_handle, + size_t* ret_num_outputs) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_num_outputs = container->num_outputs(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetOutputName( + AOTInductorModelContainerHandle container_handle, + size_t output_idx, + const char** ret_output_names) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *ret_output_names = container->output_name(output_idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetCallSpec( + AOTInductorModelContainerHandle container_handle, + const char** in_spec, + const char** out_spec) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + *in_spec = container->get_in_spec(); + *out_spec = container->get_out_spec(); + }) +} + +AOTIRuntimeError AOTInductorModelCreate( + AOTInductorModelHandle* model_handle, + AOTInductorConstantMapHandle constant_map_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto constant_array = std::make_shared>(); + auto input_map = reinterpret_cast*>(constant_map_handle); + + auto model = new torch::aot_inductor::AOTInductorModel( + constant_map, + constant_array, + "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models + "" + ); + + if (input_map) { + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + } else { + model->load_constants(); + } + + *model_handle = reinterpret_cast(model); + })} + +AOTIRuntimeError AOTInductorModelRun( + AOTInductorModelHandle model_handle, + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + model->run_impl( + input_handles, + output_handles, + (torch::aot_inductor::DeviceStreamType) nullptr, + nullptr); + }) +} + +AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = 
reinterpret_cast( + model_handle); + delete model; + })} + +AOTIRuntimeError AOTInductorModelGetNumOutputs( + AOTInductorModelHandle model_handle, + size_t* ret_num_outputs) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto model = reinterpret_cast(model_handle); + *ret_num_outputs = model->num_outputs(); + }) +} + +AOTIRuntimeError AOTInductorModelUpdateConstantsMap( + AOTInductorModelHandle model_handle, + AOTInductorConstantMapHandle constant_map_handle) { + auto model = + reinterpret_cast(model_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto constant_map = std::make_shared(); + auto input_map = + reinterpret_cast*>( + constant_map_handle); + + for (auto const& kv : *input_map) { + constant_map->emplace(kv.first, kv.second); + } + model->update_constants_map(std::move(constant_map)); + }) +} + +} // extern "C" + + +#define CUDA_DRIVER_CHECK(EXPR) \ +do { \ + CUresult code = EXPR; \ + const char *msg; \ + CUresult code_get_error = cuGetErrorString(code, &msg); \ + if (code_get_error != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string("invalid error code!")); \ + } \ + if (code != CUDA_SUCCESS) { \ + throw std::runtime_error( \ + std::string("CUDA driver error: ") + \ + std::string(msg)); \ + } \ +} while (0); + +static inline CUfunction loadKernel( + std::string filePath, + const std::string &funcName, + uint32_t sharedMemBytes, + const std::optional &cubinDir = std::nullopt) { + if (cubinDir) { + std::filesystem::path p1{*cubinDir}; + std::filesystem::path p2{filePath}; + filePath = (p1 / p2.filename()).string(); + } + + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { + CUmodule mod; + CUfunction func; + CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); + CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); + if (sharedMemBytes > 0) { + CUDA_DRIVER_CHECK(cuFuncSetAttribute( + func, + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + sharedMemBytes + )) + } + return func; +} + +static inline void launchKernel( + CUfunction func, + uint32_t gridX, + uint32_t gridY, + uint32_t gridZ, + uint32_t numWarps, + uint32_t sharedMemBytes, + void* args[], + cudaStream_t stream) { + CUDA_DRIVER_CHECK(cuLaunchKernel( + func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr + )); +} +CACHE_TORCH_DTYPE(float32); +CACHE_TORCH_DEVICE(cuda); +CACHE_TORCH_LAYOUT(strided); +namespace torch::aot_inductor { +namespace { +class AOTInductorModelKernels : public AOTInductorModelKernelsBase { + public: + CUfunction triton_poi_fused_convolution_0{nullptr}; + CUfunction triton_poi_fused_convolution_1{nullptr}; + CUfunction triton_poi_fused_convolution_2{nullptr}; +}; +} // namespace + + + +AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, + std::shared_ptr> constants_array, + const std::string& device_str, + std::optional cubin_dir) + : AOTInductorModelBase(1, + 1, + 1, + device_str, + std::move(cubin_dir), + true) { + inputs_info_[0].name = "arg2_1"; + constants_info_[0].name = "conv_weight"; + constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); + constants_info_[0].offset = 0; + 
constants_info_[0].data_size = 540; + constants_info_[0].from_folded = false; + constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); + constants_info_[0].shape = {5, 3, 3, 3}; + constants_info_[0].stride = {27, 9, 3, 1}; + constants_info_[0].layout = static_cast(cached_torch_layout_strided); + constants_info_[0].original_fqn = "conv.weight"; + update_constants_map(std::move(constants_map)); + update_constants_array(std::move(constants_array)); + in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])"; + out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])"; + outputs_info_[0].name = "output0"; + this->kernels_ = std::make_unique(); +} + +std::unordered_map AOTInductorModel::const_run_impl( + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor, + bool initialization +) { + + if (!initialization) { + std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " + << "aot_inductor.use_runtime_constant_folding=False\n"; + } + return {}; +} +} // namespace torch::aot_inductor +using namespace torch::aot_inductor; + +template +static inline void call_triton_poi_fused_convolution_0( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_0', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 12 + 
xnumel = 64 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = (yindex % 3) + y1 = yindex // 3 + tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last') + tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (64 - 1)) / (64)); + uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_0 == nullptr) { + kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_); + } + CUdeviceptr var_0 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_1 = reinterpret_cast(out_ptr0.data_ptr()); + int var_2 = ynumel; + int var_3 = xnumel; + CUdeviceptr global_scratch_4 = 0; + void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4}; + launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused_convolution_1( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_1', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 15 + xnumel = 9 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + 
ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y3 = yindex + y0 = (yindex % 3) + y1 = yindex // 3 + tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last') + tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (16 - 1)) / (16)); + uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_1 == nullptr) { + kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_); + } + CUdeviceptr var_5 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_6 = reinterpret_cast(out_ptr0.data_ptr()); + int var_7 = ynumel; + int var_8 = xnumel; + CUdeviceptr global_scratch_9 = 0; + void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9}; + launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_); +} + +template +static inline void call_triton_poi_fused_convolution_2( + const in_ptr0_type_& in_ptr0, + const out_ptr0_type_& out_ptr0, + int64_t ynumel, + int64_t xnumel, + int32_t device_idx_, + cudaStream_t stream_, + kernels_type_& kernels_, + const std::optional& cubin_dir_ = std::nullopt +){ + /* + async_compile.triton('triton_poi_fused_convolution_2', ''' + import triton + import triton.language as tl + + from torch._inductor.runtime import triton_helpers, triton_heuristics + from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math + from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties + triton_helpers.set_driver_to_gpu() + + @triton_heuristics.pointwise( + size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE, + filename=__file__, + triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, + inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}}, + min_elem_per_thread=0 + ) + @triton.jit + def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): + ynumel = 20 + xnumel = 64 + yoffset = tl.program_id(1) * YBLOCK + yindex = yoffset + tl.arange(0, YBLOCK)[:, None] + ymask = yindex < ynumel + xoffset = tl.program_id(0) * XBLOCK + xindex = 
xoffset + tl.arange(0, XBLOCK)[None, :] + xmask = xindex < xnumel + x2 = xindex + y0 = (yindex % 5) + y1 = yindex // 5 + y3 = yindex + tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last') + tmp1 = y0 + tmp2 = tl.full([1, 1], 2, tl.int64) + tmp3 = tmp1 < tmp2 + tmp4 = tl.full([1, 1], 1, tl.int64) + tmp5 = tmp1 < tmp4 + tmp6 = -0.0312186349183321 + tmp7 = -0.18273277580738068 + tmp8 = tl.where(tmp5, tmp6, tmp7) + tmp9 = tl.full([1, 1], 3, tl.int64) + tmp10 = tmp1 < tmp9 + tmp11 = tl.full([1, 1], 4, tl.int64) + tmp12 = tmp1 < tmp11 + tmp13 = -0.12337345629930496 + tmp14 = 0.12138354778289795 + tmp15 = tl.where(tmp12, tmp13, tmp14) + tmp16 = 0.05455135554075241 + tmp17 = tl.where(tmp10, tmp16, tmp15) + tmp18 = tl.where(tmp3, tmp8, tmp17) + tmp19 = tmp0 + tmp18 + tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask) + ''', device_str='cuda') + */ + uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); + uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); + uint32_t grid_2 = 1; + if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; + if (kernels_.triton_poi_fused_convolution_2 == nullptr) { + kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_); + } + CUdeviceptr var_10 = reinterpret_cast(in_ptr0.data_ptr()); + CUdeviceptr var_11 = reinterpret_cast(out_ptr0.data_ptr()); + int var_12 = ynumel; + int var_13 = xnumel; + CUdeviceptr global_scratch_14 = 0; + void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14}; + launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_); +} + +namespace torch::aot_inductor { + +void AOTInductorModel::_const_run_impl( + std::vector& output_handles, + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) {} + +AOTI_NOINLINE static void check_input_0( + AtenTensorHandle* input_handles +) { + ConstantHandle arg2_1 = ConstantHandle(input_handles[0]); + int32_t arg2_1_dtype; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype)); + + int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32(); + if (arg2_1_expected_dtype != arg2_1_dtype) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dtype, " + << "expected: " << arg2_1_expected_dtype << "(at::kFloat), " + << "but got: " << arg2_1_dtype << "\n"; + throw std::runtime_error(ss.str()); + } + auto arg2_1_size = arg2_1.sizes(); + + if (4 != arg2_1_size[0]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 0, " + << "expected: 4, " << "but got: " << arg2_1_size[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (3 != arg2_1_size[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 1, " + << "expected: 3, " << "but got: " << arg2_1_size[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_size[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 2, " + << "expected: 8, " << "but got: " << arg2_1_size[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_size[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched dim value at 3, " + << "expected: 8, " << "but got: " << arg2_1_size[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + auto arg2_1_stride = arg2_1.strides(); + + if (192 != arg2_1_stride[0]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 
0, " + << "expected: 192, " << "but got: " << arg2_1_stride[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (64 != arg2_1_stride[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 1, " + << "expected: 64, " << "but got: " << arg2_1_stride[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_stride[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 2, " + << "expected: 8, " << "but got: " << arg2_1_stride[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (1 != arg2_1_stride[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 3, " + << "expected: 1, " << "but got: " << arg2_1_stride[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + int32_t arg2_1_device_type; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); + + int32_t arg2_1_expected_device_type = 1; + if (arg2_1_expected_device_type != arg2_1_device_type) { + std::stringstream ss; + ss << "input_handles[0]: unmatched device type, " + << "expected: " << arg2_1_expected_device_type << "1(cuda), " + << "but got: " << arg2_1_device_type << "\n"; + throw std::runtime_error(ss.str()); + } +} + +static bool _check_aoti_runtime_check_inputs_env() { + const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); + const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; + return result; +} + +AOTI_NOINLINE static void __check_inputs_outputs( + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + if (!_check_aoti_runtime_check_inputs_env()){ + return; + } + check_input_0(input_handles); +} + +void AOTInductorModel::run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) { + __check_inputs_outputs(input_handles, output_handles); + + auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); + auto arg2_1 = std::move(inputs[0]); + [[maybe_unused]] auto& conv_weight = constants_->at(0); + + if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { + AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit."); + AtenTensorHandle arg2_1_aligned; + aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned); + arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned)); + } + inputs.clear(); + [[maybe_unused]] auto& kernels = static_cast(*this->kernels_.get()); + + AOTICudaStreamGuard stream_guard(stream, this->device_idx_); + static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L}; + static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L}; + AtenTensorHandle buf0_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle)); + RAIIAtenTensorHandle buf0(buf0_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); + arg2_1.reset(); + static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L}; + static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L}; + AtenTensorHandle buf1_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle)); + RAIIAtenTensorHandle buf1(buf1_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + AtenTensorHandle buf2_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle)); + RAIIAtenTensorHandle buf2(buf2_handle); + buf0.reset(); + buf1.reset(); + static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L}; + static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L}; + AtenTensorHandle buf3_handle; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle)); + RAIIAtenTensorHandle buf3(buf3_handle); + // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] + call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); + buf2.reset(); + output_handles[0] = buf3.release(); +} // AOTInductorModel::run_impl +} // namespace torch::aot_inductor + + + + +// Compile cmd +// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf/c2axxg3k6hizo5jukgeoinhgbqdavmur6jy4bqwkwu6iqb3x3hb2.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
new file mode 100644
index 00000000000..bd5d2c60334
--- /dev/null
+++ b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin b/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..000ca4c1209b77cdaec3c8757e532677b79ccc0f
GIT binary patch
(binary data omitted: 8968-byte compiled Triton kernel image, base85 literal elided)
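Note that the wrapper above never loads a cubin twice: each Triton kernel has a CUfunction slot in the AOTInductorModelKernels struct, initialized to nullptr and filled on first call (the "if (kernels_.triton_poi_fused_convolution_0 == nullptr)" test inside each call_triton_* helper). A hedged sketch of that memoization, reusing the hypothetical load_cubin_kernel() from the earlier sketch:

#include <cuda.h>

// One slot per generated kernel, mirroring AOTInductorModelKernels.
struct KernelCache {
  CUfunction fused_convolution_0{nullptr};
};

// Load on first use, then reuse the cached handle for every later run.
CUfunction get_fused_convolution_0(KernelCache& cache) {
  if (cache.fused_convolution_0 == nullptr) {
    cache.fused_convolution_0 = load_cubin_kernel(
        "triton_poi_fused_convolution_0.cubin",   // placeholder path
        "triton_poi_fused_convolution_0");
  }
  return cache.fused_convolution_0;
}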
diff --git a/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin b/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..88b88a29bf7f3c8af0026294261be1288801c901
GIT binary patch
(binary data omitted: 11320-byte compiled Triton kernel image, base85 literal elided)
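For reference, the launch arithmetic used by every call_triton_* helper in the wrapper: the 2D grid is the ceiling division of ynumel and xnumel by the autotuned block sizes, and the thread block is 32 * numWarps threads wide. A sketch using the constants of triton_poi_fused_convolution_0 above (XBLOCK=64, YBLOCK=16, 4 warps, 4352 bytes of dynamic shared memory); error handling is elided:

#include <cuda.h>
#include <cstdint>

// Ceil-division grid size, as computed inline in the generated wrapper.
inline uint32_t ceil_div(int64_t n, int64_t block) {
  return static_cast<uint32_t>((n + block - 1) / block);
}

void launch_pointwise_2d(CUfunction func, int64_t ynumel, int64_t xnumel,
                         void* args[], CUstream stream) {
  const uint32_t grid_x = ceil_div(xnumel, 64);  // XBLOCK
  const uint32_t grid_y = ceil_div(ynumel, 16);  // YBLOCK
  if (grid_x == 0 || grid_y == 0) return;        // empty launch, nothing to do
  const uint32_t num_warps = 4;
  cuLaunchKernel(func, grid_x, grid_y, 1,
                 32 * num_warps, 1, 1,            // blockDim.x = warps * 32
                 /*sharedMemBytes=*/4352, stream, args, /*extra=*/nullptr);
}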
znTh0-iYqINcCof3!LQk?jn&3R4Q3O!TCF#V)kflzqO6PLp6~ycipi~_XUXioS3BfY;=;Bdjg#wb|&YwAV`dmTN7ZQcy%5t&ZJ6@mFrd~oHQ=50uMLwm{O(92? z8tux|*8w@~gNDrbM|j4*Hs-_lhG?6eR}nBvx!f&mRp5OsfEj&N;Qd|jcLnZt^A>t< zkn#sx05eJgJ|1oPwP{G67W(ln`X;VP1Lc?(?7qzB=BjlmbYs;nEiG5eYbA^di^cltYW0QX z;`9ROOAEE?3-$SOsrZ@Z+ET>393Tw@;}7Sfj5{z#0qOv<8FGfOR352v`FgZ3KUy0yl2m_;Bl%brFCM zZ{GNs46OcOEPzogz`4GIzMq^VBs78hh2Yov0@gvwTIFpFxA$*+aO3uE$i|#nZdwZz zb|2^dQNn}0r8or5B54lbw)O8;fVy5LG=LlD7?y(Xg7%;uf>tz_$aN6p!5>(N?vrpj zs2KhBcmv>zhq87DtX<^%meTyuz%A>|mpT5iK>qe+M#r)V+09#>H%9QLHNoNU@ixNS ze%@Z-jmE=E+#aBji@e>hVHizv^c-*E&9_C*HPI=I%k=r51bSJNKL!3RovUfGJwtg@ z9PQ!lNg?{4FqCC{7T_CO)?>sz!P|EQd5%M^?wg|ShXVc^8Rvlfxu$+phBtqoB_q+hM&>U%pt0WqHJm<4{_>K)s)lQ^xj~zF#dH6Ii zWuKT$v_Mb~Q29i97BDq=wt$0wwN#j|EG}0QkFaD`$~sT+c|)7ZJsf-X<8$j~2366` zp1)i6B-m*YHW!+5esw**rd>$FgsD=axLl5J!0h_cO58Y?g!Cz!_!%JlEVDo$N^$47 zDpt!beJUaJK&F$&)%-%mRgp@1h=*et$Xc~Mo|K?OQz~)Xk4Y0_So3X|j9`FIW@&My zQH!rMjIvr%l%*h-@;Z^CE(O+JNUagliS#?1nt?_gBe?93Q7S980M8iJmu$~xhSg3Y z7(n=Ap@T{|^uWMYyi*LTumJ9hYg*z;YzlGkg($6E)xkq3T47C#pTr~FKCpO5a9cpr z5>SGZ(Cj0p+qS^lrHj*qx2G&EJ`22Et+r6Gsdh=?avFHL*3z62myP58B-f>AYe~=I zEHt@Ec9Gg_grFBv+m1l=LTU#fycbfuNJDrpq;>?tdm#0~V03yR5k>?V{JoHpM@EB4 zb$)FnzO?L?SVlNm>G_%b>})!dm9hoR%4%H4!p>*`phOwg*XBLAu62z?(nYk7P2x)h zX;}-t%t&xE8lNU(vFz99m4>E+R&0fyR*7#0G?eN`NNkD7bRZ)y<3KPT9<5C$Bi;@v^0`!@65LBdm80 zv=-#YhhhX>6sy^XqD&VBiGCFQcTu@^6y0-CXnKAwTJNGzM1B+lW>B(rd}go=Toj7H z&&AbKV}}5N*;G82xqw_87qZ^)P?acQc$RQbizT~KY*cEiE6b&gIR3U);$JU|EAg|b zM;@uHFP9pHm14ai#~IC)jDyRFLaWAhW+ctQf*1}k`dAu2n`{$6d8J}$sZzjOFXVys z5}9LHaKNEKso1etBK90ihH;(-Z7ia855}?$bQ+65i6j`C+Oe@I&)6BCv8*(IWRa13 zFcw^0#=1FzvFRNfoA!*&`;0}+wHcdq1G2L^AX}HYSP^b)Gdngn5JU2cZHE}VSg%$bv?d7{MlboylS zbmo-wdTMgd%KGiECT}*Ph+VZ4$^A%>n$xFnj^I^Ez84UNj z7A7laMa=_oehfE$!^3^w$HPt2V7Tr3Fp?WBjk~@NqwN;kxas>abk|7Zit@enWhKZ= zIE4A(B8ItUG%3p*>9Z06&DL>R&PoWf!bwz*1Crnl@WlY=G9b{=|{@YJqFFRSS8k1!7YjC7qXA(Amq=HJ>nD3?OJv zMP5>^8T0U__41n683o4-j}E@V%1b2>-(i>}l-SIyS?s1p@=^%KFtgfSl+=PIpaD4c znop+d8C|iLaJCie^&c{L{s1I<1*hF9J3o6Yl|VMPVHDv>9Y_gVGG+KO{L3-+rr8)0 zXb##&qdr7?2U8xdd&r4hfdEQKCbkoib+;}#pZD-V%djgA?+=d2M_PXjtyIhW`{Nb< ztfC@s69eW40>$}xyRwcyx^`Yyx=7_$7TtuMjGxxL+ko@M5^Wf`G;l?QrX%vz@j<8y zjp;EphsJ&5Tu6R%652m3MupS?rEVWq?%Yy)@I#a_Z!X`deDi>cw9$(Y@LNygo9F#a z6?!0|VgpLm@Gle{7G3f&5ro!6Ooav`Y6#J^InWea_I3_q`SN{HhaZ73Df?d&_4x#MuzC0;I%!2bC}k~k!a{2 zM|`jvJpQW_yCU+{eSbVn^%C*wWniGaULtMvGHDOD)hG0i%z?i)-Z1)ihmVSK z+};}vWu3j@6Ro*IyvADbnvV84BO2R8QgytHe>IUe<&5uc(T}RD%fqdB{ST~%DQC}K zzrHuH9yo8UH~dl#rphJjt*?*q3wiPD>uZl+s4ae1e`ge(5*PD*#KL;Jf5=~NQD-QO zHm92JHkp6KZ-2N3Do(N+J>I?Ckw8vWc!F7a~-F`)S%C9~D z9~R>^+1S^dz1BS8_S0coMY##oe&eRa^+hbI31tNZIb`a9xr*xnV~1$w!@|CIVdT}5TiA{P8`P@fVwe?4^A9u0cG zUb;Toci0{YnhqiR8~KOXfXV-pfXEw-nYqXt4|>0HLjDiiV?pm%TZj+&M-qPeJwfT$ z0KuOA!}jjrZjO032WhJ|C}{IqW;=9psP% z%qN^-{NBplUk@SweZ$PJT)@A&5us%Ko(lgyH=@SvKB+f%zngGET**qYzH~nNqL>;y z_gm>lwJ7!r_=7o5U#b?>{dLmm>zl(o?Cz(|aA?a{A5)I5Z|M*3Oru_Qx%SEV;qS36 zs#hR@@jyLII=iHPb{ToEO{fqZ2pz;h?fRHH_lKCswP~sM>EAHQ0TlFaegynD&tNF_ zpx)3=TKg;N|FsPz^`hlXt8?!8MQM4zzHvwKJlAagfu-IWRR=sgg#JI6!^_XE&m!=@ z76JAi_OH%s&HzTadH62rRlYe7@4fi{$nahH1b+7F4ib?bl~Xj{O^ia)utn)+s`y4K z*EsNu|1<|4ZBO|0bQqrC&Dm znDiRNM9Z5j~wWP&Y^3-elmvrJuDqGj|`a8vZ{s0P@Q@*(ET~x3&FU`Vahy zCWsljPW^5ke*!#hnlp6nBX9WKxvSshbMt>3`ah1}r6WH}s*~T@hW}}!-_OsI=;Z&q zZTOdZ@YBvt{uE#kfkPN~{noW<_%}@*(0&d#TO=}i8F$Kjq64nt20*3eV*c8-DD`!~ z$WQrgm&@M%fgK hW&1BT-tN<{Pg=q@aq(oBc>DPTv<;sdANT3<{TGw(1gaA!++RKMY`+)c~O_QccXo6|l`T|MzhlNT+b?ErpCEx0rSypR_f&zGwQQOwLaNkhG(}t!*i=%qi#BlsyR8+w9BSh zx4i1CZ5A!BR59xfciODkQ&kUB({rtAy;`3#E0$aKta`=F6nupnq^b$sc6no!xpvcb zXQ4ICy`8ExidGF-sCpbOHR?@7`d%(AMaOEuYM(U~ 
z{NR0e-)9PbGi_S6s)hbVO9YpTxy!`v($Oir^8_CEtXebwYYWIuI1tE>RQY?(e{Bzc z8`mz0r8!+1i#$eSa?8p2`C%4opPV0#@hM}3-`%4{BBZUzUIZg#BPs)r0~Ka}0%Iuj zx5Q@LL%^_5%A0mlUc)U_s`c_r2{UubYBn16Z&t08cZj7bjcGfzf}(??Y^Ch_Y%^sw zylSdeEvB3~uhOXJj7&P!ta|q69IJH5nzEZhamrMxQ|?UNtKxiQCGXJTd~@14T=vSv zBS*4cZOU!l$~JR(!77Ved!0cM zO#I>mjbUvch9xbIyEa49MbP#`Ec{#;`PO&^s53e%07SrHrfmyp*OH1)(~Vr!zCpKL zbbE~4(ypiJ7P{@H+W_5M+?JM>7MHY}2{h@(`M5iW0G`*rM#xvlX(4UXtWLKfx=nW} zzu%{DTh|aci`(MT;_~wOcSTxGAbpsnLl+m3!*po*olEb$f1W2{nP>go<)tOc8qzRN zX`sRwvnX1%QP&H`7@j9Bcgptk=CnDUwwPg+%Q#t@X0v+K)<@}-kSB?5;B1+y>lxiB z7IblPJd(*gs()TTHa@ajr~lEh-Fkt>0{$g#EDc;9^z`xD{+TX1g&xU_{Y!lX(wBw) zabb__7IOH{2z((AJBYE`tQKpwZrF9tozpLSj~`}+W8vviH)cljy5V@P-jkTiHezxU zW_tHkG?uxV#`4ch4bibt5_J_KnodIptx{uQ$W*=Sz|^dB7;3>sN`AIve82D%}ULXDav5? z8*+(0zM6a3GdC;&*;NyO-FU++uGl-ehV#eORVIJk-kyS7I z*+xEL8TLmqY53Z3eG2jbilFDIMz-e|HP2}B3OYDIKn7DLlP!#oQk*k2qf(W0Hv_K) z!}v%(pBu}JjZw_5>VPaxCeXo^L(oJJiruWx3<3mj!1{O`V?&afg|B3;*6^v+P;M4H zBV%x;IaBm0qM>*=*_`zANVRM$hO%@89Eg)!RLhNYmZ>fn$mU8h;rebp-6B6^SHrr* zObj_09cx#9sGQC6oF0P`9$HpEj)HQ{`e*gDPCde->2yl)!uc+zc%Wq}899aANvGg5 z5a8!tm0a8MY^PDHmgWq6PAz$OhQSM=1KB-$?AdC`Giz4U;~R^R${1bo!A@6vN_SU$ z2QnQh$wJ90RcsTF>7EgYo_n76qtA)xc^uSW9jO=LiT3M_5?#5DC?VAoC2HFhB`kBy zdZXmsGU`V;8i*1N-5DicBV}4etJN}tC9;k%NvtPMm9dp>h0)83lkW@#KhE($oO~-?DI7O_f?BPQnBo}cgho|Ok}yI!WJ(&Jz;nl?WmB7=eX zScJ_J?h{*xgzxs!j9uF4!%;TO*y2_njxfE0uCePo;4r=44akRBtOMpoK-Ui9-RSl< zi`*Dv=jcTvUaLk1-~(Rr(0d;G*KR1V$mSSJU_KDWd^AGZ_(ktBY4VXA|1QT5F?zEN=Xt=b2wRjaY#|AL`Q~~E$b|&^VJpVp+3wp0ji8@u#n}WUlCD>i?7OXa zoZf=RK6p0C&L+Vhk6#CPU!UZEEXMW`FL^}r#}aM!Bp|Y%CU~IDo{_La@L-#LTf*dj zOVq7R?~;F;0==)zz9s3!1RDhXBly!k^Xw+Fzt!#M0krQW_%z$nV*Tu;5Oa!)8tg}I z0X+Wfp9}3U=&vE(me$?@b!;1yaCv&a4t=kk-o)NavS>@bylsPycz3m;>2{Zq1FKB z9}xP(L#;s$4|4e(@Q1A7lKu>}`uYje$LT5LKhjmNXji?WQL0zOuU9nMS+7V(y-q$I zN%-;H6oP$n6c^d&`QcOnFHrnj`uJ=A?VsAhhLeCd4{(nuA3sh2jt%-S`ftQbV$IFp zPU_gg4z|+^hkyFYEo0oq!fC*o=EKN;tSQzG)4`EW08a3Fad;ujcDj)W&oI}Q4E3Ww zp#O1p=Q8TQBeK(tMrr@zn*s7K>9GC2#$Ja0mWK4qqh;q2Un&ARwhEHJq_H4R}=Xl`)79$9HD;Z_DS9iC#e1$el^8zb~TME%BzU_fgTkd zKf%{S+>PM-AG%Ia|AIdp?gtbn6yimGP|h>TpAp^g$l+ zfgigv@$>uw12nTPiX8+l3nl9A@qL;o96zn+;xX}&qEL6OL0Cu_`u!!%?H2y zz(bU@kFSq2@L!Pu+MdO}(&^5G1ivpf4WCH-!Hf2w)#OJc@}@3gw>*9G-DQHS|2jXs z7^_GD;-T|Go^teL%C9ei-mQ}eXXL|PMX>qzZr-o1AwMh)o$3Ad zzpf!4lOHJ3(kHS{=XJOIHpIW0{{<y|ITKK^>vz(FZbIS*b+Ug79>HzY5Y zi$b$`^?^td$Ud!~d7nc}hNXR?2e0{?1tinxlgL)^qTlzBPu=oAlJY@bns>|JCHjZm z*~O^j?*(05tMwm+M|oAhXH~pvJSYvSQRR=&mm{a@qwJ_r)k}>*d(^*vU6gljEAk~R|(hfvb-UhMTI+zcU)AZh90larW%#ZJQoK3Mj>&JLK zi!f$*v!3tzRskzDFU<*<&!#u)l?0*@cA}bK14$Yz0)x4%V(+k;DMDIuOpHDTIk`dY0!2p$+gB zlHDs1;A(!|^y~1D>|$G#mqx8zgs9dfKynQN{9jC^~8zr*qL%@FNeLe83d^X3DfG zWee4a4GtH-dB{XJ=-78&M*@#AqfWqo+i?gEN#}9?myW|$nx8tX*XS5!aJ)1&+;hmT z03&Y!oEDQD2XPC~Umoe;|Pnl&Nq(4`I0fhAMMd15Xx3;ADkAlF_nQogC)vp7$cFtB?d7@z{DbB{#)X&>@$Ywkx7tN45{sz`gh;tq#%PkyUMgmb+W%MpN2g&EMjP` zHa`&YK11Uo;; z_>;KF6ma99r_SB>&va3HKb@ZZM16VEmx2Cc!XDYpXYrR7`20BRAjYy+E>vvYupQrB z(l^;Wk1(TV;n7Ps7INddQS)8BCmowqZF0qC>gY8$k-qjOGS}Wj_S%~m51GhjHd`}t zV<$q&u2Jpe*e&$rJ@5GClAf8!v1I~hP^WlN(FaCrw@==QGsjT{1T-+}-k#01QE&^W z((RcHaO}`26K_>c(JbH%%1IrkR#7R5`Zo%aH;vPFdW#kR#BJ;7&23q48hR-P+`Gv6 zx$2^^Ac9V#J&YE8t6Vbb2-KUc7&0AcG^SCP=*QM`3hiu*!DiMC7VUA1db7~m5hC_w z6lFjGhT}t85g&-@_2NSd-J&1k@WXOSK|YZ&*n`!HbP66;-GG9aK@s$E+6CLM85Q5~ z_#V~Jy+H=kKmcxx##0S^jQ%-kAupmBlRLY zQGC5oq9!*HC8Txolk ztX-`zx}`XIU&se>9t*|E8%f7@ruo?|iJs8ApVZDb6Dh=cB1KNQs&=G79yhzCe4Em# zAl7sy94z$|9_+NZcV&15P+?7HCo_{LChk5V+KQs{+NLrK(U~{z`i51*x3s!3?VmLi zGf#770xfL=2GQ7ZaDtki65}rOq|0t730cu~so>x@ zR4PhkmTMz6bUIXW51k)xNzX$8^=D<3_h4Z=i>@9f@oZ(dzgbG<^qgh5j*?7sds!~e 
diff --git a/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin b/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..e8cdf9d03a89109e7d9e93424697209d3422b085
GIT binary patch
(binary data omitted: 13240-byte compiled Triton kernel image, base85 literal elided)
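The dtype/shape/stride/device assertions generated in check_input_0() earlier are opt-in: __check_inputs_outputs() returns immediately unless the AOTI_RUNTIME_CHECK_INPUTS environment variable is set to something other than "0". A sketch of that gate plus one shape check in the same style (check_shape is a hypothetical helper, not part of the generated code):

#include <cstdlib>
#include <sstream>
#include <stdexcept>
#include <vector>

// Mirrors _check_aoti_runtime_check_inputs_env() in the wrapper: validation
// runs only when AOTI_RUNTIME_CHECK_INPUTS is set to a non-"0" value.
static bool runtime_checks_enabled() {
  static const char* v = std::getenv("AOTI_RUNTIME_CHECK_INPUTS");
  static const bool on = v != nullptr && v[0] != '0';
  return on;
}

// Hypothetical dimension check in the style of the generated check_input_0().
void check_shape(const std::vector<int64_t>& got,
                 const std::vector<int64_t>& expected) {
  if (!runtime_checks_enabled()) return;
  for (size_t i = 0; i < expected.size(); ++i) {
    if (got[i] != expected[i]) {
      std::ostringstream ss;
      ss << "unmatched dim value at " << i << ", expected: " << expected[i]
         << ", but got: " << got[i];
      throw std::runtime_error(ss.str());
    }
  }
}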
diff --git a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
new file mode 100644
index 00000000000..f283030cd98
--- /dev/null
+++ b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
@@ -0,0 +1,6 @@
+// Triton kernels are embedded as comments in /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf/c2axxg3k6hizo5jukgeoinhgbqdavmur6jy4bqwkwu6iqb3x3hb2.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17
-Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json new file mode 100644 index 00000000000..bd5d2c60334 --- /dev/null +++ b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json @@ -0,0 +1 @@ +{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file diff --git a/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin b/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin new file mode 100644 index 0000000000000000000000000000000000000000..d2228db77f98247a3ac53ee64ff6864a708d7d4f GIT binary patch literal 9528 zcmeGiTZ|i5^?1g+u{XQx-L#}#=ws5xkcu$t8Sk#0q_o?nO^Zm>&=e4j$}qOaYlC;} z%*;C8Y|}ac6@*YL0UsdoQT2;Rzz6CN5<>ZCtEl3mLh!){QWZt0DrvOYCIr-iIp;oR z$M!CT-KL_VSJ}DuoO|B)oHGyS=?4!UP-3y{7T`mH-#}ZeX?o*M4!m(2pC2E29A+U8 zgLu6WVgR~d@dGcg3%HW=;yj1hZ21+p)B@dY>Q%Q~2R(3FfgX6(pzd0Ydetf~_)f{P zTsx>X9jjmm#fs(Dy?LwV%vA$iT7hR*-Kslh8B?L0%}MYpw&#@I7a5-8J6;ooWteSq zwR*v>p{vz^!Nt1k2eunbQT+6hQ&^a@%AP&%NT61AaqT%Zu^Ga47s72?oSF75-}ZgZ zv<}|Jc=t1exJJ;jf)-jxdhukWm3pIEWJux`DMu^7+G$}|=sJ9sOdO9_Q8|PjswOto1ZBxypJFnx#8epT zE3|8f*%86U#rl(YE6|?b7TMv;hY+vAcpv`Dh>u0^TLr~e8GWjczAoq`os@Wj(Mu~< z*}l#21FHzJdXeD=`|y8fczG9G-TsO~_7ANh#A*TY+s0Po(-tQDg3;g5N8iD*Q($RL zLxuR0G^d77wcEQO(K*%Lm0(K%!td^r!Vywe`~t$9H!dUa2Sh2*z-uh_&gc+cW9S*g zCWHCLWKj2tm8x4>DB{YRv;BJA{d(1&%pC_P=8oswdJFGN^L8`mAImm_R>{san~P0* z&d)WzQ7Ip9RJ;ZA#9YZPPiM1r*VG$}5c~C?fAI3)FFyv5MA!L<$JFuI%GFED>Y_3h zyQ(V6>kwO3TL`RNTzPBdlBz11nEI5WA@*rpuHgDLh+V$;_9Zoj*sB+pFJ3{Sb}V*z zS!pZi#!Dz#VF}so7@{MJLERoxZ$Lfj9dse}YKkt0>9T_^Pm(5eJ1w`;<#TxX>tEh} zOHJb9&E;jbbxp0(-Y{LrZ1qcYVKUjpWkN;?S*A-GFBegs7E^}^CdHR8ub}7T!V0>g z#gx}$=m$keZ#te|t-D%Y&*Afs?aeuX=FeMuGB)UTse~QYxBTh}N1LH;DLSgAV~?G4 zHAB-2d5w3_qlWQ>_E~Ld&-8AM{$_H!wLHyv{1dob260okpEy<$>a2Gu_A9I6hNi)nmED<<| z>56X|S~A_ZJ9iJ(9zzw-rDWQ>+sv+_&@M!k?lw)t!Onvgz7)AdtAH;+Ze}mFj9Q78 zfnE@_23FnC%_ja6`_>RVvw6V=3MMb`rbFh>*PHqRcN_atx)|8ilHNj3{Yp(2@xX?a z_HZ}1mPv@;tgq&pzM|I63eKY7sE61U)0hV2pY4{Udj>Ymw1<0!xu%D>vspu?$dI}h zYQ%P&_E=br48__B8hS0zeRdEHwrk+kFsG-caKBJ?#GYqhcV~xT;%&p&hsWf}^E0r~ zrx}4cgASHTArF$`1{a&{_8n6dw~*j7GpNa5D1=OyYa!D##c`pgAFq}|OG+k-8;M0u zrcUNRah9kGa5Blk@DNbqZY|S7+-$mGXjnSpm4s4li4~#YG+8Zi)fk@$t*RL38h3r&%oqR?YSU zR?nQv(0jbVQ&`K3gGQGCl2^2g701FyJCrAUG(#V=+(#BDI_R!2awiNbOAKSLFNofN zFUZyB3&w2YzGOpRW+J{|9d-Lcm4_+8Qoi38s=y66Lu`HC$TRNAum9|1rb2J_M7&`& z+I2L%3QGoetM6>65;x)xvi13cRkZQbnGXFi%&0@y1o-)sdF$s%Mpl=zIWxC^`o8^0 z;UDt?-EQEAqLy9`PUw=DH!eD3>K78rV!`Ut+{fx&+&!pGpx6WX3=L!|RhxQwp;nWe zCTHT^B##8tX@pdhwL2bX7UuH2t$VKAW+lRDSdXJV#XAd@Z=|zC8_I6hXJ)jMMC&<0 zcsn%quHN1qw?ucxllZXgJFGn<#m5A@Q1F~4K1KA7lYOM@n`%E{FXQ_K_Zu*87s-a? 
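Note: the generated *.kernel.cpp stubs above carry their own build recipe as "// Compile cmd" / "// Link cmd" comments; the Triton kernels themselves live in the wrapper .cpp they reference. Below is a minimal sketch, not part of the patch, of replaying such an embedded command; it assumes the same toolchain, conda env, and CUDA paths that the comment records, and the file name is taken from the diff above.

import shlex
import subprocess

def replay_compile_cmd(kernel_cpp: str) -> None:
    # Find the command line that follows the "// Compile cmd" marker comment.
    with open(kernel_cpp) as f:
        lines = f.read().splitlines()
    for i, line in enumerate(lines):
        if line.strip() == "// Compile cmd" and i + 1 < len(lines):
            cmd = lines[i + 1].lstrip("/ ")  # drop the leading "// "
            subprocess.run(shlex.split(cmd), check=True)
            return
    raise RuntimeError(f"no '// Compile cmd' comment in {kernel_cpp}")

# replay_compile_cmd("cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp")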
diff --git a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
new file mode 100644
index 00000000000..bbe94294805
--- /dev/null
+++ b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
@@ -0,0 +1,6 @@
+// Triton kernels are embedded as comments in /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms/clxvzwn2a5v7ypw7eq6fysn2555bpqqp3ckvq4a6v5o6aba2rxov.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
new file mode 100644
index 00000000000..bd5d2c60334
--- /dev/null
+++ b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin b/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..9b7c06c6f791df59d0650fd339ed10f850f64651
GIT binary patch
literal 9528
[... base85 GIT binary patch data omitted ...]
literal 0
HcmV?d00001
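Note: each generated kernel ships with a one-line *_metadata.json; AOTI_DEVICE_KEY is the only field appearing in this patch and records which device the artifact was built for. A minimal consumer sketch (the file name is taken from the diff above; this is an illustration, not part of the patch):

import json

with open("cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json") as f:
    metadata = json.load(f)

# The patch only ever emits {"AOTI_DEVICE_KEY": "cuda"}; fail loudly otherwise.
if metadata.get("AOTI_DEVICE_KEY") != "cuda":
    raise RuntimeError(f"expected a CUDA artifact, got {metadata!r}")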
diff --git a/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin b/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..6e21efafc59f39c347cd5fc3fb16d696fc03742f
GIT binary patch
literal 21056
[... base85 GIT binary patch data (and any interleaved generated .cubin diffs) omitted ...]
literal 0
HcmV?d00001

diff --git a/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin b/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..bbc7d301593f72433c5b7626638d0e6f0f4a0813
GIT binary patch
literal 11656
[... base85 GIT binary patch data omitted ...]
literal 0
HcmV?d00001

diff --git a/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin b/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..c5783aaf76e89eda9066225eae34d4470b74a950
GIT binary patch
literal 10296
[... base85 GIT binary patch data omitted ...]
literal 0
HcmV?d00001
zJ+hlg;gb;f%na-x#zM7_E#`G2UviyQeH+K_0cKPzJk06F%G``@R9r{zO360Vn_RgW zpSmGt5;w+7^2V4+-54`7H^$7Y&rB-0osyL`HRn@!9ZNUO?xL>~r%GmK0 zY!!S1(+4Iihcid8^*Jhwzy>Cr!!z+V3a$ZVA5P8y$MzpL@wKs(Gqd=LSc)H_6RT*8 zEofvVZ(?pre|!fv-4l2B(2YZE2XrT(n_|J`OF=A`YsQKQK7kW!GUr+a+o&Vp>Qd2= zX`!$t9ikuXk%d?4iFRQpecEIW^WuWXRs}u{Y3fptE+IhEP6kVgrYoP+8lMXSZ@z6Cx z$y+HJO9e@Gl5KioHj_+eGAT-=qe>}>ixBeQN+FYe0&YY>NFoci?J2}f47{L%MCOWR zk4ly4B*Bw}FV&T-N8p7if=SLwH%}MrykfAWCx`@++@RX3q|z3Z4KB!f1%kv(>G3*N zL+uV2mzep&tkc_X8jv_T&JvenB|GFsl;GCXcplI%pzot z+1e~m*^Fm4*)~f#?97EoFLD;h1y8Vq|l#MrKz1(_|hxBAW!Jyy#~{^gIqKhmO>XoQs<8juOHD2T?+*D@xemn%Yq+-GNp7%u#aBGhUQwUzC|( zlnK7GZLuf1!%V2s{(P%RVqJma^aun>#<#Ve;y{Xfp(f(KSQ9u)I+rxTE7}&vQIBBN z_W7K_LaZxTFLCWA)}eCBl}oj5p>?Qm26aOvFT6}%uW=p?TLDzAl9I+;E?d+$+k zrYSn#z*KBLI;);?T*Ip1ZD`$CaL*WunWs9L#(CPCA;k+RNd-nHt`tjgm-&H#Yr{|A zCmUZ6>^w@6e0WF!2ZxSj;?$tcF4T;Lm15CT$+Zz1j#)flbkX@H7xWYk->3LH7jRBGSC5zZ6#maX&=_ zC6Wvw#`ivs$Igqxe^pHB{2JkTNZ>>#DJSoGk86oT?T65O$)=~bv-n0?&GUnJK)(F5 zvRNlz!?SL%o$X@fzm0S=1_gHt?sj0=%8?HPYXb;`MW&&my~`EQa}T81uOZY2%IFrP1V5IsQYApJ1e1 zP@{KOln>RuEGI#{(n!F$36uYI zvED6uh5PO_=wmH*LDI7^HVXQu@TYb0!cAm<#KCJ_wAQx)UwRSz(H8rG?}Fn5|z95Aqt?YYeh05%3cgjriXudHHUdVCb8y#~5+Mg#OTP z&VYU%2+9}NKg{(H3wrpadL*zueS!7q>!bCFcsw1%e%%r{#RpwV^PmPtmmvdg1z22Lh9L-eQd%Fhky0be@dH5!^|4TUG1NI%c7fSK-SA|KrUeFR6u zi7+qpJ8W3|76bU5Y0#NG{sjFFN7JZc_^MxenBC!sb9OgyRNoU$Op_~)dVUe`5c7Wh z0sW!x5F@pQ#;y&q=T}*j`1|+b0wB6N&xn3?h&{Oa38Qnp)p|~27l+umM~BdV0XEH} z-@pG}^Vp-0;YUWlV82L7iqK-*sI?@qVoCHRcI-V~Mi{f+i@ zFZo*~QXU?jNKl@BISrWLyt=#=m!tsk(DNEUUNMeyF2>TJ2XzwZ1^KYHB>B`3e!YeKn7lT_{IFNvcgK2{AXVBNVmAIIG47WHwMP7` z_c6O-UYYmdXnI)6za{Y`Pp$;z_h4ZVLYIBER|x82@M=WKt95)5!i1Zbd}`38yjo|n zPZy1nQ}|zOA)o0XPbC$U|Kk?&C!~B_Iz;yQbFPGy{Rrak<$t4xec}t+$D=5(U#f3^ z`fPu@8AUxluwHKVfO_XmHY``VtUmd#Z7rt|7#4WVf23#YHC#ZBlLFU)cSDczD})| r8vXY8gHkY|QdX^#8vSwZQ?4q=ltK#bI$d(PuzXro?sx5*}%jVWWQ$fa!$kBw+Z0wVKF{@&BJVh6hHE!n!*$ATt!mb5Wpi?-VHZua zYPscE+ss>Tp=4HT&a_#vr^+s-rt4VcYPmXPmMo{}TGf)7%6JObNmT>7?c(|>bL@ug z%tC9DdplLB<*f>`PQ|3D}oC5B|IN@dEi z>Lnh1-Kn`Xcdm{IefpB?Pyx}YSMvTz)32}?RkuGlZBHAu$;pQ8ioTaiOVP0!u-a!$ z1wVM--S?S--%OfTrEH;p(GvdUVs1CFyL5C4?>vFWJ+0Qv|JnlbmR08l@E<=8@*Mq+>0%qov)u`2~hssu>e267VwP`!iM$y4hv=Vl8wvn)E zZaGmY=M(igw^XZUj8rnwD7*IUb*pgLnz9>0al%xp6V6Q4E#rJ-h4P_8Gl%W0Ggq6d z9hrNq<~%l&ZDwZ=S10Qkw>&m?sBok>b#$(1OB{SXU2A4I-2UIFTi&I$nGbC_wH1KPEu;&JGE?a=9eJi9~B(U`-(#ATDfje?#$zWbl)qEqORRQ6x$%aOh` z^zRh*$Zjr!|CGSzMqvjrmK)`K#nuhG>N<1!W$*DL%&1#DsI@BNCCq9yDBVt#Z+5BKStBV#pMw zu=@==M9;0{9`?+2OF(+X1YkE_Hwzh9=m;D28%DoS(EgZ_bi@o|YNeR5cdwgAqwvV8 z7QJjEpRf%3>9JAxT5~)K@&JmU6A2%&>PE#i8oYva93UWrDU%(~q$$jqicu;{vXg?- zf}Tx{<z~t;IEuxibnZgoTLzpDi6Q{~pyIW!Ovf|_$L(YqHr!USFZ>5fv z#20T-Yzn>gq}_fkkwUB|Qsk7YYDenjajhlgWlANzSW{_#u(YS}-klUjTbg$OHP&P% zo6hbZ+qYkAD~is0o7&7rXN@C{Ygl!>sB9XO?omTA^E9Wi6|Tl0Hnwp%L7Scu<1X{0 zV`=#a8L{nB!@+OLkvw)Yw4Y68?#U(Tg0^CL)|i~BRFujL*G6pE>CnkNbiTYL9Y;6c z(xf{Xz6T52bXL$&p-4PiY3^@^QaL$o8BWzB^JEEmo{F8jyE!;I!`1LyQ1{7)juaj` z#Ia>dg>}Krv(F>t!8Y*G+_OjvV^Yz0qR59jLhe$pmD*^v&W zJqKOopvVMwVl=s*`=Km*Jk-%BAXndJ68=>!mArb27b_$Dgi-FlpDs!&EF!VOdD#%7 z!Ulv>$sA7j(QJn2Aw|bE1*x9bd`7rT9h&?Mqk;iex)jkl?N{XUhRf7_mCVrdFkino zwK@-0|53U_=f^C!h*f$VF)_z>y?l@EYCmV&)gnEV9tVrju=xQQmG4EZeBQBV@j$<( z>v)eo{$11>wXMm8DpS2ez2Ei82Uw&7=0-r*4&vSDmKF=$7-8q=MI&CT zh6V??FZ7;={hFK=6*Bi`-JFq=R>^piaJhntZwwOWMhyPutW zIvC-#-(n1~g%Id5PLC0NAs!Al#hPHG5Bd1TQ3Q4h3Me12H{2ZL{DVS&aJV_d;UO-+ z75g1%Es06%JFqLSDVX;m&%6I_h=u=}^pz=cWMclcTuE zKF<%Q3V4CyAL!$+{kME(fQ`ffZye+vQ$Bta0~{IhVD#UZ8^@ZPzk}4Vg{^Fx8w~#B z)myUM#=>d9n&!dCf21MS4%5Mri~)}EdU1Fm$hJA55YI5z7Z3EKKcN2!cIPtczcsYY z35RL_;+p~TFX^!TfyQ2e{-%cX%%f%J5nmz%I<^XuzofCZPAeGv9|9)+xa5aM_@`s* 
z0{E0{FmA=r?=se80i=M=qHMgU)b_!{px4Ae!&la^|PE`z^@a3jPGUq zD=@4F-O)ro$o|=_1c#`fxqXs%f-$N;hhIyun;lJ~it;L=exQd%$4~I}5OqTM{)etp z)W6^l2Kxa;359r(AC&XV1UrZPy%i7jJB;@qn@7HppQsaV34M@9eBj5fO#D25#2*F% zDL{UqZ$pm6L8tt`0r*}b6O8=A{(amqAU3bD3vu?;9P5Js>)V2jp6F`7Blkrr{BZ-+$2_w37UoMBdaz?3Sl*zPn6t^x?`}w5E*FJn^XdbUB#?bt zKl2`kn2bpKME76wHw#Fn(Ib&9|3$y=A)mVCe<o8x z3Xk%te$T3S)p$@ERHMotp)W^H)koP;qpFu0{r0GTy}Bp?HL3#2(Jx0RRS(6kMpYj* T`sFA-l}E*|MwM4JD!G3EVVCTx literal 0 HcmV?d00001 diff --git a/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin b/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..4f37daa42cedb56958717d999109c3a635cd248f GIT binary patch literal 9528 zcmeGiTWlOx^~}uL*>!BMotCtQJ|=AzToJ}Q>(?eJt=sg4NYv045Ut8EyF2S0*qxos zj-7R!w3~nmLa3F150Ln%`b8w*1N8?9p?y_(ZgibRw!oUsdp5@nFtGv)~N|xo> zp}Xi<1v@NOEUzBatg18bhPbrCz;->?o43qtBxh?98WlTmN*{=fz-c(aA`0WS%~$IM zyNa&5A+xnu_Zp$?g;NwiqvRA8=B;vI*Bl8{T@TlRQx%&LY>Hq3XY^e##BYSlYQx1=q1|vNO z?FwRc#jpvn{v_TC^yhcPcKGr^#A`6xga0z(l^A}zp!h1IPxsK*1-+z`5|1%@X~i1b zw;8@~4Ix%9GJJmz{_hMg?}BUFUs1{afi;9!Eg*i!$Xa|_!h~Ni`s;e=+cr$-7$NL`8Z2y@)zL0d%~>%6y3^9?dX*6Rm){})-L8M*U{3TF+{pTON(*Mhk0i@7%J`##Hs;pjqdqrDPN0iH& zs-6X9MQb9kdSUgg)weZG%_`bcs*c#Fak+}?*Fm{>;hncN1+kYetX#N+L@h54W$Fe$!xaTPr$7go^~T~W^} z=m$l}SagDhTle(5F^kVbb};XRdZT9T&Dvntr4n}7hShM7IrYvq@_fGH8>2GFskDjMFkADI;n?>9d?q^Tz|97&e8$V&rzOTGI$;+Yq zy<8vZ%}?T=$?^Fq)Pp{{4YyErbi?t&U`gL>uNw!$xAA#NHx_25bi)q=y(b=)O^GX-I8wR@~}+cJeDiI zW$3Alf9LF7c=i~ofG(vn!JWCuH5A%~sM4Le9O7X2K?`4syrNaW7a%XYk6K2x#E*ed z5VR(qx+k_;_)qR%NAS$n1)C_Cyuh0dnP00f8VlTQ>`$3uXuBn&iJmqpRYSxB8&)R3 z-P!d_Li|>J&8_PzYTc^fNfaFS5W8Xq%Ygi|y^?g##HN`EaL>%G>mlxJ)sQJNq#i^X zu^ndu7FIJ$v35e=sD?&^CBkRBCSG&X*?b=N3uQ;_1txZPmW&+UHq8B)ra!di=%rYVjKRpY2ziYzJRSlmeLlu#$@ zpEyf&3UJDigHaMt;vPNQMBH4)H+4KZ;+2F-ZJ8A6CotTMU;nh8)hTf6sOG+)iCwB$ z2udciy_CcF2-RpA+JbH}-NnGML&vXI-QtpgZ#Tsd@A&v-;9zdwK4;M_hE~;XgzP+X zF4O4p0<*B57YEG_0VJTK%pnVvnImGoiOQGVi>!9LG&hk zL9QNOFlL+gWis+*Cguy)U#Bm0@~|XW$@ls~CvX$a5L=HoGRIx@^`B*CD)MG;%o}z_ zJ5s|>Vaeof_2h<5;%59owjO`*6m32`(~&=BF76OE0iK^SZ~eT;$m;UsY;N|z^gRcV zqH!b$4co^LMNOj|9y25{Z(NjP>K78rV!_TOHWIq6h`R^12^9N4K1&0cO75ajUZ_?j zXO1)RZbI#jI*pK;W9^Q|nT5GrvyH%$+w6#N8rI{ePw~!z$2Znl;tgdF>oYU@38D?0 zFuEO@`_^vnj#r|);|Y9NZaAzxq{PPryHE(6MSP0r9w&N8**Dc*!fwV73hp(aW*5nZ z)N%?Pfy6`-hH$=~)}wq)x)VEM>)8af2zSFEZQ-}%EWo8*5!?sk04t*roB+LJUxOPv zVD?%V<5wWrvCov?zIGVj8CtIW7Jv=6{I!J)FsR8uSiK{>yQ*Sajp1Ov3kmb>MR1UoY<4K#tV2+445ya++wdJ+(x~ zl6{-Fdnbkw_fIubFh{gBUQNUIo2e9i?MeRa*)*IVcu_(TMZU!DukV z3~uLf|9l!g-UOAIM~Ojru!-p`0QiMV8rQG3;afs}VHo*&zT4x^Z6o_h-&w2&rVrtf zAWiaH?X&G|@LC!&&Hi)(5&bIad+lL3d3vy))xwxD2rmp^JfS0=YB0(X6eNsarkP;; z3F1GOR(^F1zh#{g_Fz0SO?8Omt89M)U#kVvnnBS(+|ABt87s&r{g109l z__j2tSZ^4(MbFiKA2d$^z z-W3hb-H+n^9LvA{BewtfHJI&l80{zhtbj&P-|k`1 zgFY62Mt|m3gaP9X&bn7>1JARimpkuId+nI%wZuYU|16XyGnJdf0| z-b}-TOYZ^on|Aw~8oZQ-8>4+>`VhjFb>NHl-D5rQ*yH%)0$Uz_tW zt{+9~e{82KJ(9JK}tRE4G#o?WNv0K10N9-OHBR;~{{#*=4%TVyYDDcEjw}Ni|AxsR~ z)T4j=2F|GSDXJ04FY`#hml1AG(8=i`!7uYn`sscn`6T{_8}NT^9eygQZvNkIz~2=7 zSz!?AXU}s2miph_fd8lK=qEqB_1}#A_^;;-yd?N3=J6rHM_TDA;O9=%@YoXx+piF-2RXD?Y9uKsI&r|D>ByFm?0a!RrU^jSI!K2)z|PnIEY~&N5$ej_YI3OR{hzgP3KW`J^4Ft?Z9FUEINKf{9Y$s7;wM|i_P-uCm0HI|sC-%a6cR9N@ z$8n;VfTC1Yp|pLV>I-cj3hGOx@=!!oMdBh<^?}Fs&2kY$A+1aT1gUA=@Be4!?Afzx zNK@3R5+ixe{I~fp^UpuC=lJ-+LthVvLiwwOY+>&%xOyY_EzI(El)mLLK>j@`ICgTPq6JkFY3S)3~!cFGI3 zFeT8aH!&t%0l3_UXfBm{^irqX=}4h1@MXkwP{6~ryKbrLcKv#lQS3{fnOK}^xzU(& z%I$d$e%onvTixY0%qCi^+3A*>-OOi2StiL9_y14Hq_gP$1$)749X{N#yE1Q8K59C$ ztW?6HX!!nn4&GCe{H08(+^Cm3!{hZuZptn6DYP*@sMgWX5znMyply1Gk@Y(BZVYlMeF5PtW-EE1u0MPG!`QZ_oK 
zc>jWDShUfHLVr(gLmvW@2<_=EwD)vdPIbQCtSweCD$SKUtyc5VdU=oYm=IOxv2LMT zd35eEyV|KN7VGtVcm8laZ+CJFPOCcCXieqpDXZ|9b>v8K&T;IDJ7>0+Md;OcfAq#x zzrl*d;ks1(T323avEZe6Z; z7UE@+A2(Q_d9MP@)iluUP@C^u`G zKbK*C5Us*KQ8{dD6}hZO*llF{WpbL7hmMxx&6Ov-5}e-WrxaL%ZOf5ixM#J^Mpr3o zXIWVe@|cln_Ty%UeQjr18__a~=97~eF*{k}^EQVyQUQX${s83=ei#WlEbJptOr0zu z0R#OJl2%hhi=cGo9WO)_QZ3;NUEYf+OfN!9FGP9isGQRBhbYf(O3_?Dg`FgN9{PQA zaZm|QOXtIr?+)Q-`sZQK36G7^C%~RF!9t9EGqIganCVx|o)aOLgKlp*^^!@;;A=6H zoq`d1xi#Iha0|M$6n<|(mj*-27IbMhv}{3_=0nSHy4)0bXxW0!tC)wDE$9dgRsdQQ zT`ZJqW}~H2+9_~?P)akMnX$zL4j_+(RS;t~lgU%!iG{_6`9!^DYnB>U0iFzNz_O}B zvHBX-DU`PC)V?929H9b~1VojLde9c3t)QzZ8P(3x>SD{vU7|^(Q?+O!Si|Gc;+YpF}PzQVwbEm!f>oq<~*pS}RSXWsqeC zkw$D`c*ezCRmU!O?RKkCuP&STeO{$EAmxU6D7Sa7y;QGuOO0};%O{AG%9@bsRV-8y zHIuGkC{Y6#t*ys{@Ui|`SZLs+k@pAlP_|DkSy3%l=j{^i+g)?1zy*5+4sepmWQCVb z!8nGqi5$xrcNrPuVM5lZwECS!^sm5aNDVlRCFYrW4(Yjj7%XUj2kWSv&RvPqoZFN_ z4wsQ>&*`GiX(S_@?%%aCQ0OUN9V&eO4a>*^s8n_R1j=!T2w5tkehOPcl-x9>v2XXY zc|^%i2h_05L@7*D68mK2mIkFrCAW{n^^SzBso|BxW6;`EC1Gu~ESpoyVnYoSAz3R6 z@ID~Q=4escaD$?peg(uQAj;NwqS%mwqVNSlMF0b$yj@ZBq{VdC7?hPO1&*LB+vCY1 z?+0b&Vj?U-SvJU%1$9tXE;r&5lx2fFS(L~@S(y+BOi-2$@??48d$+ryJZNBUF@p^A z-4l!KE8TWf0af?MBKtwqDJwu>D|3*@LJMtBEdmpKKZW{|vdBIc*~lW@?Rq!WBD+Ku z_V0=!=bRFRCe%&!GENn)_(PLUaSF9*Ww3x9^IuQVE4QX+>q#txg^6fi|Bc>R(ve(MpmCF>SrO_M{_MTmxhS8O>V|JFCRU?kceTG^b zbXpX>?(zqKB_qnc)gM<&h8!aOakW^Z6tvVatX9xsJ17}~zHtPVBpt%Na$|7_EoPwI zjnhXhlbRHCER;1Pe*^W?XqPhaE?h;2G9ccPAy>49GQdB{fC@R30g;moSp1<32%coH z3Ynn{2&81dRv5~Fh)Ra(88TAf7|t8WS9J_iAow($(gzEjxiiKwI$OY77{)v8=gnq3 z=Hf8snLg&M$7~H_#$@VH=c7O8aZe59&h~NVJ?`nD+}?$O=f*`s`8YF_JKr~V(c^}R zL&BFW^s!r#-Lbpg&*SXg-cNVCS)*Ue$LOn;%@?t_`aV;xR2+K=uLy$e*Z?bk!U zLTqgJV18lggF|AZkDn)jeB?g7bG~U)L~kAuKftT%HT*@#60D0~=7KRYZiwi%5pfO1 zV_}S~D9y%=-W}86pX&J!^}K{9T;oRXYW;7uZ`lI>wJq=`RzjOR0V%ybDaM!8+uz#& zHxObHHv?Y+%T3JtbnUvUU?50UlqDNE&@FT~CUixY1}7!vj0v7U(Bj zd^zR!i9o)Y5GM(Ix7@2!;*_!g{4vCvoAP(kO=3dI70mb6y<*ZGyV`>xA9KlDG~)k5 zN<29}GR8C9VkX3kN%3Snn)KlHuy}1*eOoyNncHE%J34v^q8tylAMfW0=6KTjL7FE3 zKl$sXN5LWE0e{l&4*JmI#gpPs8O8Aj|G1mpB=Iz@2g}13z&iezFP+ga^griBW5j>p zH1gw*snGVL$sGi5u05Ng^|g+s)5@F?${)m==}C$Y=C_U}wYeh%PrzQ*FX_<@|t!?uB<8I3D-x%T-q4f=aBK4nG32%y%Kv3FN zlE|-AN=2Og$NG?tFRO2Iv~ABM0VDdW7!$mU==1>$yadk^3~N^vyo>mWfE8ta zf{|~lewd;QkBE+Xl6K<>14G^~uip!JTVhtj@9=(zOaHLZIp5+5ipQwhFKPFh#3Mfb zw3|$<_F#(7HYtz5V0{5!BVi;e=9_f4#mS;D&$ox@Nl9Pg{fG5RywIBu>zz!T?t>GF zH4S4v%iR%w1TZ(im~T|h*JXVt$Hg{^aC97ne*Gz*K3+e_kH$|+X{wi057hdi9`O9B zxPfqUd!6)M7pL%ZgMX85n)93eL4H{7_3^bHJ!-k*@mbn%)R;fP`7QlVjh(FHb07Ml zuB8ga@@HZK@!CmrhSy`_p6>$Swi*#C1$6wI?!T(7&+*@DNyOYZz?Af<| ztGH*mf4;kyF9|w7H#eV(h|?)?gGUG7{{Yt3eewJv0{@K&P^qB$8f>!)kHk(hnrGAIk@?G2hD-tRHVm`v>$tujCD%JPoyey^d!TZkC9?_FE;d zQDi^q*K&IO_d!3+Ig3Ak)AE{+&`tet)Bh7h_EY|^YPxzJ$CKv#Y5>1Fp%A20HHsgd_5St}0FtNq{mqm2BQYjh z{cRlx!hOY$#8}H`e4Be%{_xoke*J&7foa4IN}+!0_z{|1yAnArE&7LXK%tJee){#1 Yt=g}ESPxY3_RA4!D>)q>{nT>*0?5{&i~s-t literal 0 HcmV?d00001 diff --git a/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin b/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin new file mode 100644 index 0000000000000000000000000000000000000000..cf975523f82abbc0139fa3d9c7423217617869cb GIT binary patch literal 13832 zcmeHOYiwKBeLv)-MCxIQlAlWAn)amDfdh&!kyPx)s_l3PU?ahTVOz6|p+xH8Q6zIo zIajeg%UKX~K-;X_fHv5dBA?m;!yX+F^uw}rZL^@juy)&rZl3}pABw~s6x&;ZHLdph z{m;3Vmk%YDmkk(}3;5o1{_ofMpL2ETs}G+$8w>=}VWqF2dR1+hF&~>5JxL_=^Ygnt zujW)<_2Ir-1(b?6mK#mGS)9kneXg(SxjYMvN}xpEEnx^X$O>cxly(+f+eBicDZtA27$16XOyM# z{Mu4s(Jro*Ra{RhD^-lCt7Bel15_^-TKGbv*l1{`ZQw=3w41?$)tgSC=`{U%jZxxD zo0&+?La|a=vWxX)34YzKHEYfFI?N`nR&6wk)n?+OqO6PLuJ8Ysipi~_$5+d%@!H~I 
zqufM2Ay}p!T|BC+P(V`L`QzswK3~xEg+!rPSt&Mp$Lo{Y)Jy0?YV!`d$j4N=Ddfmf zqg|Q$CLo7>(2yDb7|+<($9x#y5N)&bDgtIHm%D|n3cRlcFrz;fcz+lCO@X`JyoKHy zr2N4az>JcBk4Ia6Z5Wcrgnqn>ehb&6fpW|Xc3 z2DrLH=b%jUhN6bH0zZPPj#(h^vynmE1AwT&baS;n-K^OQ%PZB=+5$#}rDCI2t6o?s zPS=Y{E!2w_nw4gKsb)LQZ&%ivYt^+Sr?xb+R9n4Jzp%XMRF~@M#`64;xy8!U zsd#-|1^)3r|M2_$Z|qZQ0M_U$6tKnuH?6^7G+;dwOa!a}j@E-eRDm0}ZoIpB%X$`o zcW>VKcNtjy!B_yJSb%eV2Yo*|Nl0h{_h*7%?+aK5DXYfY7;bOhc<09L+mMYpwcNB8 zDeNB3{iB2jdrff&nkCX4z-{y2tN?XgB{YB==NK*o-vsSJJp`?2ULe;&kOzNgA-a#i z>7Zit+v5#@FCNO;9k6zh^E*oOM+3L4m#=dCuLAkYtBj6i6SAAP25*euO>2U~-{NhA zxBa|b;El$^OWf|GkxRVYt6>;Ta`Yr`;>~wO&#R(S7+2`?-wX7zD1Q|En>tt1WP6zIh^hiCk$l&B zRlvgpydt=N62LVXe~Mrxb#qgcY)W0+(8(?lE?~=(w+X^;iGY8Rxc-TZe4tTPx33E7ZIa!(DP?W_CdRiVz&C@xA4CCh^~5ih?FJ5Wv3xv-GhfkODmP<| z)xvC|sN%&^3EN?#&{%o49Lw^E8ONcpY*%Au_l(7{F)meO$!sh>pN}n7YV*ZPp@FUT zi6@U!_)24CzEY0G%hjg69{ccp;HZk%i`YwI@wIFw7Oyw$SX@i=)O>`5g76m-4}4sb zl0Hf4)SYQh-33qD$MfIwJ)`dDN%HAxbEOnZrt<0%r%MPY4&SkXsrrdj?xEubHV>Z$ zrtA~5i53Xz0V*S%Iidmx)fM@A+<(GC(`e5Y6cp0jNr0AMyagW0z6~XShhW%8CE-q zU;yEdg$^p=&;tWo@lG+U!UDK2u4#!Yu_?sC7oxOwRR<5DXoWQ`eiDyx`@rHM!EFIe zOF#)uLbH#YZrcKHmo82d-k!3w_$=^twc0|#rrITm%W2^0T1#_ATsDsTliZM^ttUN; zv(V%w*+puz5rSSwZ94+d3#lD|@LovqA`RiaklGOl?}5|{gVE`QL>Lid@b^MW9vKZH z)#_R$zP#d=SVlNm>G_%b>})!dm9hm*r54w*urpc!C{c!uwRsP&Yh7cJbP?@illW3W zTGoOuGZNg4#;3_xEcx|$rJ?Dd6SSF(Va1M7{$d zQ6VCk=)h=iiJ|&-!)Nr;fI81`iODl7rLt>&Nv}{b(-`CSapthZZqCOngd+=Wth$yn zpgSEO%Aw(Ul5;Tt$J-3@xy9}>7Sap~5cu@h}yb=+{xsz9t@A0At zx_O0U28MU?nh@zn8rW`LiO38r@8H$T$kU6`>gGjqr)=i3lUJPac-d0eVco5r5!O2g zS_|^yLotFbiq-5xQKpN6L_doDyQo|{itf26G(A5Tt#?r9dzL-EJ@z z*Ik4+3AQeST~Ubd%#ID7X&IdH8;lOpZZH<_-53nEE`!~50E4qTHaOcdIOjK*Cr*F; z@<4tU29vGJVDvP%Byu}8IOiFhO85*;p#!!X>=xe6YB&Y9E`!}QSOHq@1R@QHSb$PQ9MeM4bNbX01)SNy=TpWw(hf}lZ+$pxS#q{Y@xwE;mG_B4v ze!E>P&Baq^K9|p*NuQvj zUf058<*cZAAkL3rUJq}2MqX^?}!#%-@g{71TFQs9A_u)z_ zhK^2Nalzy5a4;v)H*dl`oR~i^%iHnig^RPXWfH2=%bxvYWPM&u0M+}z( zVeBED<_`iBsV#v-xuSY7*27fU|bfRLe1EomYf8EVd`L!(C0KGjc&T1$;8Ab zrtI9T9A9R(K^{wHH6ELAFXm(El;HUu)Dxe{Ydj*;6Q9j!JnZg?&nGmVA?jiLu~}`O zyruTSCzHlNESLGB9dE2x7h(gnw)H2DyjKfcyRBM~24xeQ>L}^F)Pl}lp04?X>0$su zdn)phYR#C3H?5b~yv`^%W_Wb)6;@sx$#2;NduPa@o@+*sO!cNAIYu;_ZYH@)!3|tzxqC(RV`Re#7 zs0)qhF*S$AedAn6esdDqKP*Ot)B&Y#A6D+%QhV@2lre8E->H1_fQq!yixBWzkKvo= zy;~}DUqr8lVM+5gbLqp#8R@n0yCmhYB(`n;RXxLFv+EagO(^7AZLSFxnMCcc; zylD7m$VW7&=i^RfNb=|52g3As%f;Jardn-%0D8!S{Qf!0AHN(D`F8&Rzk&+>dx8&Z zysh!h;E+_QCqEpfzx_r&B$+8-49N8!)N5d9Opwr6k-)A~3P z4gKSY4_1T6e{Et{M85h@#D~?(K{aL%QoY*x@r#oGtTQJ0BaHcmJwtOC*2lH|m=Eju z#SrmXXK3&~;N`;nB`4BRF9RL*GBCh;iFoxgFwkBvk+yo7w1?a36Z%Kyz+WG48vVP& zM@2bqZ;ghs&ff5e)?6W8W36~iNBf)+jom_0b-av!b&)sajPGvIkE&}c!>xGz53GkN zXU|^0zL&5bIB%^t{8A34$|dWquaEHydGYJ(YmZ;3Eq>R2V-%ed7xR6@!g{-R$X{b8ze1x%(e-`xWUa zzy8#JSd7_(q@>yU~# z!F|`hc;d4vup#R|5(!!-zx0j221|hJ?JIVukNQr&hJH(XoW1*Fn0x)(2}}RnFa4A2 zfA8Kmp>XRp^ee~g=<+t&Lpj5TT#&efbydj^+|sMe$v73 zE?j+BZ<9{HdHulQ?{Q~CPzZXXYzlRG}?9ueQpK70v*UcRE z9rO-zNCM^)&MGbu@VIFq(Q)f7|>8p<^N7uLXhc~8CFS}g( zVnS2w*%=Pm|6rsh?d&-U|~dLTUWrjB)@4{hJ>F zKh854ian?|^pn>9iu!+HT}i!YdDH5=dwx+`-Y>6zpm?6!+WK8fy*8>2cz6i?e=vua zpIo0r;D0Rw>^1L|O+wlJ? 
zoyDbJHuRYHPX7BfiaS)>_0$nPoitE4N8a9K;J>Dy^*J+lAI}^9KQaLF%Q@L4FwfVu z{ayMG{E{Y!8M;pWZXSOEJZ+jYbnYW>_}#gy-{o`je;oQhjNhdrKTE2U-`R%$F{9tl z&ywim|Eq2ImwWKj&QAUmU=V>r7da9{iizUT>Xh)3(>277*m?){=cRcwH_Uya9r-wjUT~x n%f~u!<4f88%Z<1D^y`zBuuWV%87AI-J^^jR=f=l| z&Q{_(^=>$D?A5XVqj|0{TMcJ3ay!0K+N}U`#0`@2!;6xx?8mtN_uT_(@n7lq9jm*!8T$!t1C&kPNXm|z!*Q@YN`K|bi&vb~ z-zht8yX9iN|Htm0M|w0L*X?EFwlipU;^zAqkTV&yAmu-zH-A*hSW&LHKrZ=S&x zWnj0;w+jM)A`jCy1g>_&ee=%+etaL0((izunA^{PsHcWq;XgOwKSDbzQI@$O?>D(G z&0HT2&rxx7eR!@YL?Xr)C!(Z)8msUs^psmr75E7tokCnS?QgTkan+C!X-i2bTuQo8 zz1a%9tvV#9;l|x=@U@n^)VoSl?_I6gcDULP8f$&ecemH-Z}hiUlBjEYp_{a-og|3b zD}Hl(d%NFk58}<0uUp}Ov|s-D`|mvQ7gSU# zc0VvQ==XN@+d6V{MYde6gV_SeL?c8q=Jh$vn8xKJqlE}Y%=&p;_6!f#-4E{HeSc3l z+~0fe{vO6NHT`=$2!fH-^P{-c4a{|G4JVNsHT=YkJI;l&OP1?-c+O)dZf*PKYBlw5 z#jRS~H!VL%qJjCaXX6}Mp^Ib0w6<2OrWGcUX*Js2n%g!l>=uo{tUNS^BO*BE#m5(` zl8Lo!KWYYchV^l0vrNzPsrE*Yv^=w7uhR~z4U`DywmG#BURwF=MG4iA;?zQP$*%50 z7$1;#X&n%qeaXRFLr`~WexntXU*MH$duipZnsi-RG{5q&cvU^>Z9{3;B!uR zwm6wiw`XmobEvE>nhW*BZFyE7iO0>hC1+Pzhb3Zrr3_1!a{(5Vi5VG!T^0lw?Th3F zo(g1F7-#EBc^#3u(b!@Oi?CN<%2<|#1@@H+q7jb2&}t`EEV2&Mh_GUz%4o>rphFrM zLB|vPvf)`px}Imna$Q7l537UaTsvqIVLdq5fVxBk4CYCpw zO9d0p4^y3lFvYwV?kSaweRwh)8+jlcuA)el)p0iVem2>P3W|r=lNA+lJUFO|iZaAW zt_)(X%W3pQ^T?ub84$oW4-Zhy1~_~o93k?P@#9i?VgF(F1CO7cYy3^&*Awj0Q}Wfy zt<@sGhb<%Du1&a#PmD>&Jn?bqlL>U(I_EI8GlhxZ2wW=C6#5e^&!@7$Ph{k^0u9;z z%nY0GzH*Z2!!u)DC!CDC^J873;sLuL!QXfp+)s_D@aZDGjN>lEuP`?wV)N<8KjV~1 zh2uryH<>vd@@j#jiN^05Ec`?Ef0z9O^kOow@!8JvQSG!8^7!Q=aK%&ojU8N&3waoq+xTuA|$mvweD6IOx!S{asq< z>(lJ7vwr7Gg_%B0)3=|du(oT^yCqs>UA+*24?CsyJ%i(2f2*L28J+&jOgcaLE-B}; z&@YPkMUMa5l6L)vGw?^T*uQDet+%Mi{K9E0^qtSsl~Yhe8yfW2XX(~}iZk#}?Irz+ zsxPsuSMz81 l-{tW-s`_NdVp-Ko_1W=6jgP2HK2DY==6nQC)kpP;_fJyYA>IH0 literal 0 HcmV?d00001 diff --git a/exir/program/_program.py b/exir/program/_program.py index a33d715ca3b..af94399a3ed 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1825,6 +1825,7 @@ def __init__( backend_config = backend_config or ExecutorchBackendConfig() + print("start emitting..") # Emit methods self._emitter_output: EmitterOutput = emit_program( self._execution_programs, @@ -1832,6 +1833,7 @@ def __init__( self._config_methods, backend_config.emit_mutable_buffer_names, ) + print("done. start serializing..") # Serialize emitter output, ready to be written to a file. self._data_serializer = FlatTensorSerializer() diff --git a/export_add.py b/export_add.py deleted file mode 100644 index d0d2489b885..00000000000 --- a/export_add.py +++ /dev/null @@ -1,31 +0,0 @@ -import torch -from executorch.backends.aoti.aoti_partitioner import AotiPartitioner -from executorch.exir import to_edge -from torch.export import export - - -# Start with a PyTorch model that adds two input tensors (matrices) -class Add(torch.nn.Module): - def __init__(self): - super(Add, self).__init__() - - def forward(self, x: torch.Tensor, y: torch.Tensor): - # return triton_transpose_acc(x, y) - return (x.cuda() + y.cuda()).cpu() - - -# 1. torch.export: Defines the program with the ATen operator set. -aten_dialect = export( - Add(), (torch.ones(10, device="cpu"), torch.ones(10, device="cpu")) -) -# 2. to_edge: Make optimizations for Edge devices -edge_program = to_edge(aten_dialect) - -edge_program = edge_program.to_backend(AotiPartitioner([])) - -# 3. to_executorch: Convert the graph to an ExecuTorch program -executorch_program = edge_program.to_executorch() - -# 4. 
diff --git a/export_add.py b/export_add.py
deleted file mode 100644
index d0d2489b885..00000000000
--- a/export_add.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import torch
-from executorch.backends.aoti.aoti_partitioner import AotiPartitioner
-from executorch.exir import to_edge
-from torch.export import export
-
-
-# Start with a PyTorch model that adds two input tensors (matrices)
-class Add(torch.nn.Module):
-    def __init__(self):
-        super(Add, self).__init__()
-
-    def forward(self, x: torch.Tensor, y: torch.Tensor):
-        # return triton_transpose_acc(x, y)
-        return (x.cuda() + y.cuda()).cpu()
-
-
-# 1. torch.export: Defines the program with the ATen operator set.
-aten_dialect = export(
-    Add(), (torch.ones(10, device="cpu"), torch.ones(10, device="cpu"))
-)
-# 2. to_edge: Make optimizations for Edge devices
-edge_program = to_edge(aten_dialect)
-
-edge_program = edge_program.to_backend(AotiPartitioner([]))
-
-# 3. to_executorch: Convert the graph to an ExecuTorch program
-executorch_program = edge_program.to_executorch()
-
-# 4. Save the compiled .pte program
-with open("aoti_model.pte", "wb") as file:
-    file.write(executorch_program.buffer)
diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh
index 01c023f0e8f..90571b0751e 100644
--- a/export_and_run_aoti.sh
+++ b/export_and_run_aoti.sh
@@ -1,11 +1,130 @@
-./install_executorch.sh
-python $1
-./install_executorch.sh --clean
-mkdir -p cmake-out
-cd cmake-out
-cmake -DEXECUTORCH_BUILD_AOTI=ON \
-      -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-      ..
-cd ..
-cmake --build cmake-out -j9
-./cmake-out/executor_runner --model_path aoti_model.pte
+#!/bin/bash
+
+# Script to export and run AOTI with different modes
+# Usage:
+#   ./export_and_run_aoti.sh <model_arg> [mode]
+#   ./export_and_run_aoti.sh <model_arg> --mode=<mode>
+#
+# Examples:
+#   ./export_and_run_aoti.sh conv2d                    # Uses default mode (reinstall_all)
+#   ./export_and_run_aoti.sh conv2d inference          # Uses inference mode
+#   ./export_and_run_aoti.sh conv2d --mode=inference   # Alternative syntax
+#
+# Available modes: reinstall_all (default), reinstall_aot, reinstall_runtime, inference
+# model_arg: argument to pass to export_aoti.py
+
+set -e  # Exit on any error
+
+# Parse command line arguments
+MODE="reinstall_all"
+MODEL_ARG="$1"
+
+# Parse arguments for mode
+for arg in "$@"; do
+  case $arg in
+    --mode=*)
+      MODE="${arg#*=}"
+      shift
+      ;;
+    reinstall_all|reinstall_aot|reinstall_runtime|inference)
+      # If it's the second argument and a valid mode, use it as mode
+      if [[ "$arg" == "$2" ]]; then
+        MODE="$arg"
+      fi
+      ;;
+  esac
+done
+
+# Validate mode
+case "$MODE" in
+  reinstall_all|reinstall_aot|reinstall_runtime|inference)
+    # Valid mode, continue
+    ;;
+  *)
+    echo "Error: Unknown mode '$MODE'"
+    echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference"
+    echo ""
+    echo "Usage examples:"
+    echo "  ./export_and_run_aoti.sh conv2d                    # Uses default mode"
+    echo "  ./export_and_run_aoti.sh conv2d inference          # Positional mode"
+    echo "  ./export_and_run_aoti.sh conv2d --mode=inference   # GNU-style mode"
+    exit 1
+    ;;
+esac
+
+echo "Running in mode: $MODE"
+if [[ -n "$MODEL_ARG" ]]; then
+  echo "Model argument: $MODEL_ARG"
+fi
+
+# Function definitions for each step
+install_executorch() {
+  echo "Installing executorch..."
+  ./install_executorch.sh
+}
+
+export_aoti_model() {
+  echo "Exporting AOTI model..."
+  python export_aoti.py $MODEL_ARG
+}
+
+clean_install_executorch() {
+  echo "Clean installing executorch..."
+  ./install_executorch.sh --clean
+}
+
+build_runtime() {
+  echo "Building runtime..."
+  # Clean the build directory to ensure debug flags take effect
+  rm -rf cmake-out
+  mkdir -p cmake-out
+  cd cmake-out
+  cmake -DEXECUTORCH_BUILD_AOTI=ON \
+        -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+        -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
+        -DEXECUTORCH_LOG_LEVEL=Debug \
+        -DCMAKE_BUILD_TYPE=Debug \
+        ..
+  cd ..
+  cmake --build cmake-out -j9
+}
+
+run_inference() {
+  echo "Running executor_runner with debug logging enabled..."
+  ./cmake-out/executor_runner --model_path aoti_model.pte
+}
+
+# Execute based on mode
+case "$MODE" in
+  "reinstall_all")
+    echo "Mode: reinstall_all - Full reinstall and run"
+    install_executorch
+    export_aoti_model
+    clean_install_executorch
+    build_runtime
+    run_inference
+    ;;
+  "reinstall_aot")
+    echo "Mode: reinstall_aot - Reinstall AOT components only"
+    install_executorch
+    export_aoti_model
+    run_inference
+    ;;
+  "reinstall_runtime")
+    echo "Mode: reinstall_runtime - Rebuild runtime and run"
+    build_runtime
+    run_inference
+    ;;
+  "inference")
+    echo "Mode: inference - Export model and run inference only"
+    export_aoti_model
+    run_inference
+    ;;
+  *)
+    echo "Error: Unknown mode '$MODE'"
+    echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference"
+    exit 1
+    ;;
+esac
+
+echo "Script completed successfully!"
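Note: the four modes above select which of the install/export/build/run steps are repeated. Driving the script from Python is a one-liner; a sketch assuming it is run from the repo root with the script marked executable:

import subprocess

# "conv2d" and "inference" are a model and a mode documented by the script above.
subprocess.run(
    ["./export_and_run_aoti.sh", "conv2d", "--mode=inference"],
    check=True,  # surface failures from any step (the script itself uses set -e)
)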
diff --git a/export_aoti.py b/export_aoti.py
new file mode 100644
index 00000000000..d798654ffe0
--- /dev/null
+++ b/export_aoti.py
@@ -0,0 +1,178 @@
+#!/usr/bin/env python3
+"""
+Unified export script for AOTI backend.
+Usage: python export_aoti.py <model_name>
+
+Supported models:
+- mv2: MobileNetV2 model
+- linear: Simple linear layer model
+- conv2d: Single Conv2d layer model
+- add: Simple tensor addition model
+"""
+
+import copy
+import os
+
+import shutil
+
+import sys
+from subprocess import check_call
+from typing import Any, Dict, Tuple
+
+import torch
+from executorch.backends.aoti.aoti_partitioner import AotiPartitioner
+from executorch.exir import to_edge
+from torch import nn
+from torch.export import export
+from torchvision import models
+from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
+
+
+# Model classes
+class MV2(torch.nn.Module):
+    def __init__(self):
+        super(MV2, self).__init__()
+        self.mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights)
+
+    def forward(self, x: torch.Tensor):
+        return self.mv2(x)
+
+
+class Linear(torch.nn.Module):
+    def __init__(self):
+        super(Linear, self).__init__()
+        self.linear = nn.Linear(3, 5)
+
+    def forward(self, x: torch.Tensor):
+        return self.linear(x).cpu()
+
+
+class SingleConv2d(nn.Module):
+    def __init__(self):
+        super(SingleConv2d, self).__init__()
+        self.conv = nn.Conv2d(
+            in_channels=3, out_channels=5, kernel_size=3, stride=1, padding=1
+        )
+
+    def forward(self, x: torch.Tensor):
+        return self.conv(x)
+
+
+class Add(torch.nn.Module):
+    def __init__(self):
+        super(Add, self).__init__()
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        return (x + y).cpu()
+
+
+# Model registry mapping model names to their configurations
+MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {
+    "mv2": {
+        "model_class": MV2,
+        "input_shapes": [(1, 3, 224, 224)],
+        "device": "cuda",
+        "description": "MobileNetV2 model",
+    },
+    "linear": {
+        "model_class": Linear,
+        "input_shapes": [(4, 3)],
+        "device": "cuda",
+        "description": "Simple linear layer model",
+    },
+    "conv2d": {
+        "model_class": SingleConv2d,
+        "input_shapes": [(4, 3, 8, 8)],
+        "device": "cuda",
+        "description": "Single Conv2d layer model",
+    },
+    "add": {
+        "model_class": Add,
+        "input_shapes": [(10,), (10,)],
+        "device": "cuda",
+        "description": "Simple tensor addition model",
+    },
+}
+
+
+def get_model_and_inputs(
+    model_name: str,
+) -> Tuple[torch.nn.Module, Tuple[torch.Tensor, ...]]:
+    """Get model and example inputs based on model name."""
+
+    if model_name not in MODEL_REGISTRY:
+        available_models = ", ".join(MODEL_REGISTRY.keys())
+        raise ValueError(
+            f"Unsupported model: {model_name}. Available models: {available_models}"
+        )
+
+    model_config = MODEL_REGISTRY[model_name]
+    model_class = model_config["model_class"]
+    input_shapes = model_config["input_shapes"]
+    device = model_config["device"]
+
+    # Create model instance
+    model = model_class().to(device).eval()
+
+    # Create example inputs (support multiple inputs)
+    example_inputs = tuple(torch.randn(*shape, device=device) for shape in input_shapes)
+
+    return model, example_inputs
+
+
+def export_model(model, example_inputs, output_filename="aoti_model.pte"):
+    """Export model through the AOTI pipeline."""
+
+    print(f"Starting export process...")
+
+    # 1. torch.export: Defines the program with the ATen operator set.
+    print("Step 1: Converting to ATen dialect...")
+    aten_dialect = export(model, example_inputs)
+
+    # 2. to_edge: Make optimizations for Edge devices
+    print("Step 2: Converting to Edge program...")
+    edge_program = to_edge(aten_dialect)
+    print(edge_program.exported_program().graph)
+
+    print("Step 3: Converting to backend...")
+    edge_program = edge_program.to_backend(AotiPartitioner([]))
+    print("To backend done.")
+
+    # 3. to_executorch: Convert the graph to an ExecuTorch program
+    print("Step 4: Converting to ExecuTorch program...")
+    executorch_program = edge_program.to_executorch()
+    print("To executorch done.")
+
+    # 4. Save the compiled .pte program
+    print(f"Step 5: Saving to {output_filename}...")
+    with open(output_filename, "wb") as file:
+        file.write(executorch_program.buffer)
+
+    print(f"Export completed successfully! Output saved to {output_filename}")
+
+
+def main():
+    if len(sys.argv) != 2:
+        available_models = ", ".join(MODEL_REGISTRY.keys())
+        print("Usage: python export_aoti.py <model_name>")
+        print(f"Available models: {available_models}")
+        print("\nModel descriptions:")
+        for name, config in MODEL_REGISTRY.items():
+            print(f"  {name}: {config['description']}")
+        sys.exit(1)
+
+    model_name = sys.argv[1]
+
+    try:
+        model, example_inputs = get_model_and_inputs(model_name)
+        export_model(model, example_inputs)
+    except ValueError as e:
+        print(f"Error: {e}")
+        sys.exit(1)
+    except Exception as e:
+        print(f"Unexpected error: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
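Note: MODEL_REGISTRY is the only thing that has to grow to support a new model: a module class, one input-shape tuple per forward() argument, and a device. A hypothetical sketch in the same schema ("mlp" and the MLP class are illustrative, not part of the patch):

from typing import Any, Dict

import torch
from torch import nn

class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 4))

    def forward(self, x: torch.Tensor):
        return self.net(x)

EXTRA_MODELS: Dict[str, Dict[str, Any]] = {
    "mlp": {
        "model_class": MLP,
        "input_shapes": [(2, 8)],  # one tuple per forward() argument
        "device": "cuda",
        "description": "Two-layer MLP (hypothetical example)",
    },
}
# MODEL_REGISTRY.update(EXTRA_MODELS) would make "mlp" exportable via
# `python export_aoti.py mlp`.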
diff --git a/export_mv2.py b/export_mv2.py
deleted file mode 100644
index fa84084088f..00000000000
--- a/export_mv2.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import torch
-from executorch.backends.aoti.aoti_partitioner import AotiPartitioner
-from executorch.examples.models.mobilenet_v2 import MV2Model
-from executorch.exir import to_edge
-from torch.export import export
-from torchvision import models
-from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
-
-mv2 = models.mobilenetv2.mobilenet_v2(weights=MobileNet_V2_Weights)
-mv2 = mv2.eval()
-
-model_inputs = (torch.randn(1, 3, 224, 224),)
-
-
-# 1. torch.export: Defines the program with the ATen operator set.
-aten_dialect = export(mv2, model_inputs)
-
-# 2. to_edge: Make optimizations for Edge devices
-edge_program = to_edge(aten_dialect)
-
-edge_program = edge_program.to_backend(AotiPartitioner([]))
-
-# 3. to_executorch: Convert the graph to an ExecuTorch program
-executorch_program = edge_program.to_executorch()
-
-# 4. Save the compiled .pte program
-with open("aoti_model.pte", "wb") as file:
-    file.write(executorch_program.buffer)
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
index 65a47594c8d..1c90f88df7c 100644
--- a/runtime/executor/method.cpp
+++ b/runtime/executor/method.cpp
@@ -1580,6 +1580,8 @@ Error Method::execute() {
       "chain %" ET_PRIsize_t " has no instructions field",
       step_state_.chain_idx);
 
+  ET_LOG(Debug, "Executing chain idx: %" ET_PRIsize_t, step_state_.chain_idx);
+
   // Loop over instructions
   step_state_.instr_idx = 0;
   while (step_state_.instr_idx < chain.s_chain_->instructions()->size()) {

From 17f1a5f28fb1710571732d134cb57f898b36e7f2 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 20 Aug 2025 00:37:05 -0700
Subject: [PATCH 14/50] move input to GPU + using torch aoti kernel

---
 backends/aoti/CMakeLists.txt                  |  13 +-
 backends/aoti/aoti_backend.py                 |   1 -
 backends/aoti/runtime/aoti_backend.cpp        | 180 +-
 .../aoti/runtime/aoti_model_container.cpp     |   2 +
 backends/aoti/runtime/aoti_model_container.h  |   7 +
 backends/aoti/runtime/shims/memory.cpp        |  12 +-
 backends/aoti/runtime/shims/memory.h          |   1 +
 backends/aoti/runtime/targets.bzl             |   2 +
 ...ky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin | Bin 11320 -> 0 bytes
 ...rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin | Bin 10048 -> 0 bytes
 ...x5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin | Bin 10816 -> 0 bytes
 ...ci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin | Bin 10176 -> 0 bytes
 ...c3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp |   6 -
 ...3am2yslkkhyp4e7oaf7ej.kernel_metadata.json |   1 -
 ...vspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin | Bin 11320 -> 0 bytes
 ...6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin | Bin 10936 -> 0 bytes
 ...xauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin | Bin 11320 -> 0 bytes
 ...47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin | Bin 10944 -> 0 bytes
 ...ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin | Bin 11320 -> 0 bytes
 ...2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp |  965 ---
 ...kmcvkgx3hnjvysymcgms.wrapper_metadata.json |    1 -
 ...nhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin | Bin 10936 -> 0 bytes
 ...jd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp | 6144 -----------------
 ...6ndtpaca5r3ct3piucq7.wrapper_metadata.json |    1 -
 ...x6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin | Bin 11320 -> 0 bytes
 ...f6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp |  965 ---
 ...x5fugystcn2wozmmxwaf.wrapper_metadata.json |    1 -
 ...b7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin | Bin 8968 -> 0 bytes
 ...retoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin | Bin 9784 -> 0 bytes
 ...ugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin | Bin 11320 -> 0 bytes
 ...2j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin | Bin 13832 -> 0 bytes
 ...hvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin | Bin 11320 -> 0 bytes
 ...6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin | Bin 10296 -> 0 bytes
 ...kxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin | Bin 13240 -> 0 bytes
 ...lgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp |    6 -
 ...uw6cqsbpvx756nf43k7mq.kernel_metadata.json |    1 -
 ...ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin | Bin 9528 -> 0 bytes
 ...ksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp |    6 -
 ...4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json |    1 -
 ...rsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin | Bin 9528 -> 0 bytes
 ...pudstbhsobm3wlczsly46p5oeax43spr3eab.cubin | Bin 21056 -> 0 bytes
 ...reaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin | Bin 10296 -> 0 bytes
 ...rqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin | Bin 11656 -> 0 bytes
 ...oylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin | Bin 10296 -> 0 bytes
 ...wncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin | Bin 15624 -> 0 bytes
 ...47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin | Bin 10816 -> 0 bytes
 ...zkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin | Bin 11320 -> 0 bytes
 ...jkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin | Bin 9528 -> 0 bytes
 ...zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin | Bin 11400 -> 0 bytes
 ...zc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin | Bin 13832 -> 0 bytes
 ...vkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin | Bin 6280 -> 0 bytes
 export_and_run_aoti.sh                        |   19 +
 export_aoti.py                                |    6 +-
 53 files changed, 193 insertions(+), 8148 deletions(-)
 delete mode 100644 c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin
 delete mode 100644 c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin
 delete mode 100644 c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin
 delete mode 100644 c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin
 delete mode 100644 c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
 delete mode 100644 c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
 delete mode 100644 c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin
 delete mode 100644 c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin
 delete mode 100644 c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin
 delete mode 100644 cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin
 delete mode 100644 caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin
 delete mode 100644 ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
 delete mode 100644 ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json
 delete mode 100644 cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin
 delete mode 100644 ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp
 delete mode 100644 ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json
 delete mode 100644 cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin
 delete mode 100644 cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
 delete mode 100644 cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
 delete mode 100644 cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
 delete mode 100644 cgpouheql4rpwtcaretoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin
 delete mode 100644 cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin
 delete mode 100644 ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin
 delete mode 100644 cklg2ezqvtkbhlekhvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin
 delete mode 100644 ckneyyhrfy6dkwkb6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin
 delete mode 100644 cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin
 delete mode 100644 cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
 delete mode 100644 cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json
 delete mode 100644 cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin
 delete mode 100644 cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
 delete mode 100644 cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
 delete mode 100644 crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin
 delete mode 100644 csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin
 delete mode 100644 ctc4njxfwewhkkjkreaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin
 delete mode 100644 cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin
 delete mode 100644 cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin
 delete mode 100644 cwvumepeeo7fjwjgwncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin
 delete mode 100644 cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin
 delete mode 100644 cxn357cdpjzfyhgfzkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin
 delete mode 100644 cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin
 delete mode 100644 cxzopurug2u2kff3zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin
 delete mode 100644 cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin
 delete mode 100644 czj7vvfy745m4rwqvkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin

diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt
index 1c596fef6e6..6922d5e9356 100644
--- a/backends/aoti/CMakeLists.txt
+++ b/backends/aoti/CMakeLists.txt
@@ -21,10 +21,12 @@ if(NOT EXECUTORCH_ROOT)
   set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 endif()
 
-# include(${EXECUTORCH_ROOT}/build/Utils.cmake)
-
 find_package(CUDAToolkit REQUIRED)
 
+# Use ExecuTorch's standard way to find PyTorch libraries for AOTI
+include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
+find_package_torch()
+
 set(_aoti_sources runtime/aoti_backend.cpp runtime/aoti_model_container.cpp
@@ -37,17 +39,22 @@ target_include_directories(
     ${CUDAToolkit_INCLUDE_DIRS}
     $
     $
+    # PyTorch AOTI headers from ExecuTorch's torch detection
+    ${TORCH_INCLUDE_DIRS}
 )
 target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC)
 # Ensure symbols are exported properly
 target_link_options(aoti_backend PUBLIC -Wl,--export-dynamic)
-# Link against CUDA::cudart (the CUDA runtime library)
+
+# Link against CUDA::cudart, PyTorch libraries and standard libraries
 target_link_libraries(
   aoti_backend PUBLIC extension_tensor CUDA::cudart ${CMAKE_DL_LIBS}
+  # Link PyTorch libraries for AOTI CUDA functions
+  ${TORCH_LIBRARIES}
 )
 # If you need other CUDA libraries, link them similarly:
 # target_link_libraries(aoti_backend PUBLIC CUDA::cublas CUDA::cufft ...)
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index a0c4a2aa005..5aa547d789c 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -46,7 +46,6 @@ def preprocess(
         "aot_inductor.package_constants_in_so": True,
         "aot_inductor.output_path": output_path,
     }
-
     so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options)  # type: ignore[arg-type]
 
     assert so_path == output_path, f"Expected {output_path} but got {so_path}"
diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp
index 65d28a7a1ff..4c065fbeeb6 100644
--- a/backends/aoti/runtime/aoti_backend.cpp
+++ b/backends/aoti/runtime/aoti_backend.cpp
@@ -30,6 +30,9 @@
 #include "shims/memory.h"
 #include "shims/tensor_attribute.h"
 
+// Include CUDA AOTI shims
+#include
+
 namespace executorch {
 namespace backends {
 namespace aoti {
@@ -111,6 +114,14 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
       return Error::AccessFailed;
     }
 
+    AOTInductorModelContainerGetInputName =
+        reinterpret_cast<AOTInductorModelContainerGetInputNameFunc>(
+            dlsym(so_handle, "AOTInductorModelContainerGetInputName"));
+    if (AOTInductorModelContainerGetInputName == nullptr) {
+      perror("dlsym AOTInductorModelContainerGetInputName");
+      return Error::AccessFailed;
+    }
+
     AOTInductorModelContainerGetNumOutputs =
         reinterpret_cast<AOTInductorModelContainerGetNumOutputsFunc>(
            dlsym(so_handle, "AOTInductorModelContainerGetNumOutputs"));
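The hunk above extends the backend's load-time symbol table with one more entry point from the AOTI-compiled shared object. The mechanism is plain POSIX dynamic loading; a self-contained sketch of the same pattern (hypothetical library path ./aoti_model.so; int stands in for AOTIRuntimeError, which is not defined here):

#include <dlfcn.h>

#include <cstddef>
#include <cstdio>

// Function-pointer type mirroring AOTInductorModelContainerGetInputNameFunc.
using GetInputNameFn = int (*)(void* container, size_t idx, const char** name);

int main() {
  // Load the AOTI-compiled shared object.
  void* so_handle = dlopen("./aoti_model.so", RTLD_NOW | RTLD_LOCAL);
  if (so_handle == nullptr) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // Resolve the entry point by its exported C name, as the backend does.
  auto get_input_name = reinterpret_cast<GetInputNameFn>(
      dlsym(so_handle, "AOTInductorModelContainerGetInputName"));
  if (get_input_name == nullptr) {
    std::fprintf(stderr, "dlsym failed: %s\n", dlerror());
    dlclose(so_handle);
    return 1;
  }
  // A real loader would store the resolved pointer for later calls.
  dlclose(so_handle);
  return 0;
}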
@@ -152,29 +163,35 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
 
     ET_LOG(Debug, "AOTIBackend Handle generated");
 
-    size_t n_inputs, n_constants;
+    size_t n_inputs;
     AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs);
 
-    AOTInductorModelContainerGetNumConstants(
-        handle->container_handle, &n_constants);
-    size_t n_user_inputs = n_inputs - n_constants;
-
-    if (n_user_inputs != n_inputs) {
-      ET_LOG(
-          Error,
-          "number of user input does not match number of inputs. n_user_inputs %zd, n_constant %zd, n_inputs %zd. Exit.",
-          n_user_inputs,
-          n_constants,
-          n_inputs);
-      return Error::InvalidArgument;
-    }
-
-    ET_LOG(
-        Debug,
-        "AOTIBackend n_inputs %zd generated, where %zd is constant input, %zd is user input",
-        n_inputs,
-        n_constants,
-        n_user_inputs);
+    // for (int i = 0; i < n_inputs; i++) {
+    //   const char* input_name;
+    //   AOTInductorModelContainerGetInputName(
+    //       handle->container_handle, i, &input_name);
+    //   ET_LOG(Debug, "AOTIBackend %d-th input name %s", i, input_name);
+    // }
+
+    // AOTInductorModelContainerGetNumConstants(
+    //     handle->container_handle, &n_constants);
+    // size_t n_user_inputs = n_inputs - n_constants;
+
+    // if (n_user_inputs != n_inputs) {
+    //   ET_LOG(
+    //       Error,
+    //       "number of user input does not match number of inputs.
+    //       n_user_inputs %zd, n_constant %zd, n_inputs %zd. Exit.",
+    //       n_user_inputs,
+    //       n_constants,
+    //       n_inputs);
+    //   return Error::InvalidArgument;
+    // }
+
+    // ET_LOG(
+    //     Debug,
+    //     "AOTIBackend n_inputs %zd generated, where %zd is constant input,
+    //     %zd is user input", n_inputs, n_constants, n_user_inputs);
 
     size_t n_outputs;
     AOTInductorModelContainerGetNumOutputs(
@@ -199,22 +216,87 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
         n_outputs,
         args.size());
 
-    std::vector<AOTITensorHandle> inputs(n_inputs);
-    std::vector<AOTITensorHandle> outputs(n_outputs);
+    // NOTE: ExecuTorch tensors are always on CPU/host memory
+    // We need to create GPU copies for CUDA kernel execution
+    std::vector<AOTITensorHandle> gpu_inputs(
+        n_inputs); // GPU copies for kernel execution
+    std::vector<AOTITensorHandle> gpu_outputs(
+        n_outputs); // GPU tensors for kernel output
 
     ET_LOG(Debug, "AOTIBackend input/output vectors generated");
 
+    // Process input tensors: ExecuTorch provides CPU tensors, create GPU
+    // copies
     for (int i = 0; i < n_inputs; i++) {
-      ET_LOG(Debug, "Copying input %d from args to inputs vector", i);
+      ET_LOG(Debug, "Processing input %d from args to inputs vector", i);
       ET_LOG(
           Debug, "is %d input a tensor input? %d", i, int(args[i]->isTensor()));
-      inputs[i] = &(args[i]->toTensor());
+
+      // Get tensor dimensions and properties from ExecuTorch CPU tensor
+      auto cpu_tensor = &(args[i]->toTensor());
+      auto sizes = cpu_tensor->sizes();
+      auto scalar_type = cpu_tensor->scalar_type();
+
+      // Create GPU tensor with same shape
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_input_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_input_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for input %d", i);
+        return Error::Internal;
+      }
+
+      gpu_inputs[i] = gpu_input_handle;
+
+      // Copy data from CPU to GPU
+      Error copy_err = aoti_torch_copy_(gpu_inputs[i], cpu_tensor, 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy input %d from CPU to GPU", i);
+        return Error::Internal;
+      }
+
+      ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i);
     }
 
-    ET_LOG(Debug, "AOTIBackend input generated");
+    ET_LOG(Debug, "AOTIBackend GPU inputs generated");
 
+    // Process output tensors: create GPU counterparts for ExecuTorch CPU
+    // tensors
     for (int i = 0; i < n_outputs; i++) {
-      outputs[i] = &(args[i + n_inputs]->toTensor());
+      // Get output tensor dimensions from ExecuTorch CPU tensor
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      auto sizes = cpu_output_tensor->sizes();
+      auto scalar_type = cpu_output_tensor->scalar_type();
+
+      // Create GPU tensor with same shape for kernel output
+      std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
+
+      AOTITensorHandle gpu_output_handle;
+      Error create_err = aoti_torch_empty_strided(
+          sizes_vec.size(),
+          sizes_vec.data(),
+          nullptr, // use default strides
+          static_cast<int32_t>(scalar_type),
+          1, // device_type = cuda
+          0, // device_index = 0
+          &gpu_output_handle);
+
+      if (create_err != Error::Ok) {
+        ET_LOG(Error, "Failed to create GPU tensor for output %d", i);
+        return Error::Internal;
+      }
+
+      gpu_outputs[i] = gpu_output_handle;
+      ET_LOG(Debug, "Created GPU output tensor %d", i);
     }
 
     ET_LOG(Debug, "AOTIBackend output generated");
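The shim calls in the hunk above (aoti_torch_empty_strided followed by aoti_torch_copy_) amount to a device allocation plus a host-to-device copy. A standalone CUDA-runtime sketch of that staging step, outside the shim layer (the 1x3x224x224 float shape is illustrative only):

#include <cuda_runtime.h>

#include <cstdio>
#include <vector>

int main() {
  // Host-side input, analogous to the ExecuTorch CPU tensor.
  std::vector<float> host_input(1 * 3 * 224 * 224, 0.5f);
  const size_t nbytes = host_input.size() * sizeof(float);

  // Allocate a device buffer of the same size.
  float* gpu_input = nullptr;
  cudaError_t err = cudaMalloc(&gpu_input, nbytes);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
    return 1;
  }

  // Copy the host data to the device before launching any kernels.
  err = cudaMemcpy(gpu_input, host_input.data(), nbytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    std::fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(err));
    cudaFree(gpu_input);
    return 1;
  }

  // ... run the compiled model against gpu_input, then copy outputs back ...
  cudaFree(gpu_input);
  return 0;
}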
@@ -232,13 +314,12 @@
 
     ET_LOG(Debug, "Created CUDA stream: %p", cuda_stream);
 
-    // Run AOTI container with the stream (AOTI will create its own stream guard
-    // internally)
+    // Run AOTI container with GPU tensors
     AOTIRuntimeError error = AOTInductorModelContainerRun(
         handle->container_handle,
-        inputs.data(),
+        gpu_inputs.data(), // Use GPU input tensors
         n_inputs,
-        outputs.data(),
+        gpu_outputs.data(), // Use GPU output tensors
         n_outputs,
         cuda_stream, // Pass the actual CUDA stream!
         nullptr); // proxy_executor_handle can remain nullptr
@@ -253,27 +334,46 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
 
     ET_LOG(Debug, "AOTIBackend running done");
 
-    // Synchronize and destroy the CUDA stream
+    // Synchronize the CUDA stream to ensure kernels complete
     cudaError_t sync_err = cudaStreamSynchronize(cuda_stream);
     if (sync_err != cudaSuccess) {
       ET_LOG(
           Error,
           "Failed to synchronize CUDA stream: %s",
           cudaGetErrorString(sync_err));
-      // Continue anyway to avoid fatal errors
+      return Error::Internal;
     }
 
-    cudaStreamDestroy(cuda_stream);
-    ET_LOG(Debug, "CUDA stream synchronized and destroyed");
+    ET_LOG(Debug, "CUDA stream synchronized");
+
+    // Copy GPU output results back to CPU output tensors
+    for (int i = 0; i < n_outputs; i++) {
+      auto cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+      Error copy_err = aoti_torch_copy_(cpu_output_tensor, gpu_outputs[i], 0);
+      if (copy_err != Error::Ok) {
+        ET_LOG(Error, "Failed to copy GPU output %d back to CPU", i);
+        return Error::Internal;
+      }
+      ET_LOG(Debug, "Copied GPU output %d back to CPU", i);
+    }
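Create, run, synchronize, destroy is the entire stream contract the code above relies on: the host must not read GPU outputs until cudaStreamSynchronize returns. A minimal standalone illustration of that lifecycle (the enqueued work is elided):

#include <cuda_runtime.h>

#include <cstdio>

int main() {
  cudaStream_t stream = nullptr;
  if (cudaStreamCreate(&stream) != cudaSuccess) {
    std::fprintf(stderr, "cudaStreamCreate failed\n");
    return 1;
  }

  // ... enqueue work on the stream, e.g. AOTInductorModelContainerRun ...

  // Block until all work on the stream has finished; only then is it safe
  // to copy results back to host memory.
  cudaError_t sync_err = cudaStreamSynchronize(stream);
  if (sync_err != cudaSuccess) {
    std::fprintf(stderr, "sync failed: %s\n", cudaGetErrorString(sync_err));
    cudaStreamDestroy(stream);
    return 1;
  }

  cudaStreamDestroy(stream);
  return 0;
}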
 
+    // Clean up GPU tensors that we created (ExecuTorch tensors are always
+    // CPU, so all GPU tensors are our copies)
+    for (int i = 0; i < n_inputs; i++) {
+      // All GPU input tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_inputs[i]);
+    }
 
-    // Still need to copy the output to args, because they are malloc'ed but
-    // not using the data_ptr from outputs.
     for (int i = 0; i < n_outputs; i++) {
-      auto args_out = args[i + n_inputs]->toTensor();
-      aoti_torch_copy_(&args_out, outputs[i], 0);
+      // All GPU output tensors were created by us, delete them
+      aoti_torch_delete_tensor_object(gpu_outputs[i]);
     }
 
-    ET_LOG(Debug, "AOTIBackend output copied");
+    // Destroy the CUDA stream
+    cudaStreamDestroy(cuda_stream);
+    ET_LOG(Debug, "CUDA stream destroyed and GPU tensors cleaned up");
+
+    ET_LOG(Debug, "AOTIBackend execution completed successfully");
 
     return Error::Ok;
   }
diff --git a/backends/aoti/runtime/aoti_model_container.cpp b/backends/aoti/runtime/aoti_model_container.cpp
index 0809a677a81..f9d66ed82e4 100644
--- a/backends/aoti/runtime/aoti_model_container.cpp
+++ b/backends/aoti/runtime/aoti_model_container.cpp
@@ -21,6 +21,8 @@ AOTInductorModelContainerCreateWithDeviceFunc
 AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete = nullptr;
 AOTInductorModelContainerGetNumInputsFunc
     AOTInductorModelContainerGetNumInputs = nullptr;
+AOTInductorModelContainerGetInputNameFunc
+    AOTInductorModelContainerGetInputName = nullptr;
 AOTInductorModelContainerGetNumConstantsFunc
     AOTInductorModelContainerGetNumConstants = nullptr;
 AOTInductorModelContainerGetNumOutputsFunc
diff --git a/backends/aoti/runtime/aoti_model_container.h b/backends/aoti/runtime/aoti_model_container.h
index 2078490022d..39a8a35c14f 100644
--- a/backends/aoti/runtime/aoti_model_container.h
+++ b/backends/aoti/runtime/aoti_model_container.h
@@ -45,6 +45,11 @@ using AOTInductorModelContainerGetNumInputsFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     size_t* num_constants);
 
+using AOTInductorModelContainerGetInputNameFunc = AOTIRuntimeError (*)(
+    AOTInductorModelContainerHandle container_handle,
+    size_t input_idx,
+    const char** input_name);
+
 using AOTInductorModelContainerGetNumConstantsFunc = AOTIRuntimeError (*)(
     AOTInductorModelContainerHandle container_handle,
     size_t* num_constants);
@@ -72,6 +77,8 @@ extern AOTInductorModelContainerCreateWithDeviceFunc
 extern AOTInductorModelContainerDeleteFunc AOTInductorModelContainerDelete;
 extern AOTInductorModelContainerGetNumInputsFunc
     AOTInductorModelContainerGetNumInputs;
+extern AOTInductorModelContainerGetInputNameFunc
+    AOTInductorModelContainerGetInputName;
 extern AOTInductorModelContainerGetNumConstantsFunc
     AOTInductorModelContainerGetNumConstants;
 extern AOTInductorModelContainerGetNumOutputsFunc
diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp
index cadd021f51f..ab5d35efd9f 100644
--- a/backends/aoti/runtime/shims/memory.cpp
+++ b/backends/aoti/runtime/shims/memory.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include // For posix_memalign
 #include
 #include
 #include
@@ -147,14 +148,19 @@ AOTITorchError aoti_torch_empty_strided(
     std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl;
     cudaError_t err = cudaMalloc(&ptr, nbytes);
     if (err != cudaSuccess) {
-      std::cout << "failed to allocate " << nbytes << std::endl;
+      std::cout << "failed to allocate " << nbytes
+                << " error: " << cudaGetErrorString(err) << std::endl;
       throw std::runtime_error("Failed to call cudaMalloc");
     }
   } else if (device_type == 0) { // cpu
     std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl;
-    ptr = malloc(nbytes);
+    // Ensure 16-byte alignment for CPU memory to match CUDA requirements
+    int result = posix_memalign(&ptr, 16, nbytes);
+    if (result != 0) {
+      throw std::runtime_error("Failed to allocate aligned CPU memory");
+    }
     if (ptr == nullptr) {
-      throw std::runtime_error("Failed to call malloc");
+      throw std::runtime_error("Failed to call posix_memalign");
     }
   } else {
     throw std::runtime_error(
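The malloc-to-posix_memalign change above matters for two reasons: posix_memalign guarantees the requested alignment, and it reports failure through its return code rather than a null pointer plus errno. A standalone sketch of the same allocation (POSIX only; 1024 bytes is an arbitrary size for illustration):

#include <cstdio>
#include <cstdlib> // posix_memalign is POSIX; declared via <stdlib.h>/<cstdlib>

int main() {
  void* ptr = nullptr;
  const size_t nbytes = 1024;

  // Alignment must be a power of two and a multiple of sizeof(void*);
  // 16 bytes matches the alignment the CUDA-side buffers use.
  int rc = posix_memalign(&ptr, 16, nbytes);
  if (rc != 0) {
    std::fprintf(stderr, "posix_memalign failed with code %d\n", rc);
    return 1;
  }

  std::printf("allocated %zu aligned bytes at %p\n", nbytes, ptr);
  free(ptr);
  return 0;
}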
diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h
index bcbb33d0e99..996c729b4be 100644
--- a/backends/aoti/runtime/shims/memory.h
+++ b/backends/aoti/runtime/shims/memory.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl
index 7b02c1075a2..28c9e893721 100644
--- a/backends/aoti/runtime/targets.bzl
+++ b/backends/aoti/runtime/targets.bzl
@@ -23,5 +23,7 @@ def define_common_targets():
         deps = [
             "//executorch/runtime/backend:interface",
             "//executorch/runtime/core:core",
+            "//caffe2/torch/csrc/inductor:aoti_torch",
+            "//caffe2/torch/csrc/inductor:aoti_torch_cuda",
         ],
     )
diff --git a/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin b/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin
deleted file mode 100644
index d34f0ffd0262af56bf3c172beda228da57f93b3c..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded binary payload omitted: raw .cubin GPU binary, not human-readable.]
[base85-encoded binary payloads omitted for the deleted files
 c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin,
 c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin, and
 c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin
 (raw GPU binaries; sizes are listed in the diffstat above).]
diff --git a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
deleted file mode 100644
index 7d7e30069f9..00000000000
--- a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// Triton kernels are embedded as comments in /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7/clbguuj2vb7nlf7qm72hrkynyiorwc3udkaj656f3v5xcdaoib67.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json b/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin b/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin
deleted file mode 100644
index b2ba290a27dadc32cf1efdced36c497c5f85ee33..0000000000000000000000000000000000000000
GIT binary patch
[base85-encoded binary payload omitted, along with the payloads of the deleted
 c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin,
 c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin,
 cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin, and
 caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin files.]
diff --git a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp b/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
deleted file mode 100644
--- a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
+++ /dev/null
@@ -1,965 +0,0 @@
-// Definition of AOTI runtime interface functions
-
-#include
-#include
-
-#include
-#include
-
-#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)      \
-  try {                                           \
-    __VA_ARGS__                                   \
-  } catch (const std::exception& e) {             \
-    std::cerr << "Error: " << e.what() << '\n';   \
-    return AOTI_RUNTIME_FAILURE;                  \
-  } catch (...) {                                 \
-    std::cerr << "Unknown exception occurred.\n"; \
-    return AOTI_RUNTIME_FAILURE;                  \
-  }                                               \
-  return AOTI_RUNTIME_SUCCESS;
-
-#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name)  \
-  do {                                                            \
-    AOTI_RUNTIME_CHECK(                                           \
-        actual_size == expected_size,                             \
-        "expected " + std::string(name) + " vector size to be " + \
-            std::to_string(expected_size) + ", but got " +        \
-            std::to_string(actual_size));                         \
-  } while (0)
-
-// AOTInductor uses at::addmm_out, which doesn't supports
-// arguments that requires gradient. For this reason, we
-// enforce no_grad context for run APIs.
-//
-// A RAII, thread local (!) guard that enables or disables grad mode upon
-// construction, and sets it back to the original value upon destruction.
-struct AOTINoGradGuard {
-  AOTINoGradGuard() {
-    aoti_torch_grad_mode_set_enabled(false);
-  }
-  AOTINoGradGuard(const AOTINoGradGuard&) = delete;
-  AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete;
-  ~AOTINoGradGuard() {
-    aoti_torch_grad_mode_set_enabled(prev_mode);
-  }
-  AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete;
-  AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete;
-  bool prev_mode{aoti_torch_grad_mode_is_enabled()};
-};
-
-extern "C" {
-
-AOTIRuntimeError AOTInductorModelContainerCreate(
-    AOTInductorModelContainerHandle* container_handle,
-    size_t num_models,
-    bool is_cpu,
-    const char* cubin_dir) {
-  return AOTInductorModelContainerCreateWithDevice(
-      container_handle,
-      num_models,
-      is_cpu ?
"cpu" : "cuda", - cubin_dir); -} - -AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir) { - if (num_models == 0) { - std::cerr << "Error: num_models must be positive, but got 0\n"; - return AOTI_RUNTIME_FAILURE; - } - CONVERT_EXCEPTION_TO_ERROR_CODE({ - std::optional cubin_dir_opt; - if (cubin_dir != nullptr) { - cubin_dir_opt.emplace(cubin_dir); - } - auto* container = new torch::aot_inductor::AOTInductorModelContainer( - num_models, std::string(device_str), cubin_dir_opt); - *container_handle = - reinterpret_cast(container); - }) -} - -AOTIRuntimeError AOTInductorModelContainerDelete( - AOTInductorModelContainerHandle container_handle) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto* container = - reinterpret_cast( - container_handle); - delete container; - }); -} - -AOTIRuntimeError AOTInductorModelContainerRun( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_single_threaded( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumConstants( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *num_constants = container->num_constants(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantName( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** name) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *name = container->constant_name(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** 
original_fqn) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *original_fqn = container->constant_original_fqn(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( - AOTInductorModelContainerHandle container_handle, - size_t idx, - bool* from_folded) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantType( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* type) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* dtype) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *dtype = container->constant_dtype(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( - AOTInductorModelContainerHandle container_handle, - size_t idx, - size_t* data_size) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *data_size = container->constant_data_size(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive) { - auto* container = - reinterpret_cast( - container_handle); - auto constants_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { const auto ret = container->extract_constants_map(use_inactive); - for (const auto& pair: ret) { - constants_map->emplace(pair.first, pair.second); - } - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update, /* user_managed = */ true); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle) { - return AOTInductorModelContainerUpdateConstantBuffer(container_handle, - constant_map_handle, - /*use_inactive*/ true, - /*validate_full_update*/ true); -} - -AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( - AOTInductorModelContainerHandle container_handle) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->free_inactive_constant_buffer(); - }) -} - -AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( - AOTInductorModelContainerHandle container_handle, - bool use_inactive, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_const_fold(use_inactive, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( - AOTInductorModelContainerHandle container_handle) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->swap_constant_buffer(); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumInputs( - AOTInductorModelContainerHandle container_handle, - size_t* ret_num_inputs) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_num_inputs = container->num_inputs(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetInputName( - AOTInductorModelContainerHandle container_handle, - size_t input_idx, - const char** ret_input_names) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_input_names = container->input_name(input_idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( - AOTInductorModelContainerHandle container_handle, - size_t* ret_num_outputs) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_num_outputs = container->num_outputs(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetOutputName( - AOTInductorModelContainerHandle container_handle, - size_t output_idx, - const char** ret_output_names) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_output_names = container->output_name(output_idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetCallSpec( - AOTInductorModelContainerHandle container_handle, - const char** in_spec, - const char** out_spec) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - *in_spec = container->get_in_spec(); - *out_spec = container->get_out_spec(); - }) -} - -AOTIRuntimeError AOTInductorModelCreate( - AOTInductorModelHandle* model_handle, - AOTInductorConstantMapHandle constant_map_handle){ - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto constant_map = std::make_shared(); - auto constant_array = std::make_shared>(); - auto input_map = reinterpret_cast*>(constant_map_handle); - - auto model = new torch::aot_inductor::AOTInductorModel( - constant_map, - constant_array, - "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models - "" - ); - - if (input_map) { - for (auto const& kv : *input_map) { - constant_map->emplace(kv.first, kv.second); - } - } else { - model->load_constants(); - } - - *model_handle = reinterpret_cast(model); - })} - -AOTIRuntimeError AOTInductorModelRun( - AOTInductorModelHandle model_handle, - AtenTensorHandle* input_handles, - AtenTensorHandle* output_handles) { - auto model = - reinterpret_cast(model_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - model->run_impl( - input_handles, - output_handles, - (torch::aot_inductor::DeviceStreamType) nullptr, - nullptr); - }) -} - -AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto model = 
reinterpret_cast( - model_handle); - delete model; - })} - -AOTIRuntimeError AOTInductorModelGetNumOutputs( - AOTInductorModelHandle model_handle, - size_t* ret_num_outputs) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto model = reinterpret_cast(model_handle); - *ret_num_outputs = model->num_outputs(); - }) -} - -AOTIRuntimeError AOTInductorModelUpdateConstantsMap( - AOTInductorModelHandle model_handle, - AOTInductorConstantMapHandle constant_map_handle) { - auto model = - reinterpret_cast(model_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto constant_map = std::make_shared(); - auto input_map = - reinterpret_cast*>( - constant_map_handle); - - for (auto const& kv : *input_map) { - constant_map->emplace(kv.first, kv.second); - } - model->update_constants_map(std::move(constant_map)); - }) -} - -} // extern "C" - - -#define CUDA_DRIVER_CHECK(EXPR) \ -do { \ - CUresult code = EXPR; \ - const char *msg; \ - CUresult code_get_error = cuGetErrorString(code, &msg); \ - if (code_get_error != CUDA_SUCCESS) { \ - throw std::runtime_error( \ - std::string("CUDA driver error: ") + \ - std::string("invalid error code!")); \ - } \ - if (code != CUDA_SUCCESS) { \ - throw std::runtime_error( \ - std::string("CUDA driver error: ") + \ - std::string(msg)); \ - } \ -} while (0); - -static inline CUfunction loadKernel( - std::string filePath, - const std::string &funcName, - uint32_t sharedMemBytes, - const std::optional &cubinDir = std::nullopt) { - if (cubinDir) { - std::filesystem::path p1{*cubinDir}; - std::filesystem::path p2{filePath}; - filePath = (p1 / p2.filename()).string(); - } - - CUmodule mod; - CUfunction func; - CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); - CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); - if (sharedMemBytes > 0) { - CUDA_DRIVER_CHECK(cuFuncSetAttribute( - func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - sharedMemBytes - )) - } - return func; -} - -static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { - CUmodule mod; - CUfunction func; - CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); - CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); - if (sharedMemBytes > 0) { - CUDA_DRIVER_CHECK(cuFuncSetAttribute( - func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - sharedMemBytes - )) - } - return func; -} - -static inline void launchKernel( - CUfunction func, - uint32_t gridX, - uint32_t gridY, - uint32_t gridZ, - uint32_t numWarps, - uint32_t sharedMemBytes, - void* args[], - cudaStream_t stream) { - CUDA_DRIVER_CHECK(cuLaunchKernel( - func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr - )); -} -CACHE_TORCH_DTYPE(float32); -CACHE_TORCH_DEVICE(cuda); -CACHE_TORCH_LAYOUT(strided); -namespace torch::aot_inductor { -namespace { -class AOTInductorModelKernels : public AOTInductorModelKernelsBase { - public: - CUfunction triton_poi_fused_convolution_0{nullptr}; - CUfunction triton_poi_fused_convolution_1{nullptr}; - CUfunction triton_poi_fused_convolution_2{nullptr}; -}; -} // namespace - - - -AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) - : AOTInductorModelBase(1, - 1, - 1, - device_str, - std::move(cubin_dir), - true) { - inputs_info_[0].name = "arg2_1"; - constants_info_[0].name = "conv_weight"; - constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[0].offset = 0; - 
constants_info_[0].data_size = 540; - constants_info_[0].from_folded = false; - constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[0].shape = {5, 3, 3, 3}; - constants_info_[0].stride = {27, 9, 3, 1}; - constants_info_[0].layout = static_cast(cached_torch_layout_strided); - constants_info_[0].original_fqn = "conv.weight"; - update_constants_map(std::move(constants_map)); - update_constants_array(std::move(constants_array)); - in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])"; - out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])"; - outputs_info_[0].name = "output0"; - this->kernels_ = std::make_unique(); -} - -std::unordered_map AOTInductorModel::const_run_impl( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization -) { - - if (!initialization) { - std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " - << "aot_inductor.use_runtime_constant_folding=False\n"; - } - return {}; -} -} // namespace torch::aot_inductor -using namespace torch::aot_inductor; - -template -static inline void call_triton_poi_fused_convolution_0( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_0', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 12 - 
xnumel = 64 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = (yindex % 3) - y1 = yindex // 3 - tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last') - tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (64 - 1)) / (64)); - uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_0 == nullptr) { - kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_); - } - CUdeviceptr var_0 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_1 = reinterpret_cast(out_ptr0.data_ptr()); - int var_2 = ynumel; - int var_3 = xnumel; - CUdeviceptr global_scratch_4 = 0; - void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4}; - launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused_convolution_1( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_1', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 15 - xnumel = 9 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - 
ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = (yindex % 3) - y1 = yindex // 3 - tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last') - tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (16 - 1)) / (16)); - uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_1 == nullptr) { - kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_); - } - CUdeviceptr var_5 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_6 = reinterpret_cast(out_ptr0.data_ptr()); - int var_7 = ynumel; - int var_8 = xnumel; - CUdeviceptr global_scratch_9 = 0; - void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9}; - launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused_convolution_2( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_2', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 20 - xnumel = 64 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = 
xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y0 = (yindex % 5) - y1 = yindex // 5 - y3 = yindex - tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last') - tmp1 = y0 - tmp2 = tl.full([1, 1], 2, tl.int64) - tmp3 = tmp1 < tmp2 - tmp4 = tl.full([1, 1], 1, tl.int64) - tmp5 = tmp1 < tmp4 - tmp6 = -0.16373057663440704 - tmp7 = 0.04603243246674538 - tmp8 = tl.where(tmp5, tmp6, tmp7) - tmp9 = tl.full([1, 1], 3, tl.int64) - tmp10 = tmp1 < tmp9 - tmp11 = tl.full([1, 1], 4, tl.int64) - tmp12 = tmp1 < tmp11 - tmp13 = 0.16525162756443024 - tmp14 = 0.022457100450992584 - tmp15 = tl.where(tmp12, tmp13, tmp14) - tmp16 = -0.08230065554380417 - tmp17 = tl.where(tmp10, tmp16, tmp15) - tmp18 = tl.where(tmp3, tmp8, tmp17) - tmp19 = tmp0 + tmp18 - tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); - uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_2 == nullptr) { - kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_); - } - CUdeviceptr var_10 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_11 = reinterpret_cast(out_ptr0.data_ptr()); - int var_12 = ynumel; - int var_13 = xnumel; - CUdeviceptr global_scratch_14 = 0; - void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14}; - launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_); -} - -namespace torch::aot_inductor { - -void AOTInductorModel::_const_run_impl( - std::vector& output_handles, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor -) {} - -AOTI_NOINLINE static void check_input_0( - AtenTensorHandle* input_handles -) { - ConstantHandle arg2_1 = ConstantHandle(input_handles[0]); - int32_t arg2_1_dtype; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype)); - - int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32(); - if (arg2_1_expected_dtype != arg2_1_dtype) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dtype, " - << "expected: " << arg2_1_expected_dtype << "(at::kFloat), " - << "but got: " << arg2_1_dtype << "\n"; - throw std::runtime_error(ss.str()); - } - auto arg2_1_size = arg2_1.sizes(); - - if (4 != arg2_1_size[0]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 0, " - << "expected: 4, " << "but got: " << arg2_1_size[0] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (3 != arg2_1_size[1]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 1, " - << "expected: 3, " << "but got: " << arg2_1_size[1] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_size[2]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 2, " - << "expected: 8, " << "but got: " << arg2_1_size[2] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_size[3]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 3, " - << "expected: 8, " << "but got: " << arg2_1_size[3] - << "\n"; - throw std::runtime_error(ss.str()); - } - auto arg2_1_stride = arg2_1.strides(); - - if (192 != arg2_1_stride[0]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 
0, " - << "expected: 192, " << "but got: " << arg2_1_stride[0] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (64 != arg2_1_stride[1]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 1, " - << "expected: 64, " << "but got: " << arg2_1_stride[1] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_stride[2]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 2, " - << "expected: 8, " << "but got: " << arg2_1_stride[2] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (1 != arg2_1_stride[3]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 3, " - << "expected: 1, " << "but got: " << arg2_1_stride[3] - << "\n"; - throw std::runtime_error(ss.str()); - } - int32_t arg2_1_device_type; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); - - int32_t arg2_1_expected_device_type = 1; - if (arg2_1_expected_device_type != arg2_1_device_type) { - std::stringstream ss; - ss << "input_handles[0]: unmatched device type, " - << "expected: " << arg2_1_expected_device_type << "1(cuda), " - << "but got: " << arg2_1_device_type << "\n"; - throw std::runtime_error(ss.str()); - } -} - -static bool _check_aoti_runtime_check_inputs_env() { - const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); - const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; - return result; -} - -AOTI_NOINLINE static void __check_inputs_outputs( - AtenTensorHandle* input_handles, - AtenTensorHandle* output_handles) { - if (!_check_aoti_runtime_check_inputs_env()){ - return; - } - check_input_0(input_handles); -} - -void AOTInductorModel::run_impl( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor -) { - __check_inputs_outputs(input_handles, output_handles); - - auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); - auto arg2_1 = std::move(inputs[0]); - [[maybe_unused]] auto& conv_weight = constants_->at(0); - - if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { - AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit."); - AtenTensorHandle arg2_1_aligned; - aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned); - arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned)); - } - inputs.clear(); - [[maybe_unused]] auto& kernels = static_cast(*this->kernels_.get()); - - AOTICudaStreamGuard stream_guard(stream, this->device_idx_); - static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L}; - static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L}; - AtenTensorHandle buf0_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle)); - RAIIAtenTensorHandle buf0(buf0_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); - arg2_1.reset(); - static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L}; - static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L}; - AtenTensorHandle buf1_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle)); - RAIIAtenTensorHandle buf1(buf1_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - AtenTensorHandle buf2_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle)); - RAIIAtenTensorHandle buf2(buf2_handle); - buf0.reset(); - buf1.reset(); - static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L}; - static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L}; - AtenTensorHandle buf3_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle)); - RAIIAtenTensorHandle buf3(buf3_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf2.reset(); - output_handles[0] = buf3.release(); -} // AOTInductorModel::run_impl -} // namespace torch::aot_inductor - - - - -// Compile cmd -// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o -// Link cmd -// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms/clxvzwn2a5v7ypw7eq6fysn2555bpqqp3ckvq4a6v5o6aba2rxov.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json b/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json deleted file mode 100644 index bd5d2c60334..00000000000 --- a/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper_metadata.json +++ /dev/null @@ -1 +0,0 @@ -{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file diff --git a/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin b/cd4lomi6yttiqc3qnhhhc675ta5iienuto5t67ybtshlxzp6p4ud.cubin deleted file mode 100644 index 1be0cd3083897a28e082defde99cc1da6a9ef442..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10936 zcmeHNTWlQF89qC+zQ%ZyfYP9ZW)k8O+I046dlS-N2uYC|L`vWVM2Fekv3JROcQP|x z)-frYv}&tf0MVyD74->G9{SJ+T2*Zx03n1BQq@*{OA)H7HjQkPRFz8Ee*d|QXLs#Z zI8H<*j^a7zzhBOO&Y3xzXO12}rp02}0VbNpe$85POkMd-0hITO{?VHrWm7EA`tiJl z#TYaFvLASXUBF27^Roixz2%o{r6x0L4YN|4t}`=mnt>U3m7rd;+^DZKc=u2Ju1c)lVj?Yi)y?Yg^LO(g{%$oJ-Y2OJ%$;(Bh=vW9?0Cm%Ta zfF<~?lx0^dHcA)O5M3$OUJJWQN0IQN6L>r`YPtNst(|a1b*}x7wRSdS6IDz6`Yyf< zmZZ)$_bki=j*#6EY+Q~H18=;zH9`?X3wWJv>A}~5k45mCBu9huCwln5E%_Cn()a@B zS6-}${g~tXRuFRZHpln(;J0Yxr#e+v$fr4ecm*LxKLCE~=16{|sdqU4%{}~WG1QJL|dG(%gJ?@~KK;E+~5L^rY#|v)Hfx{KMD#f45Dpn6guK9nU9< z_R(*C|62bqe~zezkZaN6F?~yH`RZl;jJ7#;RoAqOEVigO0qDAxis>;tE-&lPXhw`b z7B4R^US7O<2?Ss?dMs!cV==uC!{tlMOL0wqixGM*WFS`9If71mH-^Oi7Pd9ML^@Lq zbi^~JZ;R3Ntk$ZJ33;A-4R- z(&@*IFBuC5#`hcaH!->2$Wx!kpTte3fXjiNI(yfD(?!R}W9iBNsV`6ZGSGiO*dx37 zEdJ60pU=S#VyyU;Le()$rxtkg#s@gZ>`wZeIE`(+WMsxC*aCqwD06&<8AD_4 zp~=HoyBt+Oh(lxEpqbTO8X1QYea`xa8$Y7dGkLNR!`Fxfl^3?2RaMXYft}H@|5^&ZC z0+iy!R0=aVm&Uvdr)J;^nXA@ADiw=IJO?JAZmtj#M6GaVGjM^+Jyj_=ilHPu0f#cE z))A>JQPR0Sx#c)CLyDKFZ^vSZ;bIWQxEbEk7*;8_tk4}pqC%&<(jh)k4J 
zZ-A(@bp!~ho&YhUT>-)}$E+hjULF%+fD@4bC%XgWYobJpVzn7eV@a$dLK5om>$F5Gd(|LSb$z@#rOm!4NSNrF$1sKKm`Lt$Iu(YQ z&P3y+t%JuoE%vtzpLeRH>Fi`?^2qo-N5poe=)9Pzt|D~ipY{UNc5x}#G^c|zrefw9 z&WvN{T7yCCU)b9C-loL3%RJv$OSI`pLRM_9Gz0LPYCxWxS=zRyv&Zr&YS31yG|cI_ zYBiMP+K3Gs8tS))&eyY~=b?bMVP%zXwZe8bBX(#-;@Qe@f3uXz=~>(KYM~@gR+h{2 zRN!3i;^62kSHp8bg(u$|(s*ycRh(;99jVae{%3jid89m8AK${EMOqHiipCQ~KDcxi zb*d5xf%z47;)$rDGVD}E&br8ebTAV-7$^rt8o3j>)DiB7vK;b=baL%0Ji@)2t#aq` zlw^5MvX~I=eS|DBD-w^a3eROVW~6duUjIU+yobKF@sTt zoM3>FPCaxINA-wQa$mTJi>j3=%F?SWj-}3%GkR9;H~2ZqO<`%CMJ&v7f>1;5AlJp; zj-6VGUMA0i#q=G10uIUBN4rq)oCe;)*VWGU=;OcitW(>%T(~Ymy>ZRjMG9?bVF+1_ zCkNS0Fif|U@hxIZe#Ey%#*=Z@BHU+2h=kwGQjA^M9m0KVA7hJKLO9Nh4!X{^cfkCD zC?X$YgB^5k1a$ob?jLuwS$t=lP2ox}z8Us$7em)E^rzpNXYt`dHj43P4daP8Y2uCU z5NYs{?EfwM57A#1EV!s*Ja!4P+t$D-Z#^&vlF&G-QE$BF*_F31lKhVp+I1>7I! zx9vMVJHp;0eZySeu&^H+YmRVugv%$ibMwQ)q5QrCu{Uq2>IR} zXxb~U-!TaY@PmMLJ%r)kpf75c8K9>|0gvLIAB|wt$0p<(`o7NYUDV-!T*Aj&k^kvYKL?;;Zf9UL;1T*-8X&q*4is#>WiR=3chvA>GeU$iD=&y1(q?3QV(N76` z;9pZmxwS+|K|lEm1z7a>gTI5#grv84{KiW=)PZ#;%{_i5VpXY~Q^54@&#ji1d-%Jubpph;~zunV^iJ$M5ZzN*3d)g+V(>_b} zH{|u>cMfXj2WUMFc>`@B4;Xah6XkJXid{s$-%n6}QSSsJzXW@%pVv=|<_G*96APk+ z`9OXK_7NHU)IcA|6Pe%^qxJEvdB*Fne+M>H#PfjKFIY=we@(D2&wt2hpKQ0kp|cAK zwmqan-zN~ZtRs)ycaL@K;Yab~EFYeD=;#sah}P zb0aK#cxZ>f?3DyH4e>5UxsAG0UHB)Y{LdotGE9DFn;?W_6-$$P{| zx)GwuReYOyK5CCoh5DdV0;-quq(0iBa(r&r$fueg)kozhK9xshM)fMMs#kLV E0n);sHUIzs diff --git a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp deleted file mode 100644 index cc963cd88f0..00000000000 --- a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp +++ /dev/null @@ -1,6144 +0,0 @@ - -#include -// Definition of AOTI runtime interface functions - -#include -#include - -#include -#include - -#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ - try { \ - __VA_ARGS__ \ - } catch (const std::exception& e) { \ - std::cerr << "Error: " << e.what() << '\n'; \ - return AOTI_RUNTIME_FAILURE; \ - } catch (...) { \ - std::cerr << "Unknown exception occurred.\n"; \ - return AOTI_RUNTIME_FAILURE; \ - } \ - return AOTI_RUNTIME_SUCCESS; - -#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ - do { \ - AOTI_RUNTIME_CHECK( \ - actual_size == expected_size, \ - "expected " + std::string(name) + " vector size to be " + \ - std::to_string(expected_size) + ", but got " + \ - std::to_string(actual_size)); \ - } while (0) - -// AOTInductor uses at::addmm_out, which doesn't supports -// arguments that requires gradient. For this reason, we -// enforce no_grad context for run APIs. -// -// A RAII, thread local (!) guard that enables or disables grad mode upon -// construction, and sets it back to the original value upon destruction. -struct AOTINoGradGuard { - AOTINoGradGuard() { - aoti_torch_grad_mode_set_enabled(false); - } - AOTINoGradGuard(const AOTINoGradGuard&) = delete; - AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; - ~AOTINoGradGuard() { - aoti_torch_grad_mode_set_enabled(prev_mode); - } - AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; - AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; - bool prev_mode{aoti_torch_grad_mode_is_enabled()}; -}; - -extern "C" { - -AOTIRuntimeError AOTInductorModelContainerCreate( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - bool is_cpu, - const char* cubin_dir) { - return AOTInductorModelContainerCreateWithDevice( - container_handle, - num_models, - is_cpu ? 
"cpu" : "cuda", - cubin_dir); -} - -AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir) { - if (num_models == 0) { - std::cerr << "Error: num_models must be positive, but got 0\n"; - return AOTI_RUNTIME_FAILURE; - } - CONVERT_EXCEPTION_TO_ERROR_CODE({ - std::optional cubin_dir_opt; - if (cubin_dir != nullptr) { - cubin_dir_opt.emplace(cubin_dir); - } - auto* container = new torch::aot_inductor::AOTInductorModelContainer( - num_models, std::string(device_str), cubin_dir_opt); - *container_handle = - reinterpret_cast(container); - }) -} - -AOTIRuntimeError AOTInductorModelContainerDelete( - AOTInductorModelContainerHandle container_handle) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto* container = - reinterpret_cast( - container_handle); - delete container; - }); -} - -AOTIRuntimeError AOTInductorModelContainerRun( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_single_threaded( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumConstants( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *num_constants = container->num_constants(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantName( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** name) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *name = container->constant_name(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** 
original_fqn) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *original_fqn = container->constant_original_fqn(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( - AOTInductorModelContainerHandle container_handle, - size_t idx, - bool* from_folded) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantType( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* type) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* dtype) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *dtype = container->constant_dtype(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( - AOTInductorModelContainerHandle container_handle, - size_t idx, - size_t* data_size) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *data_size = container->constant_data_size(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive) { - auto* container = - reinterpret_cast( - container_handle); - auto constants_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { const auto ret = container->extract_constants_map(use_inactive); - for (const auto& pair: ret) { - constants_map->emplace(pair.first, pair.second); - } - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update, /* user_managed = */ true); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle) { - return AOTInductorModelContainerUpdateConstantBuffer(container_handle, - constant_map_handle, - /*use_inactive*/ true, - /*validate_full_update*/ true); -} - -AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer( - AOTInductorModelContainerHandle container_handle) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->free_inactive_constant_buffer(); - }) -} - -AOTIRuntimeError 
AOTInductorModelContainerRunConstantFolding( - AOTInductorModelContainerHandle container_handle, - bool use_inactive, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_const_fold(use_inactive, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer( - AOTInductorModelContainerHandle container_handle) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->swap_constant_buffer(); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumInputs( - AOTInductorModelContainerHandle container_handle, - size_t* ret_num_inputs) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_num_inputs = container->num_inputs(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetInputName( - AOTInductorModelContainerHandle container_handle, - size_t input_idx, - const char** ret_input_names) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_input_names = container->input_name(input_idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumOutputs( - AOTInductorModelContainerHandle container_handle, - size_t* ret_num_outputs) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_num_outputs = container->num_outputs(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetOutputName( - AOTInductorModelContainerHandle container_handle, - size_t output_idx, - const char** ret_output_names) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *ret_output_names = container->output_name(output_idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetCallSpec( - AOTInductorModelContainerHandle container_handle, - const char** in_spec, - const char** out_spec) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - *in_spec = container->get_in_spec(); - *out_spec = container->get_out_spec(); - }) -} - -AOTIRuntimeError AOTInductorModelCreate( - AOTInductorModelHandle* model_handle, - AOTInductorConstantMapHandle constant_map_handle){ - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto constant_map = std::make_shared(); - auto constant_array = std::make_shared>(); - auto input_map = reinterpret_cast*>(constant_map_handle); - - auto model = new torch::aot_inductor::AOTInductorModel( - constant_map, - constant_array, - "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models - "" - ); - - if (input_map) { - for (auto const& kv : *input_map) { - constant_map->emplace(kv.first, kv.second); - } - } else { - model->load_constants(); - } - - *model_handle = reinterpret_cast(model); - })} - -AOTIRuntimeError AOTInductorModelRun( - AOTInductorModelHandle model_handle, - AtenTensorHandle* input_handles, - AtenTensorHandle* output_handles) { - auto model = - reinterpret_cast(model_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - model->run_impl( - input_handles, - output_handles, - (torch::aot_inductor::DeviceStreamType) nullptr, - nullptr); - }) -} - -AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){ - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto model = 
reinterpret_cast( - model_handle); - delete model; - })} - -AOTIRuntimeError AOTInductorModelGetNumOutputs( - AOTInductorModelHandle model_handle, - size_t* ret_num_outputs) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto model = reinterpret_cast(model_handle); - *ret_num_outputs = model->num_outputs(); - }) -} - -AOTIRuntimeError AOTInductorModelUpdateConstantsMap( - AOTInductorModelHandle model_handle, - AOTInductorConstantMapHandle constant_map_handle) { - auto model = - reinterpret_cast(model_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto constant_map = std::make_shared(); - auto input_map = - reinterpret_cast*>( - constant_map_handle); - - for (auto const& kv : *input_map) { - constant_map->emplace(kv.first, kv.second); - } - model->update_constants_map(std::move(constant_map)); - }) -} - -} // extern "C" - - -#define CUDA_DRIVER_CHECK(EXPR) \ -do { \ - CUresult code = EXPR; \ - const char *msg; \ - CUresult code_get_error = cuGetErrorString(code, &msg); \ - if (code_get_error != CUDA_SUCCESS) { \ - throw std::runtime_error( \ - std::string("CUDA driver error: ") + \ - std::string("invalid error code!")); \ - } \ - if (code != CUDA_SUCCESS) { \ - throw std::runtime_error( \ - std::string("CUDA driver error: ") + \ - std::string(msg)); \ - } \ -} while (0); - -static inline CUfunction loadKernel( - std::string filePath, - const std::string &funcName, - uint32_t sharedMemBytes, - const std::optional &cubinDir = std::nullopt) { - if (cubinDir) { - std::filesystem::path p1{*cubinDir}; - std::filesystem::path p2{filePath}; - filePath = (p1 / p2.filename()).string(); - } - - CUmodule mod; - CUfunction func; - CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str())); - CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); - if (sharedMemBytes > 0) { - CUDA_DRIVER_CHECK(cuFuncSetAttribute( - func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - sharedMemBytes - )) - } - return func; -} - -static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) { - CUmodule mod; - CUfunction func; - CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start)); - CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str())); - if (sharedMemBytes > 0) { - CUDA_DRIVER_CHECK(cuFuncSetAttribute( - func, - CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, - sharedMemBytes - )) - } - return func; -} - -static inline void launchKernel( - CUfunction func, - uint32_t gridX, - uint32_t gridY, - uint32_t gridZ, - uint32_t numWarps, - uint32_t sharedMemBytes, - void* args[], - cudaStream_t stream) { - CUDA_DRIVER_CHECK(cuLaunchKernel( - func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr - )); -} -CACHE_TORCH_DTYPE(float32); -CACHE_TORCH_DEVICE(cuda); -CACHE_TORCH_LAYOUT(strided); -namespace torch::aot_inductor { -namespace { -class AOTInductorModelKernels : public AOTInductorModelKernelsBase { - public: - CUfunction triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_10{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_14{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_17{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_21{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_24{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_3{nullptr}; - CUfunction 
triton_poi_fused__native_batch_norm_legit_no_training_6{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_12{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_16{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_19{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_23{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_add_8{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7{nullptr}; - CUfunction triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9{nullptr}; - CUfunction triton_poi_fused_convolution_0{nullptr}; - CUfunction triton_poi_fused_convolution_1{nullptr}; - CUfunction triton_poi_fused_permute_copy_26{nullptr}; -}; -} // namespace - - - -AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) - : AOTInductorModelBase(1, - 1, - 262, - device_str, - std::move(cubin_dir), - true) { - inputs_info_[0].name = "arg262_1"; - constants_info_[0].name = "mv2_features_0_0_weight"; - constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[0].offset = 0; - constants_info_[0].data_size = 3456; - constants_info_[0].from_folded = false; - constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[0].shape = {32, 3, 3, 3}; - constants_info_[0].stride = {27, 9, 3, 1}; - constants_info_[0].layout = static_cast(cached_torch_layout_strided); - constants_info_[0].original_fqn = "mv2.features.0.0.weight"; - constants_info_[1].name = "mv2_features_0_1_weight"; - constants_info_[1].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[1].offset = 0; - constants_info_[1].data_size = 128; - constants_info_[1].from_folded = false; - constants_info_[1].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[1].shape = {32}; - constants_info_[1].stride = {1}; - constants_info_[1].layout = static_cast(cached_torch_layout_strided); - constants_info_[1].original_fqn = "mv2.features.0.1.weight"; - constants_info_[2].name = "mv2_features_0_1_bias"; - constants_info_[2].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[2].offset = 0; - constants_info_[2].data_size = 128; - constants_info_[2].from_folded = false; - constants_info_[2].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[2].shape = {32}; - constants_info_[2].stride = {1}; - constants_info_[2].layout = static_cast(cached_torch_layout_strided); - constants_info_[2].original_fqn = "mv2.features.0.1.bias"; - 
constants_info_[3].name = "mv2_features_1_conv_0_0_weight"; - constants_info_[3].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[3].offset = 0; - constants_info_[3].data_size = 1152; - constants_info_[3].from_folded = false; - constants_info_[3].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[3].shape = {32, 1, 3, 3}; - constants_info_[3].stride = {9, 9, 3, 1}; - constants_info_[3].layout = static_cast(cached_torch_layout_strided); - constants_info_[3].original_fqn = "mv2.features.1.conv.0.0.weight"; - constants_info_[4].name = "mv2_features_1_conv_0_1_weight"; - constants_info_[4].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[4].offset = 0; - constants_info_[4].data_size = 128; - constants_info_[4].from_folded = false; - constants_info_[4].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[4].shape = {32}; - constants_info_[4].stride = {1}; - constants_info_[4].layout = static_cast(cached_torch_layout_strided); - constants_info_[4].original_fqn = "mv2.features.1.conv.0.1.weight"; - constants_info_[5].name = "mv2_features_1_conv_0_1_bias"; - constants_info_[5].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[5].offset = 0; - constants_info_[5].data_size = 128; - constants_info_[5].from_folded = false; - constants_info_[5].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[5].shape = {32}; - constants_info_[5].stride = {1}; - constants_info_[5].layout = static_cast(cached_torch_layout_strided); - constants_info_[5].original_fqn = "mv2.features.1.conv.0.1.bias"; - constants_info_[6].name = "mv2_features_1_conv_1_weight"; - constants_info_[6].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[6].offset = 0; - constants_info_[6].data_size = 2048; - constants_info_[6].from_folded = false; - constants_info_[6].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[6].shape = {16, 32, 1, 1}; - constants_info_[6].stride = {32, 1, 1, 1}; - constants_info_[6].layout = static_cast(cached_torch_layout_strided); - constants_info_[6].original_fqn = "mv2.features.1.conv.1.weight"; - constants_info_[7].name = "mv2_features_1_conv_2_weight"; - constants_info_[7].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[7].offset = 0; - constants_info_[7].data_size = 64; - constants_info_[7].from_folded = false; - constants_info_[7].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[7].shape = {16}; - constants_info_[7].stride = {1}; - constants_info_[7].layout = static_cast(cached_torch_layout_strided); - constants_info_[7].original_fqn = "mv2.features.1.conv.2.weight"; - constants_info_[8].name = "mv2_features_1_conv_2_bias"; - constants_info_[8].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[8].offset = 0; - constants_info_[8].data_size = 64; - constants_info_[8].from_folded = false; - constants_info_[8].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[8].shape = {16}; - constants_info_[8].stride = {1}; - constants_info_[8].layout = static_cast(cached_torch_layout_strided); - constants_info_[8].original_fqn = "mv2.features.1.conv.2.bias"; - constants_info_[9].name = "mv2_features_2_conv_0_0_weight"; - constants_info_[9].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[9].offset = 0; - constants_info_[9].data_size = 6144; - constants_info_[9].from_folded = false; - constants_info_[9].type = 
static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[9].shape = {96, 16, 1, 1}; - constants_info_[9].stride = {16, 1, 1, 1}; - constants_info_[9].layout = static_cast(cached_torch_layout_strided); - constants_info_[9].original_fqn = "mv2.features.2.conv.0.0.weight"; - constants_info_[10].name = "mv2_features_2_conv_0_1_weight"; - constants_info_[10].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[10].offset = 0; - constants_info_[10].data_size = 384; - constants_info_[10].from_folded = false; - constants_info_[10].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[10].shape = {96}; - constants_info_[10].stride = {1}; - constants_info_[10].layout = static_cast(cached_torch_layout_strided); - constants_info_[10].original_fqn = "mv2.features.2.conv.0.1.weight"; - constants_info_[11].name = "mv2_features_2_conv_0_1_bias"; - constants_info_[11].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[11].offset = 0; - constants_info_[11].data_size = 384; - constants_info_[11].from_folded = false; - constants_info_[11].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[11].shape = {96}; - constants_info_[11].stride = {1}; - constants_info_[11].layout = static_cast(cached_torch_layout_strided); - constants_info_[11].original_fqn = "mv2.features.2.conv.0.1.bias"; - constants_info_[12].name = "mv2_features_2_conv_1_0_weight"; - constants_info_[12].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[12].offset = 0; - constants_info_[12].data_size = 3456; - constants_info_[12].from_folded = false; - constants_info_[12].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[12].shape = {96, 1, 3, 3}; - constants_info_[12].stride = {9, 9, 3, 1}; - constants_info_[12].layout = static_cast(cached_torch_layout_strided); - constants_info_[12].original_fqn = "mv2.features.2.conv.1.0.weight"; - constants_info_[13].name = "mv2_features_2_conv_1_1_weight"; - constants_info_[13].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[13].offset = 0; - constants_info_[13].data_size = 384; - constants_info_[13].from_folded = false; - constants_info_[13].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[13].shape = {96}; - constants_info_[13].stride = {1}; - constants_info_[13].layout = static_cast(cached_torch_layout_strided); - constants_info_[13].original_fqn = "mv2.features.2.conv.1.1.weight"; - constants_info_[14].name = "mv2_features_2_conv_1_1_bias"; - constants_info_[14].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[14].offset = 0; - constants_info_[14].data_size = 384; - constants_info_[14].from_folded = false; - constants_info_[14].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[14].shape = {96}; - constants_info_[14].stride = {1}; - constants_info_[14].layout = static_cast(cached_torch_layout_strided); - constants_info_[14].original_fqn = "mv2.features.2.conv.1.1.bias"; - constants_info_[15].name = "mv2_features_2_conv_2_weight"; - constants_info_[15].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[15].offset = 0; - constants_info_[15].data_size = 9216; - constants_info_[15].from_folded = false; - constants_info_[15].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[15].shape = {24, 96, 1, 1}; - constants_info_[15].stride = {96, 1, 1, 1}; - constants_info_[15].layout = 
static_cast(cached_torch_layout_strided); - constants_info_[15].original_fqn = "mv2.features.2.conv.2.weight"; - constants_info_[16].name = "mv2_features_2_conv_3_weight"; - constants_info_[16].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[16].offset = 0; - constants_info_[16].data_size = 96; - constants_info_[16].from_folded = false; - constants_info_[16].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[16].shape = {24}; - constants_info_[16].stride = {1}; - constants_info_[16].layout = static_cast(cached_torch_layout_strided); - constants_info_[16].original_fqn = "mv2.features.2.conv.3.weight"; - constants_info_[17].name = "mv2_features_2_conv_3_bias"; - constants_info_[17].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[17].offset = 0; - constants_info_[17].data_size = 96; - constants_info_[17].from_folded = false; - constants_info_[17].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[17].shape = {24}; - constants_info_[17].stride = {1}; - constants_info_[17].layout = static_cast(cached_torch_layout_strided); - constants_info_[17].original_fqn = "mv2.features.2.conv.3.bias"; - constants_info_[18].name = "mv2_features_3_conv_0_0_weight"; - constants_info_[18].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[18].offset = 0; - constants_info_[18].data_size = 13824; - constants_info_[18].from_folded = false; - constants_info_[18].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[18].shape = {144, 24, 1, 1}; - constants_info_[18].stride = {24, 1, 1, 1}; - constants_info_[18].layout = static_cast(cached_torch_layout_strided); - constants_info_[18].original_fqn = "mv2.features.3.conv.0.0.weight"; - constants_info_[19].name = "mv2_features_3_conv_0_1_weight"; - constants_info_[19].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[19].offset = 0; - constants_info_[19].data_size = 576; - constants_info_[19].from_folded = false; - constants_info_[19].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[19].shape = {144}; - constants_info_[19].stride = {1}; - constants_info_[19].layout = static_cast(cached_torch_layout_strided); - constants_info_[19].original_fqn = "mv2.features.3.conv.0.1.weight"; - constants_info_[20].name = "mv2_features_3_conv_0_1_bias"; - constants_info_[20].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[20].offset = 0; - constants_info_[20].data_size = 576; - constants_info_[20].from_folded = false; - constants_info_[20].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[20].shape = {144}; - constants_info_[20].stride = {1}; - constants_info_[20].layout = static_cast(cached_torch_layout_strided); - constants_info_[20].original_fqn = "mv2.features.3.conv.0.1.bias"; - constants_info_[21].name = "mv2_features_3_conv_1_0_weight"; - constants_info_[21].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[21].offset = 0; - constants_info_[21].data_size = 5184; - constants_info_[21].from_folded = false; - constants_info_[21].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[21].shape = {144, 1, 3, 3}; - constants_info_[21].stride = {9, 9, 3, 1}; - constants_info_[21].layout = static_cast(cached_torch_layout_strided); - constants_info_[21].original_fqn = "mv2.features.3.conv.1.0.weight"; - constants_info_[22].name = "mv2_features_3_conv_1_1_weight"; - constants_info_[22].dtype = 
-    constants_info_[22].offset = 0;
-    constants_info_[22].data_size = 576;
-    constants_info_[22].from_folded = false;
-    constants_info_[22].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[22].shape = {144};
-    constants_info_[22].stride = {1};
-    constants_info_[22].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[22].original_fqn = "mv2.features.3.conv.1.1.weight";
-    constants_info_[23].name = "mv2_features_3_conv_1_1_bias";
-    constants_info_[23].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[23].offset = 0;
-    constants_info_[23].data_size = 576;
-    constants_info_[23].from_folded = false;
-    constants_info_[23].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[23].shape = {144};
-    constants_info_[23].stride = {1};
-    constants_info_[23].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[23].original_fqn = "mv2.features.3.conv.1.1.bias";
-    constants_info_[24].name = "mv2_features_3_conv_2_weight";
-    constants_info_[24].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[24].offset = 0;
-    constants_info_[24].data_size = 13824;
-    constants_info_[24].from_folded = false;
-    constants_info_[24].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[24].shape = {24, 144, 1, 1};
-    constants_info_[24].stride = {144, 1, 1, 1};
-    constants_info_[24].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[24].original_fqn = "mv2.features.3.conv.2.weight";
-    constants_info_[25].name = "mv2_features_3_conv_3_weight";
-    constants_info_[25].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[25].offset = 0;
-    constants_info_[25].data_size = 96;
-    constants_info_[25].from_folded = false;
-    constants_info_[25].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[25].shape = {24};
-    constants_info_[25].stride = {1};
-    constants_info_[25].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[25].original_fqn = "mv2.features.3.conv.3.weight";
-    constants_info_[26].name = "mv2_features_3_conv_3_bias";
-    constants_info_[26].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[26].offset = 0;
-    constants_info_[26].data_size = 96;
-    constants_info_[26].from_folded = false;
-    constants_info_[26].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[26].shape = {24};
-    constants_info_[26].stride = {1};
-    constants_info_[26].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[26].original_fqn = "mv2.features.3.conv.3.bias";
-    constants_info_[27].name = "mv2_features_4_conv_0_0_weight";
-    constants_info_[27].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[27].offset = 0;
-    constants_info_[27].data_size = 13824;
-    constants_info_[27].from_folded = false;
-    constants_info_[27].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[27].shape = {144, 24, 1, 1};
-    constants_info_[27].stride = {24, 1, 1, 1};
-    constants_info_[27].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[27].original_fqn = "mv2.features.4.conv.0.0.weight";
-    constants_info_[28].name = "mv2_features_4_conv_0_1_weight";
-    constants_info_[28].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[28].offset = 0;
-    constants_info_[28].data_size = 576;
-    constants_info_[28].from_folded = false;
-    constants_info_[28].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[28].shape = {144};
-    constants_info_[28].stride = {1};
-    constants_info_[28].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[28].original_fqn = "mv2.features.4.conv.0.1.weight";
-    constants_info_[29].name = "mv2_features_4_conv_0_1_bias";
-    constants_info_[29].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[29].offset = 0;
-    constants_info_[29].data_size = 576;
-    constants_info_[29].from_folded = false;
-    constants_info_[29].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[29].shape = {144};
-    constants_info_[29].stride = {1};
-    constants_info_[29].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[29].original_fqn = "mv2.features.4.conv.0.1.bias";
-    constants_info_[30].name = "mv2_features_4_conv_1_0_weight";
-    constants_info_[30].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[30].offset = 0;
-    constants_info_[30].data_size = 5184;
-    constants_info_[30].from_folded = false;
-    constants_info_[30].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[30].shape = {144, 1, 3, 3};
-    constants_info_[30].stride = {9, 9, 3, 1};
-    constants_info_[30].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[30].original_fqn = "mv2.features.4.conv.1.0.weight";
-    constants_info_[31].name = "mv2_features_4_conv_1_1_weight";
-    constants_info_[31].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[31].offset = 0;
-    constants_info_[31].data_size = 576;
-    constants_info_[31].from_folded = false;
-    constants_info_[31].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[31].shape = {144};
-    constants_info_[31].stride = {1};
-    constants_info_[31].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[31].original_fqn = "mv2.features.4.conv.1.1.weight";
-    constants_info_[32].name = "mv2_features_4_conv_1_1_bias";
-    constants_info_[32].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[32].offset = 0;
-    constants_info_[32].data_size = 576;
-    constants_info_[32].from_folded = false;
-    constants_info_[32].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[32].shape = {144};
-    constants_info_[32].stride = {1};
-    constants_info_[32].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[32].original_fqn = "mv2.features.4.conv.1.1.bias";
-    constants_info_[33].name = "mv2_features_4_conv_2_weight";
-    constants_info_[33].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[33].offset = 0;
-    constants_info_[33].data_size = 18432;
-    constants_info_[33].from_folded = false;
-    constants_info_[33].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[33].shape = {32, 144, 1, 1};
-    constants_info_[33].stride = {144, 1, 1, 1};
-    constants_info_[33].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[33].original_fqn = "mv2.features.4.conv.2.weight";
-    constants_info_[34].name = "mv2_features_4_conv_3_weight";
-    constants_info_[34].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[34].offset = 0;
-    constants_info_[34].data_size = 128;
-    constants_info_[34].from_folded = false;
-    constants_info_[34].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[34].shape = {32};
-    constants_info_[34].stride = {1};
-    constants_info_[34].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[34].original_fqn = "mv2.features.4.conv.3.weight";
-    constants_info_[35].name = "mv2_features_4_conv_3_bias";
-    constants_info_[35].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[35].offset = 0;
-    constants_info_[35].data_size = 128;
-    constants_info_[35].from_folded = false;
-    constants_info_[35].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[35].shape = {32};
-    constants_info_[35].stride = {1};
-    constants_info_[35].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[35].original_fqn = "mv2.features.4.conv.3.bias";
-    constants_info_[36].name = "mv2_features_5_conv_0_0_weight";
-    constants_info_[36].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[36].offset = 0;
-    constants_info_[36].data_size = 24576;
-    constants_info_[36].from_folded = false;
-    constants_info_[36].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[36].shape = {192, 32, 1, 1};
-    constants_info_[36].stride = {32, 1, 1, 1};
-    constants_info_[36].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[36].original_fqn = "mv2.features.5.conv.0.0.weight";
-    constants_info_[37].name = "mv2_features_5_conv_0_1_weight";
-    constants_info_[37].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[37].offset = 0;
-    constants_info_[37].data_size = 768;
-    constants_info_[37].from_folded = false;
-    constants_info_[37].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[37].shape = {192};
-    constants_info_[37].stride = {1};
-    constants_info_[37].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[37].original_fqn = "mv2.features.5.conv.0.1.weight";
-    constants_info_[38].name = "mv2_features_5_conv_0_1_bias";
-    constants_info_[38].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[38].offset = 0;
-    constants_info_[38].data_size = 768;
-    constants_info_[38].from_folded = false;
-    constants_info_[38].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[38].shape = {192};
-    constants_info_[38].stride = {1};
-    constants_info_[38].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[38].original_fqn = "mv2.features.5.conv.0.1.bias";
-    constants_info_[39].name = "mv2_features_5_conv_1_0_weight";
-    constants_info_[39].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[39].offset = 0;
-    constants_info_[39].data_size = 6912;
-    constants_info_[39].from_folded = false;
-    constants_info_[39].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[39].shape = {192, 1, 3, 3};
-    constants_info_[39].stride = {9, 9, 3, 1};
-    constants_info_[39].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[39].original_fqn = "mv2.features.5.conv.1.0.weight";
-    constants_info_[40].name = "mv2_features_5_conv_1_1_weight";
-    constants_info_[40].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[40].offset = 0;
-    constants_info_[40].data_size = 768;
-    constants_info_[40].from_folded = false;
-    constants_info_[40].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[40].shape = {192};
-    constants_info_[40].stride = {1};
-    constants_info_[40].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[40].original_fqn = "mv2.features.5.conv.1.1.weight";
-    constants_info_[41].name = "mv2_features_5_conv_1_1_bias";
-    constants_info_[41].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[41].offset = 0;
-    constants_info_[41].data_size = 768;
-    constants_info_[41].from_folded = false;
-    constants_info_[41].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[41].shape = {192};
-    constants_info_[41].stride = {1};
-    constants_info_[41].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[41].original_fqn = "mv2.features.5.conv.1.1.bias";
-    constants_info_[42].name = "mv2_features_5_conv_2_weight";
-    constants_info_[42].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[42].offset = 0;
-    constants_info_[42].data_size = 24576;
-    constants_info_[42].from_folded = false;
-    constants_info_[42].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[42].shape = {32, 192, 1, 1};
-    constants_info_[42].stride = {192, 1, 1, 1};
-    constants_info_[42].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[42].original_fqn = "mv2.features.5.conv.2.weight";
-    constants_info_[43].name = "mv2_features_5_conv_3_weight";
-    constants_info_[43].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[43].offset = 0;
-    constants_info_[43].data_size = 128;
-    constants_info_[43].from_folded = false;
-    constants_info_[43].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[43].shape = {32};
-    constants_info_[43].stride = {1};
-    constants_info_[43].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[43].original_fqn = "mv2.features.5.conv.3.weight";
-    constants_info_[44].name = "mv2_features_5_conv_3_bias";
-    constants_info_[44].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[44].offset = 0;
-    constants_info_[44].data_size = 128;
-    constants_info_[44].from_folded = false;
-    constants_info_[44].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[44].shape = {32};
-    constants_info_[44].stride = {1};
-    constants_info_[44].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[44].original_fqn = "mv2.features.5.conv.3.bias";
-    constants_info_[45].name = "mv2_features_6_conv_0_0_weight";
-    constants_info_[45].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[45].offset = 0;
-    constants_info_[45].data_size = 24576;
-    constants_info_[45].from_folded = false;
-    constants_info_[45].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[45].shape = {192, 32, 1, 1};
-    constants_info_[45].stride = {32, 1, 1, 1};
-    constants_info_[45].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[45].original_fqn = "mv2.features.6.conv.0.0.weight";
-    constants_info_[46].name = "mv2_features_6_conv_0_1_weight";
-    constants_info_[46].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[46].offset = 0;
-    constants_info_[46].data_size = 768;
-    constants_info_[46].from_folded = false;
-    constants_info_[46].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[46].shape = {192};
-    constants_info_[46].stride = {1};
-    constants_info_[46].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[46].original_fqn = "mv2.features.6.conv.0.1.weight";
-    constants_info_[47].name = "mv2_features_6_conv_0_1_bias";
-    constants_info_[47].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[47].offset = 0;
-    constants_info_[47].data_size = 768;
-    constants_info_[47].from_folded = false;
-    constants_info_[47].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[47].shape = {192};
-    constants_info_[47].stride = {1};
-    constants_info_[47].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[47].original_fqn = "mv2.features.6.conv.0.1.bias";
-    constants_info_[48].name = "mv2_features_6_conv_1_0_weight";
-    constants_info_[48].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[48].offset = 0;
-    constants_info_[48].data_size = 6912;
-    constants_info_[48].from_folded = false;
-    constants_info_[48].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[48].shape = {192, 1, 3, 3};
-    constants_info_[48].stride = {9, 9, 3, 1};
-    constants_info_[48].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[48].original_fqn = "mv2.features.6.conv.1.0.weight";
-    constants_info_[49].name = "mv2_features_6_conv_1_1_weight";
-    constants_info_[49].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[49].offset = 0;
-    constants_info_[49].data_size = 768;
-    constants_info_[49].from_folded = false;
-    constants_info_[49].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[49].shape = {192};
-    constants_info_[49].stride = {1};
-    constants_info_[49].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[49].original_fqn = "mv2.features.6.conv.1.1.weight";
-    constants_info_[50].name = "mv2_features_6_conv_1_1_bias";
-    constants_info_[50].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[50].offset = 0;
-    constants_info_[50].data_size = 768;
-    constants_info_[50].from_folded = false;
-    constants_info_[50].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[50].shape = {192};
-    constants_info_[50].stride = {1};
-    constants_info_[50].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[50].original_fqn = "mv2.features.6.conv.1.1.bias";
-    constants_info_[51].name = "mv2_features_6_conv_2_weight";
-    constants_info_[51].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[51].offset = 0;
-    constants_info_[51].data_size = 24576;
-    constants_info_[51].from_folded = false;
-    constants_info_[51].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[51].shape = {32, 192, 1, 1};
-    constants_info_[51].stride = {192, 1, 1, 1};
-    constants_info_[51].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[51].original_fqn = "mv2.features.6.conv.2.weight";
-    constants_info_[52].name = "mv2_features_6_conv_3_weight";
-    constants_info_[52].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[52].offset = 0;
-    constants_info_[52].data_size = 128;
-    constants_info_[52].from_folded = false;
-    constants_info_[52].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[52].shape = {32};
-    constants_info_[52].stride = {1};
-    constants_info_[52].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[52].original_fqn = "mv2.features.6.conv.3.weight";
-    constants_info_[53].name = "mv2_features_6_conv_3_bias";
-    constants_info_[53].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[53].offset = 0;
-    constants_info_[53].data_size = 128;
-    constants_info_[53].from_folded = false;
-    constants_info_[53].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[53].shape = {32};
-    constants_info_[53].stride = {1};
-    constants_info_[53].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[53].original_fqn = "mv2.features.6.conv.3.bias";
-    constants_info_[54].name = "mv2_features_7_conv_0_0_weight";
-    constants_info_[54].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[54].offset = 0;
-    constants_info_[54].data_size = 24576;
-    constants_info_[54].from_folded = false;
-    constants_info_[54].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[54].shape = {192, 32, 1, 1};
-    constants_info_[54].stride = {32, 1, 1, 1};
-    constants_info_[54].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[54].original_fqn = "mv2.features.7.conv.0.0.weight";
-    constants_info_[55].name = "mv2_features_7_conv_0_1_weight";
-    constants_info_[55].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[55].offset = 0;
-    constants_info_[55].data_size = 768;
-    constants_info_[55].from_folded = false;
-    constants_info_[55].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[55].shape = {192};
-    constants_info_[55].stride = {1};
-    constants_info_[55].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[55].original_fqn = "mv2.features.7.conv.0.1.weight";
-    constants_info_[56].name = "mv2_features_7_conv_0_1_bias";
-    constants_info_[56].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[56].offset = 0;
-    constants_info_[56].data_size = 768;
-    constants_info_[56].from_folded = false;
-    constants_info_[56].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[56].shape = {192};
-    constants_info_[56].stride = {1};
-    constants_info_[56].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[56].original_fqn = "mv2.features.7.conv.0.1.bias";
-    constants_info_[57].name = "mv2_features_7_conv_1_0_weight";
-    constants_info_[57].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[57].offset = 0;
-    constants_info_[57].data_size = 6912;
-    constants_info_[57].from_folded = false;
-    constants_info_[57].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[57].shape = {192, 1, 3, 3};
-    constants_info_[57].stride = {9, 9, 3, 1};
-    constants_info_[57].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[57].original_fqn = "mv2.features.7.conv.1.0.weight";
-    constants_info_[58].name = "mv2_features_7_conv_1_1_weight";
-    constants_info_[58].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[58].offset = 0;
-    constants_info_[58].data_size = 768;
-    constants_info_[58].from_folded = false;
-    constants_info_[58].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[58].shape = {192};
-    constants_info_[58].stride = {1};
-    constants_info_[58].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[58].original_fqn = "mv2.features.7.conv.1.1.weight";
-    constants_info_[59].name = "mv2_features_7_conv_1_1_bias";
-    constants_info_[59].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[59].offset = 0;
-    constants_info_[59].data_size = 768;
-    constants_info_[59].from_folded = false;
-    constants_info_[59].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[59].shape = {192};
-    constants_info_[59].stride = {1};
-    constants_info_[59].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[59].original_fqn = "mv2.features.7.conv.1.1.bias";
-    constants_info_[60].name = "mv2_features_7_conv_2_weight";
-    constants_info_[60].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[60].offset = 0;
-    constants_info_[60].data_size = 49152;
-    constants_info_[60].from_folded = false;
-    constants_info_[60].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[60].shape = {64, 192, 1, 1};
-    constants_info_[60].stride = {192, 1, 1, 1};
-    constants_info_[60].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[60].original_fqn = "mv2.features.7.conv.2.weight";
-    constants_info_[61].name = "mv2_features_7_conv_3_weight";
-    constants_info_[61].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[61].offset = 0;
-    constants_info_[61].data_size = 256;
-    constants_info_[61].from_folded = false;
-    constants_info_[61].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[61].shape = {64};
-    constants_info_[61].stride = {1};
-    constants_info_[61].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[61].original_fqn = "mv2.features.7.conv.3.weight";
-    constants_info_[62].name = "mv2_features_7_conv_3_bias";
-    constants_info_[62].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[62].offset = 0;
-    constants_info_[62].data_size = 256;
-    constants_info_[62].from_folded = false;
-    constants_info_[62].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[62].shape = {64};
-    constants_info_[62].stride = {1};
-    constants_info_[62].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[62].original_fqn = "mv2.features.7.conv.3.bias";
-    constants_info_[63].name = "mv2_features_8_conv_0_0_weight";
-    constants_info_[63].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[63].offset = 0;
-    constants_info_[63].data_size = 98304;
-    constants_info_[63].from_folded = false;
-    constants_info_[63].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[63].shape = {384, 64, 1, 1};
-    constants_info_[63].stride = {64, 1, 1, 1};
-    constants_info_[63].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[63].original_fqn = "mv2.features.8.conv.0.0.weight";
-    constants_info_[64].name = "mv2_features_8_conv_0_1_weight";
-    constants_info_[64].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[64].offset = 0;
-    constants_info_[64].data_size = 1536;
-    constants_info_[64].from_folded = false;
-    constants_info_[64].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[64].shape = {384};
-    constants_info_[64].stride = {1};
-    constants_info_[64].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[64].original_fqn = "mv2.features.8.conv.0.1.weight";
-    constants_info_[65].name = "mv2_features_8_conv_0_1_bias";
-    constants_info_[65].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[65].offset = 0;
-    constants_info_[65].data_size = 1536;
-    constants_info_[65].from_folded = false;
-    constants_info_[65].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[65].shape = {384};
-    constants_info_[65].stride = {1};
-    constants_info_[65].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[65].original_fqn = "mv2.features.8.conv.0.1.bias";
-    constants_info_[66].name = "mv2_features_8_conv_1_0_weight";
-    constants_info_[66].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[66].offset = 0;
-    constants_info_[66].data_size = 13824;
-    constants_info_[66].from_folded = false;
-    constants_info_[66].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[66].shape = {384, 1, 3, 3};
-    constants_info_[66].stride = {9, 9, 3, 1};
-    constants_info_[66].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[66].original_fqn = "mv2.features.8.conv.1.0.weight";
-    constants_info_[67].name = "mv2_features_8_conv_1_1_weight";
-    constants_info_[67].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[67].offset = 0;
-    constants_info_[67].data_size = 1536;
-    constants_info_[67].from_folded = false;
-    constants_info_[67].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[67].shape = {384};
-    constants_info_[67].stride = {1};
-    constants_info_[67].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[67].original_fqn = "mv2.features.8.conv.1.1.weight";
-    constants_info_[68].name = "mv2_features_8_conv_1_1_bias";
-    constants_info_[68].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[68].offset = 0;
-    constants_info_[68].data_size = 1536;
-    constants_info_[68].from_folded = false;
-    constants_info_[68].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[68].shape = {384};
-    constants_info_[68].stride = {1};
-    constants_info_[68].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[68].original_fqn = "mv2.features.8.conv.1.1.bias";
-    constants_info_[69].name = "mv2_features_8_conv_2_weight";
-    constants_info_[69].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[69].offset = 0;
-    constants_info_[69].data_size = 98304;
-    constants_info_[69].from_folded = false;
-    constants_info_[69].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[69].shape = {64, 384, 1, 1};
-    constants_info_[69].stride = {384, 1, 1, 1};
-    constants_info_[69].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[69].original_fqn = "mv2.features.8.conv.2.weight";
-    constants_info_[70].name = "mv2_features_8_conv_3_weight";
-    constants_info_[70].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[70].offset = 0;
-    constants_info_[70].data_size = 256;
-    constants_info_[70].from_folded = false;
-    constants_info_[70].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[70].shape = {64};
-    constants_info_[70].stride = {1};
-    constants_info_[70].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[70].original_fqn = "mv2.features.8.conv.3.weight";
-    constants_info_[71].name = "mv2_features_8_conv_3_bias";
-    constants_info_[71].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[71].offset = 0;
-    constants_info_[71].data_size = 256;
-    constants_info_[71].from_folded = false;
-    constants_info_[71].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[71].shape = {64};
-    constants_info_[71].stride = {1};
-    constants_info_[71].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[71].original_fqn = "mv2.features.8.conv.3.bias";
-    constants_info_[72].name = "mv2_features_9_conv_0_0_weight";
-    constants_info_[72].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[72].offset = 0;
-    constants_info_[72].data_size = 98304;
-    constants_info_[72].from_folded = false;
-    constants_info_[72].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[72].shape = {384, 64, 1, 1};
-    constants_info_[72].stride = {64, 1, 1, 1};
-    constants_info_[72].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[72].original_fqn = "mv2.features.9.conv.0.0.weight";
-    constants_info_[73].name = "mv2_features_9_conv_0_1_weight";
-    constants_info_[73].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[73].offset = 0;
-    constants_info_[73].data_size = 1536;
-    constants_info_[73].from_folded = false;
-    constants_info_[73].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[73].shape = {384};
-    constants_info_[73].stride = {1};
-    constants_info_[73].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[73].original_fqn = "mv2.features.9.conv.0.1.weight";
-    constants_info_[74].name = "mv2_features_9_conv_0_1_bias";
-    constants_info_[74].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[74].offset = 0;
-    constants_info_[74].data_size = 1536;
-    constants_info_[74].from_folded = false;
-    constants_info_[74].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[74].shape = {384};
-    constants_info_[74].stride = {1};
-    constants_info_[74].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[74].original_fqn = "mv2.features.9.conv.0.1.bias";
-    constants_info_[75].name = "mv2_features_9_conv_1_0_weight";
-    constants_info_[75].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[75].offset = 0;
-    constants_info_[75].data_size = 13824;
-    constants_info_[75].from_folded = false;
-    constants_info_[75].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[75].shape = {384, 1, 3, 3};
-    constants_info_[75].stride = {9, 9, 3, 1};
-    constants_info_[75].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[75].original_fqn = "mv2.features.9.conv.1.0.weight";
-    constants_info_[76].name = "mv2_features_9_conv_1_1_weight";
-    constants_info_[76].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[76].offset = 0;
-    constants_info_[76].data_size = 1536;
-    constants_info_[76].from_folded = false;
-    constants_info_[76].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[76].shape = {384};
-    constants_info_[76].stride = {1};
-    constants_info_[76].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[76].original_fqn = "mv2.features.9.conv.1.1.weight";
-    constants_info_[77].name = "mv2_features_9_conv_1_1_bias";
-    constants_info_[77].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[77].offset = 0;
-    constants_info_[77].data_size = 1536;
-    constants_info_[77].from_folded = false;
-    constants_info_[77].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[77].shape = {384};
-    constants_info_[77].stride = {1};
-    constants_info_[77].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[77].original_fqn = "mv2.features.9.conv.1.1.bias";
-    constants_info_[78].name = "mv2_features_9_conv_2_weight";
-    constants_info_[78].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[78].offset = 0;
-    constants_info_[78].data_size = 98304;
-    constants_info_[78].from_folded = false;
-    constants_info_[78].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[78].shape = {64, 384, 1, 1};
-    constants_info_[78].stride = {384, 1, 1, 1};
-    constants_info_[78].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[78].original_fqn = "mv2.features.9.conv.2.weight";
-    constants_info_[79].name = "mv2_features_9_conv_3_weight";
"mv2_features_9_conv_3_weight"; - constants_info_[79].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[79].offset = 0; - constants_info_[79].data_size = 256; - constants_info_[79].from_folded = false; - constants_info_[79].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[79].shape = {64}; - constants_info_[79].stride = {1}; - constants_info_[79].layout = static_cast(cached_torch_layout_strided); - constants_info_[79].original_fqn = "mv2.features.9.conv.3.weight"; - constants_info_[80].name = "mv2_features_9_conv_3_bias"; - constants_info_[80].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[80].offset = 0; - constants_info_[80].data_size = 256; - constants_info_[80].from_folded = false; - constants_info_[80].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[80].shape = {64}; - constants_info_[80].stride = {1}; - constants_info_[80].layout = static_cast(cached_torch_layout_strided); - constants_info_[80].original_fqn = "mv2.features.9.conv.3.bias"; - constants_info_[81].name = "mv2_features_10_conv_0_0_weight"; - constants_info_[81].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[81].offset = 0; - constants_info_[81].data_size = 98304; - constants_info_[81].from_folded = false; - constants_info_[81].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[81].shape = {384, 64, 1, 1}; - constants_info_[81].stride = {64, 1, 1, 1}; - constants_info_[81].layout = static_cast(cached_torch_layout_strided); - constants_info_[81].original_fqn = "mv2.features.10.conv.0.0.weight"; - constants_info_[82].name = "mv2_features_10_conv_0_1_weight"; - constants_info_[82].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[82].offset = 0; - constants_info_[82].data_size = 1536; - constants_info_[82].from_folded = false; - constants_info_[82].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[82].shape = {384}; - constants_info_[82].stride = {1}; - constants_info_[82].layout = static_cast(cached_torch_layout_strided); - constants_info_[82].original_fqn = "mv2.features.10.conv.0.1.weight"; - constants_info_[83].name = "mv2_features_10_conv_0_1_bias"; - constants_info_[83].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[83].offset = 0; - constants_info_[83].data_size = 1536; - constants_info_[83].from_folded = false; - constants_info_[83].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[83].shape = {384}; - constants_info_[83].stride = {1}; - constants_info_[83].layout = static_cast(cached_torch_layout_strided); - constants_info_[83].original_fqn = "mv2.features.10.conv.0.1.bias"; - constants_info_[84].name = "mv2_features_10_conv_1_0_weight"; - constants_info_[84].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[84].offset = 0; - constants_info_[84].data_size = 13824; - constants_info_[84].from_folded = false; - constants_info_[84].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[84].shape = {384, 1, 3, 3}; - constants_info_[84].stride = {9, 9, 3, 1}; - constants_info_[84].layout = static_cast(cached_torch_layout_strided); - constants_info_[84].original_fqn = "mv2.features.10.conv.1.0.weight"; - constants_info_[85].name = "mv2_features_10_conv_1_1_weight"; - constants_info_[85].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[85].offset = 0; - constants_info_[85].data_size = 1536; - 
-    constants_info_[85].from_folded = false;
-    constants_info_[85].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[85].shape = {384};
-    constants_info_[85].stride = {1};
-    constants_info_[85].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[85].original_fqn = "mv2.features.10.conv.1.1.weight";
-    constants_info_[86].name = "mv2_features_10_conv_1_1_bias";
-    constants_info_[86].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[86].offset = 0;
-    constants_info_[86].data_size = 1536;
-    constants_info_[86].from_folded = false;
-    constants_info_[86].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[86].shape = {384};
-    constants_info_[86].stride = {1};
-    constants_info_[86].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[86].original_fqn = "mv2.features.10.conv.1.1.bias";
-    constants_info_[87].name = "mv2_features_10_conv_2_weight";
-    constants_info_[87].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[87].offset = 0;
-    constants_info_[87].data_size = 98304;
-    constants_info_[87].from_folded = false;
-    constants_info_[87].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[87].shape = {64, 384, 1, 1};
-    constants_info_[87].stride = {384, 1, 1, 1};
-    constants_info_[87].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[87].original_fqn = "mv2.features.10.conv.2.weight";
-    constants_info_[88].name = "mv2_features_10_conv_3_weight";
-    constants_info_[88].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[88].offset = 0;
-    constants_info_[88].data_size = 256;
-    constants_info_[88].from_folded = false;
-    constants_info_[88].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[88].shape = {64};
-    constants_info_[88].stride = {1};
-    constants_info_[88].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[88].original_fqn = "mv2.features.10.conv.3.weight";
-    constants_info_[89].name = "mv2_features_10_conv_3_bias";
-    constants_info_[89].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[89].offset = 0;
-    constants_info_[89].data_size = 256;
-    constants_info_[89].from_folded = false;
-    constants_info_[89].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[89].shape = {64};
-    constants_info_[89].stride = {1};
-    constants_info_[89].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[89].original_fqn = "mv2.features.10.conv.3.bias";
-    constants_info_[90].name = "mv2_features_11_conv_0_0_weight";
-    constants_info_[90].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[90].offset = 0;
-    constants_info_[90].data_size = 98304;
-    constants_info_[90].from_folded = false;
-    constants_info_[90].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[90].shape = {384, 64, 1, 1};
-    constants_info_[90].stride = {64, 1, 1, 1};
-    constants_info_[90].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[90].original_fqn = "mv2.features.11.conv.0.0.weight";
-    constants_info_[91].name = "mv2_features_11_conv_0_1_weight";
-    constants_info_[91].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[91].offset = 0;
-    constants_info_[91].data_size = 1536;
-    constants_info_[91].from_folded = false;
-    constants_info_[91].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[91].shape = {384};
-    constants_info_[91].stride = {1};
-    constants_info_[91].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[91].original_fqn = "mv2.features.11.conv.0.1.weight";
-    constants_info_[92].name = "mv2_features_11_conv_0_1_bias";
-    constants_info_[92].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[92].offset = 0;
-    constants_info_[92].data_size = 1536;
-    constants_info_[92].from_folded = false;
-    constants_info_[92].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[92].shape = {384};
-    constants_info_[92].stride = {1};
-    constants_info_[92].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[92].original_fqn = "mv2.features.11.conv.0.1.bias";
-    constants_info_[93].name = "mv2_features_11_conv_1_0_weight";
-    constants_info_[93].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[93].offset = 0;
-    constants_info_[93].data_size = 13824;
-    constants_info_[93].from_folded = false;
-    constants_info_[93].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[93].shape = {384, 1, 3, 3};
-    constants_info_[93].stride = {9, 9, 3, 1};
-    constants_info_[93].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[93].original_fqn = "mv2.features.11.conv.1.0.weight";
-    constants_info_[94].name = "mv2_features_11_conv_1_1_weight";
-    constants_info_[94].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[94].offset = 0;
-    constants_info_[94].data_size = 1536;
-    constants_info_[94].from_folded = false;
-    constants_info_[94].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[94].shape = {384};
-    constants_info_[94].stride = {1};
-    constants_info_[94].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[94].original_fqn = "mv2.features.11.conv.1.1.weight";
-    constants_info_[95].name = "mv2_features_11_conv_1_1_bias";
-    constants_info_[95].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[95].offset = 0;
-    constants_info_[95].data_size = 1536;
-    constants_info_[95].from_folded = false;
-    constants_info_[95].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[95].shape = {384};
-    constants_info_[95].stride = {1};
-    constants_info_[95].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[95].original_fqn = "mv2.features.11.conv.1.1.bias";
-    constants_info_[96].name = "mv2_features_11_conv_2_weight";
-    constants_info_[96].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[96].offset = 0;
-    constants_info_[96].data_size = 147456;
-    constants_info_[96].from_folded = false;
-    constants_info_[96].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[96].shape = {96, 384, 1, 1};
-    constants_info_[96].stride = {384, 1, 1, 1};
-    constants_info_[96].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[96].original_fqn = "mv2.features.11.conv.2.weight";
-    constants_info_[97].name = "mv2_features_11_conv_3_weight";
-    constants_info_[97].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[97].offset = 0;
-    constants_info_[97].data_size = 384;
-    constants_info_[97].from_folded = false;
-    constants_info_[97].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[97].shape = {96};
-    constants_info_[97].stride = {1};
-    constants_info_[97].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[97].original_fqn = "mv2.features.11.conv.3.weight";
-    constants_info_[98].name = "mv2_features_11_conv_3_bias";
-    constants_info_[98].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[98].offset = 0;
-    constants_info_[98].data_size = 384;
-    constants_info_[98].from_folded = false;
-    constants_info_[98].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[98].shape = {96};
-    constants_info_[98].stride = {1};
-    constants_info_[98].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[98].original_fqn = "mv2.features.11.conv.3.bias";
-    constants_info_[99].name = "mv2_features_12_conv_0_0_weight";
-    constants_info_[99].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[99].offset = 0;
-    constants_info_[99].data_size = 221184;
-    constants_info_[99].from_folded = false;
-    constants_info_[99].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[99].shape = {576, 96, 1, 1};
-    constants_info_[99].stride = {96, 1, 1, 1};
-    constants_info_[99].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[99].original_fqn = "mv2.features.12.conv.0.0.weight";
-    constants_info_[100].name = "mv2_features_12_conv_0_1_weight";
-    constants_info_[100].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[100].offset = 0;
-    constants_info_[100].data_size = 2304;
-    constants_info_[100].from_folded = false;
-    constants_info_[100].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[100].shape = {576};
-    constants_info_[100].stride = {1};
-    constants_info_[100].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[100].original_fqn = "mv2.features.12.conv.0.1.weight";
-    constants_info_[101].name = "mv2_features_12_conv_0_1_bias";
-    constants_info_[101].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[101].offset = 0;
-    constants_info_[101].data_size = 2304;
-    constants_info_[101].from_folded = false;
-    constants_info_[101].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[101].shape = {576};
-    constants_info_[101].stride = {1};
-    constants_info_[101].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[101].original_fqn = "mv2.features.12.conv.0.1.bias";
-    constants_info_[102].name = "mv2_features_12_conv_1_0_weight";
-    constants_info_[102].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[102].offset = 0;
-    constants_info_[102].data_size = 20736;
-    constants_info_[102].from_folded = false;
-    constants_info_[102].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[102].shape = {576, 1, 3, 3};
-    constants_info_[102].stride = {9, 9, 3, 1};
-    constants_info_[102].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[102].original_fqn = "mv2.features.12.conv.1.0.weight";
-    constants_info_[103].name = "mv2_features_12_conv_1_1_weight";
-    constants_info_[103].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[103].offset = 0;
-    constants_info_[103].data_size = 2304;
-    constants_info_[103].from_folded = false;
-    constants_info_[103].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[103].shape = {576};
-    constants_info_[103].stride = {1};
-    constants_info_[103].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[103].original_fqn = "mv2.features.12.conv.1.1.weight";
-    constants_info_[104].name = "mv2_features_12_conv_1_1_bias";
-    constants_info_[104].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[104].offset = 0;
-    constants_info_[104].data_size = 2304;
-    constants_info_[104].from_folded = false;
-    constants_info_[104].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[104].shape = {576};
-    constants_info_[104].stride = {1};
-    constants_info_[104].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[104].original_fqn = "mv2.features.12.conv.1.1.bias";
-    constants_info_[105].name = "mv2_features_12_conv_2_weight";
-    constants_info_[105].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[105].offset = 0;
-    constants_info_[105].data_size = 221184;
-    constants_info_[105].from_folded = false;
-    constants_info_[105].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[105].shape = {96, 576, 1, 1};
-    constants_info_[105].stride = {576, 1, 1, 1};
-    constants_info_[105].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[105].original_fqn = "mv2.features.12.conv.2.weight";
-    constants_info_[106].name = "mv2_features_12_conv_3_weight";
-    constants_info_[106].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[106].offset = 0;
-    constants_info_[106].data_size = 384;
-    constants_info_[106].from_folded = false;
-    constants_info_[106].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[106].shape = {96};
-    constants_info_[106].stride = {1};
-    constants_info_[106].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[106].original_fqn = "mv2.features.12.conv.3.weight";
-    constants_info_[107].name = "mv2_features_12_conv_3_bias";
-    constants_info_[107].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[107].offset = 0;
-    constants_info_[107].data_size = 384;
-    constants_info_[107].from_folded = false;
-    constants_info_[107].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[107].shape = {96};
-    constants_info_[107].stride = {1};
-    constants_info_[107].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[107].original_fqn = "mv2.features.12.conv.3.bias";
-    constants_info_[108].name = "mv2_features_13_conv_0_0_weight";
-    constants_info_[108].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[108].offset = 0;
-    constants_info_[108].data_size = 221184;
-    constants_info_[108].from_folded = false;
-    constants_info_[108].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[108].shape = {576, 96, 1, 1};
-    constants_info_[108].stride = {96, 1, 1, 1};
-    constants_info_[108].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[108].original_fqn = "mv2.features.13.conv.0.0.weight";
-    constants_info_[109].name = "mv2_features_13_conv_0_1_weight";
-    constants_info_[109].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[109].offset = 0;
-    constants_info_[109].data_size = 2304;
-    constants_info_[109].from_folded = false;
-    constants_info_[109].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[109].shape = {576};
-    constants_info_[109].stride = {1};
-    constants_info_[109].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[109].original_fqn = "mv2.features.13.conv.0.1.weight";
-    constants_info_[110].name = "mv2_features_13_conv_0_1_bias";
-    constants_info_[110].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[110].offset = 0;
-    constants_info_[110].data_size = 2304;
-    constants_info_[110].from_folded = false;
-    constants_info_[110].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[110].shape = {576};
-    constants_info_[110].stride = {1};
-    constants_info_[110].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[110].original_fqn = "mv2.features.13.conv.0.1.bias";
-    constants_info_[111].name = "mv2_features_13_conv_1_0_weight";
-    constants_info_[111].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[111].offset = 0;
-    constants_info_[111].data_size = 20736;
-    constants_info_[111].from_folded = false;
-    constants_info_[111].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[111].shape = {576, 1, 3, 3};
-    constants_info_[111].stride = {9, 9, 3, 1};
-    constants_info_[111].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[111].original_fqn = "mv2.features.13.conv.1.0.weight";
-    constants_info_[112].name = "mv2_features_13_conv_1_1_weight";
-    constants_info_[112].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[112].offset = 0;
-    constants_info_[112].data_size = 2304;
-    constants_info_[112].from_folded = false;
-    constants_info_[112].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[112].shape = {576};
-    constants_info_[112].stride = {1};
-    constants_info_[112].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[112].original_fqn = "mv2.features.13.conv.1.1.weight";
-    constants_info_[113].name = "mv2_features_13_conv_1_1_bias";
-    constants_info_[113].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[113].offset = 0;
-    constants_info_[113].data_size = 2304;
-    constants_info_[113].from_folded = false;
-    constants_info_[113].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[113].shape = {576};
-    constants_info_[113].stride = {1};
-    constants_info_[113].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[113].original_fqn = "mv2.features.13.conv.1.1.bias";
-    constants_info_[114].name = "mv2_features_13_conv_2_weight";
-    constants_info_[114].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[114].offset = 0;
-    constants_info_[114].data_size = 221184;
-    constants_info_[114].from_folded = false;
-    constants_info_[114].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[114].shape = {96, 576, 1, 1};
-    constants_info_[114].stride = {576, 1, 1, 1};
-    constants_info_[114].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[114].original_fqn = "mv2.features.13.conv.2.weight";
-    constants_info_[115].name = "mv2_features_13_conv_3_weight";
-    constants_info_[115].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[115].offset = 0;
-    constants_info_[115].data_size = 384;
-    constants_info_[115].from_folded = false;
-    constants_info_[115].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[115].shape = {96};
-    constants_info_[115].stride = {1};
-    constants_info_[115].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[115].original_fqn = "mv2.features.13.conv.3.weight";
-    constants_info_[116].name = "mv2_features_13_conv_3_bias";
-    constants_info_[116].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[116].offset = 0;
-    constants_info_[116].data_size = 384;
-    constants_info_[116].from_folded = false;
-    constants_info_[116].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[116].shape = {96};
-    constants_info_[116].stride = {1};
-    constants_info_[116].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[116].original_fqn = "mv2.features.13.conv.3.bias";
-    constants_info_[117].name = "mv2_features_14_conv_0_0_weight";
-    constants_info_[117].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[117].offset = 0;
-    constants_info_[117].data_size = 221184;
-    constants_info_[117].from_folded = false;
-    constants_info_[117].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[117].shape = {576, 96, 1, 1};
-    constants_info_[117].stride = {96, 1, 1, 1};
-    constants_info_[117].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[117].original_fqn = "mv2.features.14.conv.0.0.weight";
-    constants_info_[118].name = "mv2_features_14_conv_0_1_weight";
-    constants_info_[118].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[118].offset = 0;
-    constants_info_[118].data_size = 2304;
-    constants_info_[118].from_folded = false;
-    constants_info_[118].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[118].shape = {576};
-    constants_info_[118].stride = {1};
-    constants_info_[118].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[118].original_fqn = "mv2.features.14.conv.0.1.weight";
-    constants_info_[119].name = "mv2_features_14_conv_0_1_bias";
-    constants_info_[119].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[119].offset = 0;
-    constants_info_[119].data_size = 2304;
-    constants_info_[119].from_folded = false;
-    constants_info_[119].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[119].shape = {576};
-    constants_info_[119].stride = {1};
-    constants_info_[119].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[119].original_fqn = "mv2.features.14.conv.0.1.bias";
-    constants_info_[120].name = "mv2_features_14_conv_1_0_weight";
-    constants_info_[120].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[120].offset = 0;
-    constants_info_[120].data_size = 20736;
-    constants_info_[120].from_folded = false;
-    constants_info_[120].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[120].shape = {576, 1, 3, 3};
-    constants_info_[120].stride = {9, 9, 3, 1};
-    constants_info_[120].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[120].original_fqn = "mv2.features.14.conv.1.0.weight";
-    constants_info_[121].name = "mv2_features_14_conv_1_1_weight";
-    constants_info_[121].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[121].offset = 0;
-    constants_info_[121].data_size = 2304;
-    constants_info_[121].from_folded = false;
-    constants_info_[121].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[121].shape = {576};
-    constants_info_[121].stride = {1};
-    constants_info_[121].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[121].original_fqn = "mv2.features.14.conv.1.1.weight";
-    constants_info_[122].name = "mv2_features_14_conv_1_1_bias";
-    constants_info_[122].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[122].offset = 0;
-    constants_info_[122].data_size = 2304;
-    constants_info_[122].from_folded = false;
-    constants_info_[122].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[122].shape = {576};
-    constants_info_[122].stride = {1};
-    constants_info_[122].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[122].original_fqn = "mv2.features.14.conv.1.1.bias";
-    constants_info_[123].name = "mv2_features_14_conv_2_weight";
-    constants_info_[123].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[123].offset = 0;
-    constants_info_[123].data_size = 368640;
-    constants_info_[123].from_folded = false;
-    constants_info_[123].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[123].shape = {160, 576, 1, 1};
-    constants_info_[123].stride = {576, 1, 1, 1};
-    constants_info_[123].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[123].original_fqn = "mv2.features.14.conv.2.weight";
-    constants_info_[124].name = "mv2_features_14_conv_3_weight";
-    constants_info_[124].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[124].offset = 0;
-    constants_info_[124].data_size = 640;
-    constants_info_[124].from_folded = false;
-    constants_info_[124].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[124].shape = {160};
-    constants_info_[124].stride = {1};
-    constants_info_[124].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[124].original_fqn = "mv2.features.14.conv.3.weight";
-    constants_info_[125].name = "mv2_features_14_conv_3_bias";
-    constants_info_[125].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[125].offset = 0;
-    constants_info_[125].data_size = 640;
-    constants_info_[125].from_folded = false;
-    constants_info_[125].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[125].shape = {160};
-    constants_info_[125].stride = {1};
-    constants_info_[125].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[125].original_fqn = "mv2.features.14.conv.3.bias";
-    constants_info_[126].name = "mv2_features_15_conv_0_0_weight";
-    constants_info_[126].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[126].offset = 0;
-    constants_info_[126].data_size = 614400;
-    constants_info_[126].from_folded = false;
-    constants_info_[126].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[126].shape = {960, 160, 1, 1};
-    constants_info_[126].stride = {160, 1, 1, 1};
-    constants_info_[126].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[126].original_fqn = "mv2.features.15.conv.0.0.weight";
-    constants_info_[127].name = "mv2_features_15_conv_0_1_weight";
-    constants_info_[127].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[127].offset = 0;
-    constants_info_[127].data_size = 3840;
-    constants_info_[127].from_folded = false;
-    constants_info_[127].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[127].shape = {960};
-    constants_info_[127].stride = {1};
-    constants_info_[127].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[127].original_fqn = "mv2.features.15.conv.0.1.weight";
-    constants_info_[128].name = "mv2_features_15_conv_0_1_bias";
-    constants_info_[128].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[128].offset = 0;
-    constants_info_[128].data_size = 3840;
-    constants_info_[128].from_folded = false;
-    constants_info_[128].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-    constants_info_[128].shape = {960};
-    constants_info_[128].stride = {1};
-    constants_info_[128].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[128].original_fqn = "mv2.features.15.conv.0.1.bias";
-    constants_info_[129].name = "mv2_features_15_conv_1_0_weight";
"mv2_features_15_conv_1_0_weight"; - constants_info_[129].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[129].offset = 0; - constants_info_[129].data_size = 34560; - constants_info_[129].from_folded = false; - constants_info_[129].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[129].shape = {960, 1, 3, 3}; - constants_info_[129].stride = {9, 9, 3, 1}; - constants_info_[129].layout = static_cast(cached_torch_layout_strided); - constants_info_[129].original_fqn = "mv2.features.15.conv.1.0.weight"; - constants_info_[130].name = "mv2_features_15_conv_1_1_weight"; - constants_info_[130].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[130].offset = 0; - constants_info_[130].data_size = 3840; - constants_info_[130].from_folded = false; - constants_info_[130].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[130].shape = {960}; - constants_info_[130].stride = {1}; - constants_info_[130].layout = static_cast(cached_torch_layout_strided); - constants_info_[130].original_fqn = "mv2.features.15.conv.1.1.weight"; - constants_info_[131].name = "mv2_features_15_conv_1_1_bias"; - constants_info_[131].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[131].offset = 0; - constants_info_[131].data_size = 3840; - constants_info_[131].from_folded = false; - constants_info_[131].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[131].shape = {960}; - constants_info_[131].stride = {1}; - constants_info_[131].layout = static_cast(cached_torch_layout_strided); - constants_info_[131].original_fqn = "mv2.features.15.conv.1.1.bias"; - constants_info_[132].name = "mv2_features_15_conv_2_weight"; - constants_info_[132].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[132].offset = 0; - constants_info_[132].data_size = 614400; - constants_info_[132].from_folded = false; - constants_info_[132].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[132].shape = {160, 960, 1, 1}; - constants_info_[132].stride = {960, 1, 1, 1}; - constants_info_[132].layout = static_cast(cached_torch_layout_strided); - constants_info_[132].original_fqn = "mv2.features.15.conv.2.weight"; - constants_info_[133].name = "mv2_features_15_conv_3_weight"; - constants_info_[133].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[133].offset = 0; - constants_info_[133].data_size = 640; - constants_info_[133].from_folded = false; - constants_info_[133].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[133].shape = {160}; - constants_info_[133].stride = {1}; - constants_info_[133].layout = static_cast(cached_torch_layout_strided); - constants_info_[133].original_fqn = "mv2.features.15.conv.3.weight"; - constants_info_[134].name = "mv2_features_15_conv_3_bias"; - constants_info_[134].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[134].offset = 0; - constants_info_[134].data_size = 640; - constants_info_[134].from_folded = false; - constants_info_[134].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[134].shape = {160}; - constants_info_[134].stride = {1}; - constants_info_[134].layout = static_cast(cached_torch_layout_strided); - constants_info_[134].original_fqn = "mv2.features.15.conv.3.bias"; - constants_info_[135].name = "mv2_features_16_conv_0_0_weight"; - constants_info_[135].dtype = static_cast(cached_torch_dtype_float32); - 
constants_info_[135].offset = 0; - constants_info_[135].data_size = 614400; - constants_info_[135].from_folded = false; - constants_info_[135].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[135].shape = {960, 160, 1, 1}; - constants_info_[135].stride = {160, 1, 1, 1}; - constants_info_[135].layout = static_cast(cached_torch_layout_strided); - constants_info_[135].original_fqn = "mv2.features.16.conv.0.0.weight"; - constants_info_[136].name = "mv2_features_16_conv_0_1_weight"; - constants_info_[136].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[136].offset = 0; - constants_info_[136].data_size = 3840; - constants_info_[136].from_folded = false; - constants_info_[136].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[136].shape = {960}; - constants_info_[136].stride = {1}; - constants_info_[136].layout = static_cast(cached_torch_layout_strided); - constants_info_[136].original_fqn = "mv2.features.16.conv.0.1.weight"; - constants_info_[137].name = "mv2_features_16_conv_0_1_bias"; - constants_info_[137].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[137].offset = 0; - constants_info_[137].data_size = 3840; - constants_info_[137].from_folded = false; - constants_info_[137].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[137].shape = {960}; - constants_info_[137].stride = {1}; - constants_info_[137].layout = static_cast(cached_torch_layout_strided); - constants_info_[137].original_fqn = "mv2.features.16.conv.0.1.bias"; - constants_info_[138].name = "mv2_features_16_conv_1_0_weight"; - constants_info_[138].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[138].offset = 0; - constants_info_[138].data_size = 34560; - constants_info_[138].from_folded = false; - constants_info_[138].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[138].shape = {960, 1, 3, 3}; - constants_info_[138].stride = {9, 9, 3, 1}; - constants_info_[138].layout = static_cast(cached_torch_layout_strided); - constants_info_[138].original_fqn = "mv2.features.16.conv.1.0.weight"; - constants_info_[139].name = "mv2_features_16_conv_1_1_weight"; - constants_info_[139].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[139].offset = 0; - constants_info_[139].data_size = 3840; - constants_info_[139].from_folded = false; - constants_info_[139].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[139].shape = {960}; - constants_info_[139].stride = {1}; - constants_info_[139].layout = static_cast(cached_torch_layout_strided); - constants_info_[139].original_fqn = "mv2.features.16.conv.1.1.weight"; - constants_info_[140].name = "mv2_features_16_conv_1_1_bias"; - constants_info_[140].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[140].offset = 0; - constants_info_[140].data_size = 3840; - constants_info_[140].from_folded = false; - constants_info_[140].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[140].shape = {960}; - constants_info_[140].stride = {1}; - constants_info_[140].layout = static_cast(cached_torch_layout_strided); - constants_info_[140].original_fqn = "mv2.features.16.conv.1.1.bias"; - constants_info_[141].name = "mv2_features_16_conv_2_weight"; - constants_info_[141].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[141].offset = 0; - constants_info_[141].data_size = 614400; - constants_info_[141].from_folded = 
false; - constants_info_[141].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[141].shape = {160, 960, 1, 1}; - constants_info_[141].stride = {960, 1, 1, 1}; - constants_info_[141].layout = static_cast(cached_torch_layout_strided); - constants_info_[141].original_fqn = "mv2.features.16.conv.2.weight"; - constants_info_[142].name = "mv2_features_16_conv_3_weight"; - constants_info_[142].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[142].offset = 0; - constants_info_[142].data_size = 640; - constants_info_[142].from_folded = false; - constants_info_[142].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[142].shape = {160}; - constants_info_[142].stride = {1}; - constants_info_[142].layout = static_cast(cached_torch_layout_strided); - constants_info_[142].original_fqn = "mv2.features.16.conv.3.weight"; - constants_info_[143].name = "mv2_features_16_conv_3_bias"; - constants_info_[143].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[143].offset = 0; - constants_info_[143].data_size = 640; - constants_info_[143].from_folded = false; - constants_info_[143].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[143].shape = {160}; - constants_info_[143].stride = {1}; - constants_info_[143].layout = static_cast(cached_torch_layout_strided); - constants_info_[143].original_fqn = "mv2.features.16.conv.3.bias"; - constants_info_[144].name = "mv2_features_17_conv_0_0_weight"; - constants_info_[144].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[144].offset = 0; - constants_info_[144].data_size = 614400; - constants_info_[144].from_folded = false; - constants_info_[144].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[144].shape = {960, 160, 1, 1}; - constants_info_[144].stride = {160, 1, 1, 1}; - constants_info_[144].layout = static_cast(cached_torch_layout_strided); - constants_info_[144].original_fqn = "mv2.features.17.conv.0.0.weight"; - constants_info_[145].name = "mv2_features_17_conv_0_1_weight"; - constants_info_[145].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[145].offset = 0; - constants_info_[145].data_size = 3840; - constants_info_[145].from_folded = false; - constants_info_[145].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[145].shape = {960}; - constants_info_[145].stride = {1}; - constants_info_[145].layout = static_cast(cached_torch_layout_strided); - constants_info_[145].original_fqn = "mv2.features.17.conv.0.1.weight"; - constants_info_[146].name = "mv2_features_17_conv_0_1_bias"; - constants_info_[146].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[146].offset = 0; - constants_info_[146].data_size = 3840; - constants_info_[146].from_folded = false; - constants_info_[146].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[146].shape = {960}; - constants_info_[146].stride = {1}; - constants_info_[146].layout = static_cast(cached_torch_layout_strided); - constants_info_[146].original_fqn = "mv2.features.17.conv.0.1.bias"; - constants_info_[147].name = "mv2_features_17_conv_1_0_weight"; - constants_info_[147].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[147].offset = 0; - constants_info_[147].data_size = 34560; - constants_info_[147].from_folded = false; - constants_info_[147].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - 
constants_info_[147].shape = {960, 1, 3, 3}; - constants_info_[147].stride = {9, 9, 3, 1}; - constants_info_[147].layout = static_cast(cached_torch_layout_strided); - constants_info_[147].original_fqn = "mv2.features.17.conv.1.0.weight"; - constants_info_[148].name = "mv2_features_17_conv_1_1_weight"; - constants_info_[148].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[148].offset = 0; - constants_info_[148].data_size = 3840; - constants_info_[148].from_folded = false; - constants_info_[148].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[148].shape = {960}; - constants_info_[148].stride = {1}; - constants_info_[148].layout = static_cast(cached_torch_layout_strided); - constants_info_[148].original_fqn = "mv2.features.17.conv.1.1.weight"; - constants_info_[149].name = "mv2_features_17_conv_1_1_bias"; - constants_info_[149].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[149].offset = 0; - constants_info_[149].data_size = 3840; - constants_info_[149].from_folded = false; - constants_info_[149].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[149].shape = {960}; - constants_info_[149].stride = {1}; - constants_info_[149].layout = static_cast(cached_torch_layout_strided); - constants_info_[149].original_fqn = "mv2.features.17.conv.1.1.bias"; - constants_info_[150].name = "mv2_features_17_conv_2_weight"; - constants_info_[150].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[150].offset = 0; - constants_info_[150].data_size = 1228800; - constants_info_[150].from_folded = false; - constants_info_[150].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[150].shape = {320, 960, 1, 1}; - constants_info_[150].stride = {960, 1, 1, 1}; - constants_info_[150].layout = static_cast(cached_torch_layout_strided); - constants_info_[150].original_fqn = "mv2.features.17.conv.2.weight"; - constants_info_[151].name = "mv2_features_17_conv_3_weight"; - constants_info_[151].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[151].offset = 0; - constants_info_[151].data_size = 1280; - constants_info_[151].from_folded = false; - constants_info_[151].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[151].shape = {320}; - constants_info_[151].stride = {1}; - constants_info_[151].layout = static_cast(cached_torch_layout_strided); - constants_info_[151].original_fqn = "mv2.features.17.conv.3.weight"; - constants_info_[152].name = "mv2_features_17_conv_3_bias"; - constants_info_[152].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[152].offset = 0; - constants_info_[152].data_size = 1280; - constants_info_[152].from_folded = false; - constants_info_[152].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[152].shape = {320}; - constants_info_[152].stride = {1}; - constants_info_[152].layout = static_cast(cached_torch_layout_strided); - constants_info_[152].original_fqn = "mv2.features.17.conv.3.bias"; - constants_info_[153].name = "mv2_features_18_0_weight"; - constants_info_[153].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[153].offset = 0; - constants_info_[153].data_size = 1638400; - constants_info_[153].from_folded = false; - constants_info_[153].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[153].shape = {1280, 320, 1, 1}; - constants_info_[153].stride = {320, 1, 1, 1}; - constants_info_[153].layout = 
static_cast(cached_torch_layout_strided); - constants_info_[153].original_fqn = "mv2.features.18.0.weight"; - constants_info_[154].name = "mv2_features_18_1_weight"; - constants_info_[154].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[154].offset = 0; - constants_info_[154].data_size = 5120; - constants_info_[154].from_folded = false; - constants_info_[154].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[154].shape = {1280}; - constants_info_[154].stride = {1}; - constants_info_[154].layout = static_cast(cached_torch_layout_strided); - constants_info_[154].original_fqn = "mv2.features.18.1.weight"; - constants_info_[155].name = "mv2_features_18_1_bias"; - constants_info_[155].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[155].offset = 0; - constants_info_[155].data_size = 5120; - constants_info_[155].from_folded = false; - constants_info_[155].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[155].shape = {1280}; - constants_info_[155].stride = {1}; - constants_info_[155].layout = static_cast(cached_torch_layout_strided); - constants_info_[155].original_fqn = "mv2.features.18.1.bias"; - constants_info_[156].name = "mv2_classifier_1_weight"; - constants_info_[156].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[156].offset = 0; - constants_info_[156].data_size = 5120000; - constants_info_[156].from_folded = false; - constants_info_[156].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[156].shape = {1000, 1280}; - constants_info_[156].stride = {1280, 1}; - constants_info_[156].layout = static_cast(cached_torch_layout_strided); - constants_info_[156].original_fqn = "mv2.classifier.1.weight"; - constants_info_[157].name = "mv2_classifier_1_bias"; - constants_info_[157].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[157].offset = 0; - constants_info_[157].data_size = 4000; - constants_info_[157].from_folded = false; - constants_info_[157].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[157].shape = {1000}; - constants_info_[157].stride = {1}; - constants_info_[157].layout = static_cast(cached_torch_layout_strided); - constants_info_[157].original_fqn = "mv2.classifier.1.bias"; - constants_info_[158].name = "mv2_features_0_1_running_mean"; - constants_info_[158].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[158].offset = 0; - constants_info_[158].data_size = 128; - constants_info_[158].from_folded = false; - constants_info_[158].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[158].shape = {32}; - constants_info_[158].stride = {1}; - constants_info_[158].layout = static_cast(cached_torch_layout_strided); - constants_info_[158].original_fqn = "mv2.features.0.1.running_mean"; - constants_info_[159].name = "mv2_features_0_1_running_var"; - constants_info_[159].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[159].offset = 0; - constants_info_[159].data_size = 128; - constants_info_[159].from_folded = false; - constants_info_[159].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[159].shape = {32}; - constants_info_[159].stride = {1}; - constants_info_[159].layout = static_cast(cached_torch_layout_strided); - constants_info_[159].original_fqn = "mv2.features.0.1.running_var"; - constants_info_[160].name = "mv2_features_1_conv_0_1_running_mean"; - constants_info_[160].dtype = 
static_cast(cached_torch_dtype_float32); - constants_info_[160].offset = 0; - constants_info_[160].data_size = 128; - constants_info_[160].from_folded = false; - constants_info_[160].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[160].shape = {32}; - constants_info_[160].stride = {1}; - constants_info_[160].layout = static_cast(cached_torch_layout_strided); - constants_info_[160].original_fqn = "mv2.features.1.conv.0.1.running_mean"; - constants_info_[161].name = "mv2_features_1_conv_0_1_running_var"; - constants_info_[161].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[161].offset = 0; - constants_info_[161].data_size = 128; - constants_info_[161].from_folded = false; - constants_info_[161].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[161].shape = {32}; - constants_info_[161].stride = {1}; - constants_info_[161].layout = static_cast(cached_torch_layout_strided); - constants_info_[161].original_fqn = "mv2.features.1.conv.0.1.running_var"; - constants_info_[162].name = "mv2_features_1_conv_2_running_mean"; - constants_info_[162].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[162].offset = 0; - constants_info_[162].data_size = 64; - constants_info_[162].from_folded = false; - constants_info_[162].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[162].shape = {16}; - constants_info_[162].stride = {1}; - constants_info_[162].layout = static_cast(cached_torch_layout_strided); - constants_info_[162].original_fqn = "mv2.features.1.conv.2.running_mean"; - constants_info_[163].name = "mv2_features_1_conv_2_running_var"; - constants_info_[163].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[163].offset = 0; - constants_info_[163].data_size = 64; - constants_info_[163].from_folded = false; - constants_info_[163].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[163].shape = {16}; - constants_info_[163].stride = {1}; - constants_info_[163].layout = static_cast(cached_torch_layout_strided); - constants_info_[163].original_fqn = "mv2.features.1.conv.2.running_var"; - constants_info_[164].name = "mv2_features_2_conv_0_1_running_mean"; - constants_info_[164].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[164].offset = 0; - constants_info_[164].data_size = 384; - constants_info_[164].from_folded = false; - constants_info_[164].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[164].shape = {96}; - constants_info_[164].stride = {1}; - constants_info_[164].layout = static_cast(cached_torch_layout_strided); - constants_info_[164].original_fqn = "mv2.features.2.conv.0.1.running_mean"; - constants_info_[165].name = "mv2_features_2_conv_0_1_running_var"; - constants_info_[165].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[165].offset = 0; - constants_info_[165].data_size = 384; - constants_info_[165].from_folded = false; - constants_info_[165].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[165].shape = {96}; - constants_info_[165].stride = {1}; - constants_info_[165].layout = static_cast(cached_torch_layout_strided); - constants_info_[165].original_fqn = "mv2.features.2.conv.0.1.running_var"; - constants_info_[166].name = "mv2_features_2_conv_1_1_running_mean"; - constants_info_[166].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[166].offset = 0; - constants_info_[166].data_size = 384; - 
constants_info_[166].from_folded = false; - constants_info_[166].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[166].shape = {96}; - constants_info_[166].stride = {1}; - constants_info_[166].layout = static_cast(cached_torch_layout_strided); - constants_info_[166].original_fqn = "mv2.features.2.conv.1.1.running_mean"; - constants_info_[167].name = "mv2_features_2_conv_1_1_running_var"; - constants_info_[167].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[167].offset = 0; - constants_info_[167].data_size = 384; - constants_info_[167].from_folded = false; - constants_info_[167].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[167].shape = {96}; - constants_info_[167].stride = {1}; - constants_info_[167].layout = static_cast(cached_torch_layout_strided); - constants_info_[167].original_fqn = "mv2.features.2.conv.1.1.running_var"; - constants_info_[168].name = "mv2_features_2_conv_3_running_mean"; - constants_info_[168].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[168].offset = 0; - constants_info_[168].data_size = 96; - constants_info_[168].from_folded = false; - constants_info_[168].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[168].shape = {24}; - constants_info_[168].stride = {1}; - constants_info_[168].layout = static_cast(cached_torch_layout_strided); - constants_info_[168].original_fqn = "mv2.features.2.conv.3.running_mean"; - constants_info_[169].name = "mv2_features_2_conv_3_running_var"; - constants_info_[169].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[169].offset = 0; - constants_info_[169].data_size = 96; - constants_info_[169].from_folded = false; - constants_info_[169].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[169].shape = {24}; - constants_info_[169].stride = {1}; - constants_info_[169].layout = static_cast(cached_torch_layout_strided); - constants_info_[169].original_fqn = "mv2.features.2.conv.3.running_var"; - constants_info_[170].name = "mv2_features_3_conv_0_1_running_mean"; - constants_info_[170].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[170].offset = 0; - constants_info_[170].data_size = 576; - constants_info_[170].from_folded = false; - constants_info_[170].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[170].shape = {144}; - constants_info_[170].stride = {1}; - constants_info_[170].layout = static_cast(cached_torch_layout_strided); - constants_info_[170].original_fqn = "mv2.features.3.conv.0.1.running_mean"; - constants_info_[171].name = "mv2_features_3_conv_0_1_running_var"; - constants_info_[171].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[171].offset = 0; - constants_info_[171].data_size = 576; - constants_info_[171].from_folded = false; - constants_info_[171].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[171].shape = {144}; - constants_info_[171].stride = {1}; - constants_info_[171].layout = static_cast(cached_torch_layout_strided); - constants_info_[171].original_fqn = "mv2.features.3.conv.0.1.running_var"; - constants_info_[172].name = "mv2_features_3_conv_1_1_running_mean"; - constants_info_[172].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[172].offset = 0; - constants_info_[172].data_size = 576; - constants_info_[172].from_folded = false; - constants_info_[172].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - 
constants_info_[172].shape = {144}; - constants_info_[172].stride = {1}; - constants_info_[172].layout = static_cast(cached_torch_layout_strided); - constants_info_[172].original_fqn = "mv2.features.3.conv.1.1.running_mean"; - constants_info_[173].name = "mv2_features_3_conv_1_1_running_var"; - constants_info_[173].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[173].offset = 0; - constants_info_[173].data_size = 576; - constants_info_[173].from_folded = false; - constants_info_[173].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[173].shape = {144}; - constants_info_[173].stride = {1}; - constants_info_[173].layout = static_cast(cached_torch_layout_strided); - constants_info_[173].original_fqn = "mv2.features.3.conv.1.1.running_var"; - constants_info_[174].name = "mv2_features_3_conv_3_running_mean"; - constants_info_[174].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[174].offset = 0; - constants_info_[174].data_size = 96; - constants_info_[174].from_folded = false; - constants_info_[174].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[174].shape = {24}; - constants_info_[174].stride = {1}; - constants_info_[174].layout = static_cast(cached_torch_layout_strided); - constants_info_[174].original_fqn = "mv2.features.3.conv.3.running_mean"; - constants_info_[175].name = "mv2_features_3_conv_3_running_var"; - constants_info_[175].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[175].offset = 0; - constants_info_[175].data_size = 96; - constants_info_[175].from_folded = false; - constants_info_[175].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[175].shape = {24}; - constants_info_[175].stride = {1}; - constants_info_[175].layout = static_cast(cached_torch_layout_strided); - constants_info_[175].original_fqn = "mv2.features.3.conv.3.running_var"; - constants_info_[176].name = "mv2_features_4_conv_0_1_running_mean"; - constants_info_[176].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[176].offset = 0; - constants_info_[176].data_size = 576; - constants_info_[176].from_folded = false; - constants_info_[176].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[176].shape = {144}; - constants_info_[176].stride = {1}; - constants_info_[176].layout = static_cast(cached_torch_layout_strided); - constants_info_[176].original_fqn = "mv2.features.4.conv.0.1.running_mean"; - constants_info_[177].name = "mv2_features_4_conv_0_1_running_var"; - constants_info_[177].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[177].offset = 0; - constants_info_[177].data_size = 576; - constants_info_[177].from_folded = false; - constants_info_[177].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[177].shape = {144}; - constants_info_[177].stride = {1}; - constants_info_[177].layout = static_cast(cached_torch_layout_strided); - constants_info_[177].original_fqn = "mv2.features.4.conv.0.1.running_var"; - constants_info_[178].name = "mv2_features_4_conv_1_1_running_mean"; - constants_info_[178].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[178].offset = 0; - constants_info_[178].data_size = 576; - constants_info_[178].from_folded = false; - constants_info_[178].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[178].shape = {144}; - constants_info_[178].stride = {1}; - constants_info_[178].layout = 
static_cast(cached_torch_layout_strided); - constants_info_[178].original_fqn = "mv2.features.4.conv.1.1.running_mean"; - constants_info_[179].name = "mv2_features_4_conv_1_1_running_var"; - constants_info_[179].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[179].offset = 0; - constants_info_[179].data_size = 576; - constants_info_[179].from_folded = false; - constants_info_[179].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[179].shape = {144}; - constants_info_[179].stride = {1}; - constants_info_[179].layout = static_cast(cached_torch_layout_strided); - constants_info_[179].original_fqn = "mv2.features.4.conv.1.1.running_var"; - constants_info_[180].name = "mv2_features_4_conv_3_running_mean"; - constants_info_[180].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[180].offset = 0; - constants_info_[180].data_size = 128; - constants_info_[180].from_folded = false; - constants_info_[180].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[180].shape = {32}; - constants_info_[180].stride = {1}; - constants_info_[180].layout = static_cast(cached_torch_layout_strided); - constants_info_[180].original_fqn = "mv2.features.4.conv.3.running_mean"; - constants_info_[181].name = "mv2_features_4_conv_3_running_var"; - constants_info_[181].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[181].offset = 0; - constants_info_[181].data_size = 128; - constants_info_[181].from_folded = false; - constants_info_[181].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[181].shape = {32}; - constants_info_[181].stride = {1}; - constants_info_[181].layout = static_cast(cached_torch_layout_strided); - constants_info_[181].original_fqn = "mv2.features.4.conv.3.running_var"; - constants_info_[182].name = "mv2_features_5_conv_0_1_running_mean"; - constants_info_[182].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[182].offset = 0; - constants_info_[182].data_size = 768; - constants_info_[182].from_folded = false; - constants_info_[182].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[182].shape = {192}; - constants_info_[182].stride = {1}; - constants_info_[182].layout = static_cast(cached_torch_layout_strided); - constants_info_[182].original_fqn = "mv2.features.5.conv.0.1.running_mean"; - constants_info_[183].name = "mv2_features_5_conv_0_1_running_var"; - constants_info_[183].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[183].offset = 0; - constants_info_[183].data_size = 768; - constants_info_[183].from_folded = false; - constants_info_[183].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[183].shape = {192}; - constants_info_[183].stride = {1}; - constants_info_[183].layout = static_cast(cached_torch_layout_strided); - constants_info_[183].original_fqn = "mv2.features.5.conv.0.1.running_var"; - constants_info_[184].name = "mv2_features_5_conv_1_1_running_mean"; - constants_info_[184].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[184].offset = 0; - constants_info_[184].data_size = 768; - constants_info_[184].from_folded = false; - constants_info_[184].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[184].shape = {192}; - constants_info_[184].stride = {1}; - constants_info_[184].layout = static_cast(cached_torch_layout_strided); - constants_info_[184].original_fqn = "mv2.features.5.conv.1.1.running_mean"; - 
constants_info_[185].name = "mv2_features_5_conv_1_1_running_var"; - constants_info_[185].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[185].offset = 0; - constants_info_[185].data_size = 768; - constants_info_[185].from_folded = false; - constants_info_[185].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[185].shape = {192}; - constants_info_[185].stride = {1}; - constants_info_[185].layout = static_cast(cached_torch_layout_strided); - constants_info_[185].original_fqn = "mv2.features.5.conv.1.1.running_var"; - constants_info_[186].name = "mv2_features_5_conv_3_running_mean"; - constants_info_[186].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[186].offset = 0; - constants_info_[186].data_size = 128; - constants_info_[186].from_folded = false; - constants_info_[186].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[186].shape = {32}; - constants_info_[186].stride = {1}; - constants_info_[186].layout = static_cast(cached_torch_layout_strided); - constants_info_[186].original_fqn = "mv2.features.5.conv.3.running_mean"; - constants_info_[187].name = "mv2_features_5_conv_3_running_var"; - constants_info_[187].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[187].offset = 0; - constants_info_[187].data_size = 128; - constants_info_[187].from_folded = false; - constants_info_[187].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[187].shape = {32}; - constants_info_[187].stride = {1}; - constants_info_[187].layout = static_cast(cached_torch_layout_strided); - constants_info_[187].original_fqn = "mv2.features.5.conv.3.running_var"; - constants_info_[188].name = "mv2_features_6_conv_0_1_running_mean"; - constants_info_[188].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[188].offset = 0; - constants_info_[188].data_size = 768; - constants_info_[188].from_folded = false; - constants_info_[188].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[188].shape = {192}; - constants_info_[188].stride = {1}; - constants_info_[188].layout = static_cast(cached_torch_layout_strided); - constants_info_[188].original_fqn = "mv2.features.6.conv.0.1.running_mean"; - constants_info_[189].name = "mv2_features_6_conv_0_1_running_var"; - constants_info_[189].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[189].offset = 0; - constants_info_[189].data_size = 768; - constants_info_[189].from_folded = false; - constants_info_[189].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[189].shape = {192}; - constants_info_[189].stride = {1}; - constants_info_[189].layout = static_cast(cached_torch_layout_strided); - constants_info_[189].original_fqn = "mv2.features.6.conv.0.1.running_var"; - constants_info_[190].name = "mv2_features_6_conv_1_1_running_mean"; - constants_info_[190].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[190].offset = 0; - constants_info_[190].data_size = 768; - constants_info_[190].from_folded = false; - constants_info_[190].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[190].shape = {192}; - constants_info_[190].stride = {1}; - constants_info_[190].layout = static_cast(cached_torch_layout_strided); - constants_info_[190].original_fqn = "mv2.features.6.conv.1.1.running_mean"; - constants_info_[191].name = "mv2_features_6_conv_1_1_running_var"; - constants_info_[191].dtype = 
static_cast(cached_torch_dtype_float32); - constants_info_[191].offset = 0; - constants_info_[191].data_size = 768; - constants_info_[191].from_folded = false; - constants_info_[191].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[191].shape = {192}; - constants_info_[191].stride = {1}; - constants_info_[191].layout = static_cast(cached_torch_layout_strided); - constants_info_[191].original_fqn = "mv2.features.6.conv.1.1.running_var"; - constants_info_[192].name = "mv2_features_6_conv_3_running_mean"; - constants_info_[192].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[192].offset = 0; - constants_info_[192].data_size = 128; - constants_info_[192].from_folded = false; - constants_info_[192].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[192].shape = {32}; - constants_info_[192].stride = {1}; - constants_info_[192].layout = static_cast(cached_torch_layout_strided); - constants_info_[192].original_fqn = "mv2.features.6.conv.3.running_mean"; - constants_info_[193].name = "mv2_features_6_conv_3_running_var"; - constants_info_[193].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[193].offset = 0; - constants_info_[193].data_size = 128; - constants_info_[193].from_folded = false; - constants_info_[193].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[193].shape = {32}; - constants_info_[193].stride = {1}; - constants_info_[193].layout = static_cast(cached_torch_layout_strided); - constants_info_[193].original_fqn = "mv2.features.6.conv.3.running_var"; - constants_info_[194].name = "mv2_features_7_conv_0_1_running_mean"; - constants_info_[194].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[194].offset = 0; - constants_info_[194].data_size = 768; - constants_info_[194].from_folded = false; - constants_info_[194].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[194].shape = {192}; - constants_info_[194].stride = {1}; - constants_info_[194].layout = static_cast(cached_torch_layout_strided); - constants_info_[194].original_fqn = "mv2.features.7.conv.0.1.running_mean"; - constants_info_[195].name = "mv2_features_7_conv_0_1_running_var"; - constants_info_[195].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[195].offset = 0; - constants_info_[195].data_size = 768; - constants_info_[195].from_folded = false; - constants_info_[195].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[195].shape = {192}; - constants_info_[195].stride = {1}; - constants_info_[195].layout = static_cast(cached_torch_layout_strided); - constants_info_[195].original_fqn = "mv2.features.7.conv.0.1.running_var"; - constants_info_[196].name = "mv2_features_7_conv_1_1_running_mean"; - constants_info_[196].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[196].offset = 0; - constants_info_[196].data_size = 768; - constants_info_[196].from_folded = false; - constants_info_[196].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[196].shape = {192}; - constants_info_[196].stride = {1}; - constants_info_[196].layout = static_cast(cached_torch_layout_strided); - constants_info_[196].original_fqn = "mv2.features.7.conv.1.1.running_mean"; - constants_info_[197].name = "mv2_features_7_conv_1_1_running_var"; - constants_info_[197].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[197].offset = 0; - constants_info_[197].data_size = 768; - 
constants_info_[197].from_folded = false; - constants_info_[197].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[197].shape = {192}; - constants_info_[197].stride = {1}; - constants_info_[197].layout = static_cast(cached_torch_layout_strided); - constants_info_[197].original_fqn = "mv2.features.7.conv.1.1.running_var"; - constants_info_[198].name = "mv2_features_7_conv_3_running_mean"; - constants_info_[198].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[198].offset = 0; - constants_info_[198].data_size = 256; - constants_info_[198].from_folded = false; - constants_info_[198].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[198].shape = {64}; - constants_info_[198].stride = {1}; - constants_info_[198].layout = static_cast(cached_torch_layout_strided); - constants_info_[198].original_fqn = "mv2.features.7.conv.3.running_mean"; - constants_info_[199].name = "mv2_features_7_conv_3_running_var"; - constants_info_[199].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[199].offset = 0; - constants_info_[199].data_size = 256; - constants_info_[199].from_folded = false; - constants_info_[199].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[199].shape = {64}; - constants_info_[199].stride = {1}; - constants_info_[199].layout = static_cast(cached_torch_layout_strided); - constants_info_[199].original_fqn = "mv2.features.7.conv.3.running_var"; - constants_info_[200].name = "mv2_features_8_conv_0_1_running_mean"; - constants_info_[200].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[200].offset = 0; - constants_info_[200].data_size = 1536; - constants_info_[200].from_folded = false; - constants_info_[200].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[200].shape = {384}; - constants_info_[200].stride = {1}; - constants_info_[200].layout = static_cast(cached_torch_layout_strided); - constants_info_[200].original_fqn = "mv2.features.8.conv.0.1.running_mean"; - constants_info_[201].name = "mv2_features_8_conv_0_1_running_var"; - constants_info_[201].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[201].offset = 0; - constants_info_[201].data_size = 1536; - constants_info_[201].from_folded = false; - constants_info_[201].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[201].shape = {384}; - constants_info_[201].stride = {1}; - constants_info_[201].layout = static_cast(cached_torch_layout_strided); - constants_info_[201].original_fqn = "mv2.features.8.conv.0.1.running_var"; - constants_info_[202].name = "mv2_features_8_conv_1_1_running_mean"; - constants_info_[202].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[202].offset = 0; - constants_info_[202].data_size = 1536; - constants_info_[202].from_folded = false; - constants_info_[202].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[202].shape = {384}; - constants_info_[202].stride = {1}; - constants_info_[202].layout = static_cast(cached_torch_layout_strided); - constants_info_[202].original_fqn = "mv2.features.8.conv.1.1.running_mean"; - constants_info_[203].name = "mv2_features_8_conv_1_1_running_var"; - constants_info_[203].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[203].offset = 0; - constants_info_[203].data_size = 1536; - constants_info_[203].from_folded = false; - constants_info_[203].type = 
static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[203].shape = {384}; - constants_info_[203].stride = {1}; - constants_info_[203].layout = static_cast(cached_torch_layout_strided); - constants_info_[203].original_fqn = "mv2.features.8.conv.1.1.running_var"; - constants_info_[204].name = "mv2_features_8_conv_3_running_mean"; - constants_info_[204].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[204].offset = 0; - constants_info_[204].data_size = 256; - constants_info_[204].from_folded = false; - constants_info_[204].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[204].shape = {64}; - constants_info_[204].stride = {1}; - constants_info_[204].layout = static_cast(cached_torch_layout_strided); - constants_info_[204].original_fqn = "mv2.features.8.conv.3.running_mean"; - constants_info_[205].name = "mv2_features_8_conv_3_running_var"; - constants_info_[205].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[205].offset = 0; - constants_info_[205].data_size = 256; - constants_info_[205].from_folded = false; - constants_info_[205].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[205].shape = {64}; - constants_info_[205].stride = {1}; - constants_info_[205].layout = static_cast(cached_torch_layout_strided); - constants_info_[205].original_fqn = "mv2.features.8.conv.3.running_var"; - constants_info_[206].name = "mv2_features_9_conv_0_1_running_mean"; - constants_info_[206].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[206].offset = 0; - constants_info_[206].data_size = 1536; - constants_info_[206].from_folded = false; - constants_info_[206].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[206].shape = {384}; - constants_info_[206].stride = {1}; - constants_info_[206].layout = static_cast(cached_torch_layout_strided); - constants_info_[206].original_fqn = "mv2.features.9.conv.0.1.running_mean"; - constants_info_[207].name = "mv2_features_9_conv_0_1_running_var"; - constants_info_[207].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[207].offset = 0; - constants_info_[207].data_size = 1536; - constants_info_[207].from_folded = false; - constants_info_[207].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[207].shape = {384}; - constants_info_[207].stride = {1}; - constants_info_[207].layout = static_cast(cached_torch_layout_strided); - constants_info_[207].original_fqn = "mv2.features.9.conv.0.1.running_var"; - constants_info_[208].name = "mv2_features_9_conv_1_1_running_mean"; - constants_info_[208].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[208].offset = 0; - constants_info_[208].data_size = 1536; - constants_info_[208].from_folded = false; - constants_info_[208].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[208].shape = {384}; - constants_info_[208].stride = {1}; - constants_info_[208].layout = static_cast(cached_torch_layout_strided); - constants_info_[208].original_fqn = "mv2.features.9.conv.1.1.running_mean"; - constants_info_[209].name = "mv2_features_9_conv_1_1_running_var"; - constants_info_[209].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[209].offset = 0; - constants_info_[209].data_size = 1536; - constants_info_[209].from_folded = false; - constants_info_[209].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[209].shape = {384}; - 
constants_info_[209].stride = {1}; - constants_info_[209].layout = static_cast(cached_torch_layout_strided); - constants_info_[209].original_fqn = "mv2.features.9.conv.1.1.running_var"; - constants_info_[210].name = "mv2_features_9_conv_3_running_mean"; - constants_info_[210].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[210].offset = 0; - constants_info_[210].data_size = 256; - constants_info_[210].from_folded = false; - constants_info_[210].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[210].shape = {64}; - constants_info_[210].stride = {1}; - constants_info_[210].layout = static_cast(cached_torch_layout_strided); - constants_info_[210].original_fqn = "mv2.features.9.conv.3.running_mean"; - constants_info_[211].name = "mv2_features_9_conv_3_running_var"; - constants_info_[211].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[211].offset = 0; - constants_info_[211].data_size = 256; - constants_info_[211].from_folded = false; - constants_info_[211].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[211].shape = {64}; - constants_info_[211].stride = {1}; - constants_info_[211].layout = static_cast(cached_torch_layout_strided); - constants_info_[211].original_fqn = "mv2.features.9.conv.3.running_var"; - constants_info_[212].name = "mv2_features_10_conv_0_1_running_mean"; - constants_info_[212].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[212].offset = 0; - constants_info_[212].data_size = 1536; - constants_info_[212].from_folded = false; - constants_info_[212].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[212].shape = {384}; - constants_info_[212].stride = {1}; - constants_info_[212].layout = static_cast(cached_torch_layout_strided); - constants_info_[212].original_fqn = "mv2.features.10.conv.0.1.running_mean"; - constants_info_[213].name = "mv2_features_10_conv_0_1_running_var"; - constants_info_[213].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[213].offset = 0; - constants_info_[213].data_size = 1536; - constants_info_[213].from_folded = false; - constants_info_[213].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[213].shape = {384}; - constants_info_[213].stride = {1}; - constants_info_[213].layout = static_cast(cached_torch_layout_strided); - constants_info_[213].original_fqn = "mv2.features.10.conv.0.1.running_var"; - constants_info_[214].name = "mv2_features_10_conv_1_1_running_mean"; - constants_info_[214].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[214].offset = 0; - constants_info_[214].data_size = 1536; - constants_info_[214].from_folded = false; - constants_info_[214].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[214].shape = {384}; - constants_info_[214].stride = {1}; - constants_info_[214].layout = static_cast(cached_torch_layout_strided); - constants_info_[214].original_fqn = "mv2.features.10.conv.1.1.running_mean"; - constants_info_[215].name = "mv2_features_10_conv_1_1_running_var"; - constants_info_[215].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[215].offset = 0; - constants_info_[215].data_size = 1536; - constants_info_[215].from_folded = false; - constants_info_[215].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[215].shape = {384}; - constants_info_[215].stride = {1}; - constants_info_[215].layout = static_cast(cached_torch_layout_strided); - 
constants_info_[215].original_fqn = "mv2.features.10.conv.1.1.running_var"; - constants_info_[216].name = "mv2_features_10_conv_3_running_mean"; - constants_info_[216].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[216].offset = 0; - constants_info_[216].data_size = 256; - constants_info_[216].from_folded = false; - constants_info_[216].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[216].shape = {64}; - constants_info_[216].stride = {1}; - constants_info_[216].layout = static_cast(cached_torch_layout_strided); - constants_info_[216].original_fqn = "mv2.features.10.conv.3.running_mean"; - constants_info_[217].name = "mv2_features_10_conv_3_running_var"; - constants_info_[217].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[217].offset = 0; - constants_info_[217].data_size = 256; - constants_info_[217].from_folded = false; - constants_info_[217].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[217].shape = {64}; - constants_info_[217].stride = {1}; - constants_info_[217].layout = static_cast(cached_torch_layout_strided); - constants_info_[217].original_fqn = "mv2.features.10.conv.3.running_var"; - constants_info_[218].name = "mv2_features_11_conv_0_1_running_mean"; - constants_info_[218].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[218].offset = 0; - constants_info_[218].data_size = 1536; - constants_info_[218].from_folded = false; - constants_info_[218].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[218].shape = {384}; - constants_info_[218].stride = {1}; - constants_info_[218].layout = static_cast(cached_torch_layout_strided); - constants_info_[218].original_fqn = "mv2.features.11.conv.0.1.running_mean"; - constants_info_[219].name = "mv2_features_11_conv_0_1_running_var"; - constants_info_[219].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[219].offset = 0; - constants_info_[219].data_size = 1536; - constants_info_[219].from_folded = false; - constants_info_[219].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[219].shape = {384}; - constants_info_[219].stride = {1}; - constants_info_[219].layout = static_cast(cached_torch_layout_strided); - constants_info_[219].original_fqn = "mv2.features.11.conv.0.1.running_var"; - constants_info_[220].name = "mv2_features_11_conv_1_1_running_mean"; - constants_info_[220].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[220].offset = 0; - constants_info_[220].data_size = 1536; - constants_info_[220].from_folded = false; - constants_info_[220].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[220].shape = {384}; - constants_info_[220].stride = {1}; - constants_info_[220].layout = static_cast(cached_torch_layout_strided); - constants_info_[220].original_fqn = "mv2.features.11.conv.1.1.running_mean"; - constants_info_[221].name = "mv2_features_11_conv_1_1_running_var"; - constants_info_[221].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[221].offset = 0; - constants_info_[221].data_size = 1536; - constants_info_[221].from_folded = false; - constants_info_[221].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[221].shape = {384}; - constants_info_[221].stride = {1}; - constants_info_[221].layout = static_cast(cached_torch_layout_strided); - constants_info_[221].original_fqn = "mv2.features.11.conv.1.1.running_var"; - constants_info_[222].name = 
"mv2_features_11_conv_3_running_mean"; - constants_info_[222].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[222].offset = 0; - constants_info_[222].data_size = 384; - constants_info_[222].from_folded = false; - constants_info_[222].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[222].shape = {96}; - constants_info_[222].stride = {1}; - constants_info_[222].layout = static_cast(cached_torch_layout_strided); - constants_info_[222].original_fqn = "mv2.features.11.conv.3.running_mean"; - constants_info_[223].name = "mv2_features_11_conv_3_running_var"; - constants_info_[223].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[223].offset = 0; - constants_info_[223].data_size = 384; - constants_info_[223].from_folded = false; - constants_info_[223].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[223].shape = {96}; - constants_info_[223].stride = {1}; - constants_info_[223].layout = static_cast(cached_torch_layout_strided); - constants_info_[223].original_fqn = "mv2.features.11.conv.3.running_var"; - constants_info_[224].name = "mv2_features_12_conv_0_1_running_mean"; - constants_info_[224].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[224].offset = 0; - constants_info_[224].data_size = 2304; - constants_info_[224].from_folded = false; - constants_info_[224].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[224].shape = {576}; - constants_info_[224].stride = {1}; - constants_info_[224].layout = static_cast(cached_torch_layout_strided); - constants_info_[224].original_fqn = "mv2.features.12.conv.0.1.running_mean"; - constants_info_[225].name = "mv2_features_12_conv_0_1_running_var"; - constants_info_[225].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[225].offset = 0; - constants_info_[225].data_size = 2304; - constants_info_[225].from_folded = false; - constants_info_[225].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[225].shape = {576}; - constants_info_[225].stride = {1}; - constants_info_[225].layout = static_cast(cached_torch_layout_strided); - constants_info_[225].original_fqn = "mv2.features.12.conv.0.1.running_var"; - constants_info_[226].name = "mv2_features_12_conv_1_1_running_mean"; - constants_info_[226].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[226].offset = 0; - constants_info_[226].data_size = 2304; - constants_info_[226].from_folded = false; - constants_info_[226].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[226].shape = {576}; - constants_info_[226].stride = {1}; - constants_info_[226].layout = static_cast(cached_torch_layout_strided); - constants_info_[226].original_fqn = "mv2.features.12.conv.1.1.running_mean"; - constants_info_[227].name = "mv2_features_12_conv_1_1_running_var"; - constants_info_[227].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[227].offset = 0; - constants_info_[227].data_size = 2304; - constants_info_[227].from_folded = false; - constants_info_[227].type = static_cast(torch::aot_inductor::ConstantType::Buffer); - constants_info_[227].shape = {576}; - constants_info_[227].stride = {1}; - constants_info_[227].layout = static_cast(cached_torch_layout_strided); - constants_info_[227].original_fqn = "mv2.features.12.conv.1.1.running_var"; - constants_info_[228].name = "mv2_features_12_conv_3_running_mean"; - constants_info_[228].dtype = static_cast(cached_torch_dtype_float32); 
-    constants_info_[228].offset = 0;
-    constants_info_[228].data_size = 384;
-    constants_info_[228].from_folded = false;
-    constants_info_[228].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[228].shape = {96};
-    constants_info_[228].stride = {1};
-    constants_info_[228].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[228].original_fqn = "mv2.features.12.conv.3.running_mean";
-    constants_info_[229].name = "mv2_features_12_conv_3_running_var";
-    constants_info_[229].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[229].offset = 0;
-    constants_info_[229].data_size = 384;
-    constants_info_[229].from_folded = false;
-    constants_info_[229].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[229].shape = {96};
-    constants_info_[229].stride = {1};
-    constants_info_[229].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[229].original_fqn = "mv2.features.12.conv.3.running_var";
-    constants_info_[230].name = "mv2_features_13_conv_0_1_running_mean";
-    constants_info_[230].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[230].offset = 0;
-    constants_info_[230].data_size = 2304;
-    constants_info_[230].from_folded = false;
-    constants_info_[230].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[230].shape = {576};
-    constants_info_[230].stride = {1};
-    constants_info_[230].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[230].original_fqn = "mv2.features.13.conv.0.1.running_mean";
-    constants_info_[231].name = "mv2_features_13_conv_0_1_running_var";
-    constants_info_[231].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[231].offset = 0;
-    constants_info_[231].data_size = 2304;
-    constants_info_[231].from_folded = false;
-    constants_info_[231].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[231].shape = {576};
-    constants_info_[231].stride = {1};
-    constants_info_[231].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[231].original_fqn = "mv2.features.13.conv.0.1.running_var";
-    constants_info_[232].name = "mv2_features_13_conv_1_1_running_mean";
-    constants_info_[232].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[232].offset = 0;
-    constants_info_[232].data_size = 2304;
-    constants_info_[232].from_folded = false;
-    constants_info_[232].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[232].shape = {576};
-    constants_info_[232].stride = {1};
-    constants_info_[232].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[232].original_fqn = "mv2.features.13.conv.1.1.running_mean";
-    constants_info_[233].name = "mv2_features_13_conv_1_1_running_var";
-    constants_info_[233].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[233].offset = 0;
-    constants_info_[233].data_size = 2304;
-    constants_info_[233].from_folded = false;
-    constants_info_[233].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[233].shape = {576};
-    constants_info_[233].stride = {1};
-    constants_info_[233].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[233].original_fqn = "mv2.features.13.conv.1.1.running_var";
-    constants_info_[234].name = "mv2_features_13_conv_3_running_mean";
-    constants_info_[234].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[234].offset = 0;
-    constants_info_[234].data_size = 384;
-    constants_info_[234].from_folded = false;
-    constants_info_[234].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[234].shape = {96};
-    constants_info_[234].stride = {1};
-    constants_info_[234].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[234].original_fqn = "mv2.features.13.conv.3.running_mean";
-    constants_info_[235].name = "mv2_features_13_conv_3_running_var";
-    constants_info_[235].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[235].offset = 0;
-    constants_info_[235].data_size = 384;
-    constants_info_[235].from_folded = false;
-    constants_info_[235].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[235].shape = {96};
-    constants_info_[235].stride = {1};
-    constants_info_[235].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[235].original_fqn = "mv2.features.13.conv.3.running_var";
-    constants_info_[236].name = "mv2_features_14_conv_0_1_running_mean";
-    constants_info_[236].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[236].offset = 0;
-    constants_info_[236].data_size = 2304;
-    constants_info_[236].from_folded = false;
-    constants_info_[236].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[236].shape = {576};
-    constants_info_[236].stride = {1};
-    constants_info_[236].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[236].original_fqn = "mv2.features.14.conv.0.1.running_mean";
-    constants_info_[237].name = "mv2_features_14_conv_0_1_running_var";
-    constants_info_[237].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[237].offset = 0;
-    constants_info_[237].data_size = 2304;
-    constants_info_[237].from_folded = false;
-    constants_info_[237].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[237].shape = {576};
-    constants_info_[237].stride = {1};
-    constants_info_[237].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[237].original_fqn = "mv2.features.14.conv.0.1.running_var";
-    constants_info_[238].name = "mv2_features_14_conv_1_1_running_mean";
-    constants_info_[238].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[238].offset = 0;
-    constants_info_[238].data_size = 2304;
-    constants_info_[238].from_folded = false;
-    constants_info_[238].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[238].shape = {576};
-    constants_info_[238].stride = {1};
-    constants_info_[238].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[238].original_fqn = "mv2.features.14.conv.1.1.running_mean";
-    constants_info_[239].name = "mv2_features_14_conv_1_1_running_var";
-    constants_info_[239].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[239].offset = 0;
-    constants_info_[239].data_size = 2304;
-    constants_info_[239].from_folded = false;
-    constants_info_[239].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[239].shape = {576};
-    constants_info_[239].stride = {1};
-    constants_info_[239].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[239].original_fqn = "mv2.features.14.conv.1.1.running_var";
-    constants_info_[240].name = "mv2_features_14_conv_3_running_mean";
-    constants_info_[240].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[240].offset = 0;
-    constants_info_[240].data_size = 640;
-    constants_info_[240].from_folded = false;
-    constants_info_[240].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[240].shape = {160};
-    constants_info_[240].stride = {1};
-    constants_info_[240].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[240].original_fqn = "mv2.features.14.conv.3.running_mean";
-    constants_info_[241].name = "mv2_features_14_conv_3_running_var";
-    constants_info_[241].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[241].offset = 0;
-    constants_info_[241].data_size = 640;
-    constants_info_[241].from_folded = false;
-    constants_info_[241].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[241].shape = {160};
-    constants_info_[241].stride = {1};
-    constants_info_[241].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[241].original_fqn = "mv2.features.14.conv.3.running_var";
-    constants_info_[242].name = "mv2_features_15_conv_0_1_running_mean";
-    constants_info_[242].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[242].offset = 0;
-    constants_info_[242].data_size = 3840;
-    constants_info_[242].from_folded = false;
-    constants_info_[242].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[242].shape = {960};
-    constants_info_[242].stride = {1};
-    constants_info_[242].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[242].original_fqn = "mv2.features.15.conv.0.1.running_mean";
-    constants_info_[243].name = "mv2_features_15_conv_0_1_running_var";
-    constants_info_[243].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[243].offset = 0;
-    constants_info_[243].data_size = 3840;
-    constants_info_[243].from_folded = false;
-    constants_info_[243].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[243].shape = {960};
-    constants_info_[243].stride = {1};
-    constants_info_[243].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[243].original_fqn = "mv2.features.15.conv.0.1.running_var";
-    constants_info_[244].name = "mv2_features_15_conv_1_1_running_mean";
-    constants_info_[244].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[244].offset = 0;
-    constants_info_[244].data_size = 3840;
-    constants_info_[244].from_folded = false;
-    constants_info_[244].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[244].shape = {960};
-    constants_info_[244].stride = {1};
-    constants_info_[244].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[244].original_fqn = "mv2.features.15.conv.1.1.running_mean";
-    constants_info_[245].name = "mv2_features_15_conv_1_1_running_var";
-    constants_info_[245].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[245].offset = 0;
-    constants_info_[245].data_size = 3840;
-    constants_info_[245].from_folded = false;
-    constants_info_[245].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[245].shape = {960};
-    constants_info_[245].stride = {1};
-    constants_info_[245].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[245].original_fqn = "mv2.features.15.conv.1.1.running_var";
-    constants_info_[246].name = "mv2_features_15_conv_3_running_mean";
-    constants_info_[246].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[246].offset = 0;
-    constants_info_[246].data_size = 640;
-    constants_info_[246].from_folded = false;
-    constants_info_[246].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[246].shape = {160};
-    constants_info_[246].stride = {1};
-    constants_info_[246].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[246].original_fqn = "mv2.features.15.conv.3.running_mean";
-    constants_info_[247].name = "mv2_features_15_conv_3_running_var";
-    constants_info_[247].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[247].offset = 0;
-    constants_info_[247].data_size = 640;
-    constants_info_[247].from_folded = false;
-    constants_info_[247].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[247].shape = {160};
-    constants_info_[247].stride = {1};
-    constants_info_[247].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[247].original_fqn = "mv2.features.15.conv.3.running_var";
-    constants_info_[248].name = "mv2_features_16_conv_0_1_running_mean";
-    constants_info_[248].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[248].offset = 0;
-    constants_info_[248].data_size = 3840;
-    constants_info_[248].from_folded = false;
-    constants_info_[248].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[248].shape = {960};
-    constants_info_[248].stride = {1};
-    constants_info_[248].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[248].original_fqn = "mv2.features.16.conv.0.1.running_mean";
-    constants_info_[249].name = "mv2_features_16_conv_0_1_running_var";
-    constants_info_[249].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[249].offset = 0;
-    constants_info_[249].data_size = 3840;
-    constants_info_[249].from_folded = false;
-    constants_info_[249].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[249].shape = {960};
-    constants_info_[249].stride = {1};
-    constants_info_[249].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[249].original_fqn = "mv2.features.16.conv.0.1.running_var";
-    constants_info_[250].name = "mv2_features_16_conv_1_1_running_mean";
-    constants_info_[250].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[250].offset = 0;
-    constants_info_[250].data_size = 3840;
-    constants_info_[250].from_folded = false;
-    constants_info_[250].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[250].shape = {960};
-    constants_info_[250].stride = {1};
-    constants_info_[250].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[250].original_fqn = "mv2.features.16.conv.1.1.running_mean";
-    constants_info_[251].name = "mv2_features_16_conv_1_1_running_var";
-    constants_info_[251].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[251].offset = 0;
-    constants_info_[251].data_size = 3840;
-    constants_info_[251].from_folded = false;
-    constants_info_[251].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[251].shape = {960};
-    constants_info_[251].stride = {1};
-    constants_info_[251].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[251].original_fqn = "mv2.features.16.conv.1.1.running_var";
-    constants_info_[252].name = "mv2_features_16_conv_3_running_mean";
-    constants_info_[252].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[252].offset = 0;
-    constants_info_[252].data_size = 640;
-    constants_info_[252].from_folded = false;
-    constants_info_[252].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[252].shape = {160};
-    constants_info_[252].stride = {1};
-    constants_info_[252].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[252].original_fqn = "mv2.features.16.conv.3.running_mean";
-    constants_info_[253].name = "mv2_features_16_conv_3_running_var";
-    constants_info_[253].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[253].offset = 0;
-    constants_info_[253].data_size = 640;
-    constants_info_[253].from_folded = false;
-    constants_info_[253].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[253].shape = {160};
-    constants_info_[253].stride = {1};
-    constants_info_[253].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[253].original_fqn = "mv2.features.16.conv.3.running_var";
-    constants_info_[254].name = "mv2_features_17_conv_0_1_running_mean";
-    constants_info_[254].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[254].offset = 0;
-    constants_info_[254].data_size = 3840;
-    constants_info_[254].from_folded = false;
-    constants_info_[254].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[254].shape = {960};
-    constants_info_[254].stride = {1};
-    constants_info_[254].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[254].original_fqn = "mv2.features.17.conv.0.1.running_mean";
-    constants_info_[255].name = "mv2_features_17_conv_0_1_running_var";
-    constants_info_[255].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[255].offset = 0;
-    constants_info_[255].data_size = 3840;
-    constants_info_[255].from_folded = false;
-    constants_info_[255].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[255].shape = {960};
-    constants_info_[255].stride = {1};
-    constants_info_[255].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[255].original_fqn = "mv2.features.17.conv.0.1.running_var";
-    constants_info_[256].name = "mv2_features_17_conv_1_1_running_mean";
-    constants_info_[256].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[256].offset = 0;
-    constants_info_[256].data_size = 3840;
-    constants_info_[256].from_folded = false;
-    constants_info_[256].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[256].shape = {960};
-    constants_info_[256].stride = {1};
-    constants_info_[256].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[256].original_fqn = "mv2.features.17.conv.1.1.running_mean";
-    constants_info_[257].name = "mv2_features_17_conv_1_1_running_var";
-    constants_info_[257].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[257].offset = 0;
-    constants_info_[257].data_size = 3840;
-    constants_info_[257].from_folded = false;
-    constants_info_[257].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[257].shape = {960};
-    constants_info_[257].stride = {1};
-    constants_info_[257].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[257].original_fqn = "mv2.features.17.conv.1.1.running_var";
-    constants_info_[258].name = "mv2_features_17_conv_3_running_mean";
-    constants_info_[258].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[258].offset = 0;
-    constants_info_[258].data_size = 1280;
-    constants_info_[258].from_folded = false;
-    constants_info_[258].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[258].shape = {320};
-    constants_info_[258].stride = {1};
-    constants_info_[258].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[258].original_fqn = "mv2.features.17.conv.3.running_mean";
-    constants_info_[259].name = "mv2_features_17_conv_3_running_var";
-    constants_info_[259].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[259].offset = 0;
-    constants_info_[259].data_size = 1280;
-    constants_info_[259].from_folded = false;
-    constants_info_[259].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[259].shape = {320};
-    constants_info_[259].stride = {1};
-    constants_info_[259].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[259].original_fqn = "mv2.features.17.conv.3.running_var";
-    constants_info_[260].name = "mv2_features_18_1_running_mean";
-    constants_info_[260].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[260].offset = 0;
-    constants_info_[260].data_size = 5120;
-    constants_info_[260].from_folded = false;
-    constants_info_[260].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[260].shape = {1280};
-    constants_info_[260].stride = {1};
-    constants_info_[260].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[260].original_fqn = "mv2.features.18.1.running_mean";
-    constants_info_[261].name = "mv2_features_18_1_running_var";
-    constants_info_[261].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-    constants_info_[261].offset = 0;
-    constants_info_[261].data_size = 5120;
-    constants_info_[261].from_folded = false;
-    constants_info_[261].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Buffer);
-    constants_info_[261].shape = {1280};
-    constants_info_[261].stride = {1};
-    constants_info_[261].layout = static_cast<int32_t>(cached_torch_layout_strided);
-    constants_info_[261].original_fqn = "mv2.features.18.1.running_var";
-    update_constants_map(std::move(constants_map));
-    update_constants_array(std::move(constants_array));
-    in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
-    out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])";
-    outputs_info_[0].name = "output0";
-    this->kernels_ = std::make_unique<AOTInductorModelKernels>();
-}
-
-std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
-    DeviceStreamType stream,
-    AOTIProxyExecutorHandle proxy_executor,
-    bool initialization
-) {
-
-    if (!initialization) {
-        std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
-                  << "aot_inductor.use_runtime_constant_folding=False\n";
-    }
-    return {};
-}
-} // namespace torch::aot_inductor
-using namespace torch::aot_inductor;
-
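The deleted block above is the tail of the generated constructor: one `constants_info_[i]` record per folded MobileNetV2 buffer (batch-norm running stats here), followed by the pytree in/out specs and kernel-struct allocation. As a minimal Python sketch (not the AOTInductor API; the `ConstantInfo` record below is a hypothetical mirror), the bookkeeping per constant is:

```python
# Hypothetical mirror of one constants_info_[i] entry, to make the fields concrete.
import math
from dataclasses import dataclass

@dataclass
class ConstantInfo:
    name: str           # sanitized symbol used inside the generated .so
    original_fqn: str   # fully-qualified name in the eager nn.Module
    shape: tuple        # e.g. (96,) for a BatchNorm running stat
    stride: tuple = (1,)
    dtype: str = "float32"
    offset: int = 0
    from_folded: bool = False

    @property
    def data_size(self) -> int:
        return 4 * math.prod(self.shape)  # float32: 4 bytes per element

c = ConstantInfo(
    name="mv2_features_12_conv_3_running_mean",
    original_fqn="mv2.features.12.conv.3.running_mean",
    shape=(96,),
)
assert c.data_size == 384  # matches constants_info_[228].data_size above
```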
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_0(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_0', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 4, 'x': 65536}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 451584, 'x': 602112}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 3
-        xnumel = 50176
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x1 = xindex
-        y0 = yindex
-        tmp0 = tl.load(in_ptr0 + (x1 + 50176*y0), xmask & ymask, eviction_policy='evict_last')
-        tl.store(out_ptr0 + (y0 + 3*x1), tmp0, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
-    uint32_t grid_1 = ((ynumel + (4 - 1)) / (4));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_0 == nullptr) {
-        kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cxzopurug2u2kff3zliyvn25jrj6hvbvo6qrp26tzvi5i7zoaq2b.cubin", "triton_poi_fused_convolution_0", 4160, cubin_dir_);
-    }
-    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_2 = ynumel;
-    int var_3 = xnumel;
-    CUdeviceptr global_scratch_4 = 0;
-    void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4};
-    launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4160, kernel_args_, stream_);
-}
-
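Every wrapper in this file follows the same launch pattern: ceiling-division grids per axis, an early return when any axis is empty, and a lazily loaded, cached cubin handle. A small sketch of that arithmetic (the sizes are the ones baked into `triton_poi_fused_convolution_0`; `ceil_div` is an illustrative helper, not a symbol from the generated code):

```python
# Sketch of the grid computation in call_triton_poi_fused_convolution_0.
def ceil_div(n: int, block: int) -> int:
    return (n + block - 1) // block  # same as ((n + (block - 1)) / block) in C++

ynumel, xnumel = 3, 50176            # constants specialized into the kernel
grid = (ceil_div(xnumel, 256), ceil_div(ynumel, 4), 1)
assert grid == (196, 1, 1)
if 0 in grid:
    pass  # the C++ wrapper returns early instead of launching an empty grid
```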
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_1(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_1', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 128, 'x': 16}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6912, 'x': 3456}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 96
-        xnumel = 9
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x2 = xindex
-        y3 = yindex
-        y0 = (yindex % 3)
-        y1 = yindex // 3
-        tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last')
-        tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (16 - 1)) / (16));
-    uint32_t grid_1 = ((ynumel + (64 - 1)) / (64));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_1 == nullptr) {
-        kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cwvumepeeo7fjwjgwncwiji54ff6le55tfzp4kzgc4qgueefvrjb.cubin", "triton_poi_fused_convolution_1", 4352, cubin_dir_);
-    }
-    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_6 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_7 = ynumel;
-    int var_8 = xnumel;
-    CUdeviceptr global_scratch_9 = 0;
-    void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9};
-    launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_);
-}
-
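These two `convolution` kernels are pure layout shuffles: `_0` interleaves the input image's channels (making channel the innermost axis), and `_1` does the same for the first conv's weight. Reading the store index `y0 + 3*x2 + 27*y1` as a reshape, this is an OIHW-to-OHWI permutation; a numpy sketch (assuming a (32, 3, 3, 3) first-layer weight, which matches ynumel=96 and xnumel=9) that reproduces the index math:

```python
# Sketch: the index arithmetic of triton_poi_fused_convolution_1, in numpy.
import numpy as np

w = np.arange(32 * 3 * 3 * 3, dtype=np.float32).reshape(32, 3, 3, 3)  # OIHW
flat_in = w.reshape(96, 9)           # rows y3 = o*3 + i, cols x2 = kh*3 + kw

out = np.empty(32 * 27, dtype=np.float32)
for y3 in range(96):
    for x2 in range(9):
        y0, y1 = y3 % 3, y3 // 3     # y0: input channel, y1: output channel
        out[y0 + 3 * x2 + 27 * y1] = flat_in[y3, x2]   # the kernel's store index

# The result is exactly the channels-last (OHWI) permutation of the weight.
assert np.array_equal(out.reshape(32, 3, 3, 3), w.transpose(0, 2, 3, 1))
```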
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 524288},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 4817408}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 401408
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = tl.full([XBLOCK], True, tl.int1)
-        x2 = xindex
-        x0 = (xindex % 32)
-        tmp0 = tl.load(in_out_ptr0 + (x2), None)
-        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tmp16 = 0.0
-        tmp17 = triton_helpers.maximum(tmp15, tmp16)
-        tmp18 = 6.0
-        tmp19 = triton_helpers.minimum(tmp17, tmp18)
-        tl.store(in_out_ptr0 + (x2), tmp19, None)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2 = loadKernel("/home/gasoonjia/executorch/c74zcdwgzyij2kup6edvwy6x4v2o3kzogatnfm3fd4ttgs3qq26p.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2", 0, cubin_dir_);
-    }
-    CUdeviceptr var_10 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_11 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_12 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_13 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_14 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_15 = xnumel;
-    CUdeviceptr global_scratch_16 = 0;
-    void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &var_14, &var_15, &global_scratch_16};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
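The `tmp0..tmp19` chain above is inference-mode batch normalization from running stats followed by a hardtanh clamp to [0, 6] (ReLU6); `x0 = xindex % 32` selects the per-channel parameter because channel is the innermost axis after the layout shuffle. A numpy sketch of the same arithmetic (shapes chosen to match xnumel = 401408 = 12544 x 32; parameter values are placeholders):

```python
# Sketch of the math in ..._batch_norm_legit_no_training_hardtanh_2.
import numpy as np

def bn_relu6(x, mean, var, gamma, beta, eps=1e-05):
    inv_std = 1.0 / np.sqrt(var + eps)           # tmp4..tmp8 in the kernel
    y = (x - mean) * inv_std * gamma + beta      # tmp2, tmp11, tmp13, tmp15
    return np.clip(y, 0.0, 6.0)                  # tmp16..tmp19 (hardtanh)

C = 32
x = np.random.randn(12544, C).astype(np.float32)     # 401408 elements
mean, var = np.zeros(C, np.float32), np.ones(C, np.float32)
gamma, beta = np.ones(C, np.float32), np.zeros(C, np.float32)

out = bn_relu6(x, mean, var, gamma, beta)            # broadcast over axis 1
assert out.min() >= 0.0 and out.max() <= 6.0
```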
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_3(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_3', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 262144},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_3', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 2408704}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_3(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 200704
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = tl.full([XBLOCK], True, tl.int1)
-        x2 = xindex
-        x0 = (xindex % 16)
-        tmp0 = tl.load(in_out_ptr0 + (x2), None)
-        tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), None, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tl.store(in_out_ptr0 + (x2), tmp15, None)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3 = loadKernel("/home/gasoonjia/executorch/cgpouheql4rpwtcaretoqzvk65fkvmoma6frdyhd3ilsvuggrlzy.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_3", 0, cubin_dir_);
-    }
-    CUdeviceptr var_17 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_18 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_19 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_20 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_21 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_22 = xnumel;
-    CUdeviceptr global_scratch_23 = 0;
-    void* kernel_args_[] = {&var_17, &var_18, &var_19, &var_20, &var_21, &var_22, &global_scratch_23};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
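The `_3` variant stops at `tmp15`: it is the same normalization with no clamp. That matches MobileNetV2's inverted-residual design, where the 1x1 projection conv is followed by a linear bottleneck (no activation). A two-line sketch to make the contrast concrete (illustrative values, not kernel data):

```python
# Sketch: the only difference from the hardtanh variants is the missing clamp.
import numpy as np

def bn_linear(x, mean, var, gamma, beta, eps=1e-05):
    return (x - mean) / np.sqrt(var + eps) * gamma + beta  # no np.clip here

y = bn_linear(np.array([-3.0, 8.0], np.float32), 0.0, 1.0, 1.0, 0.0)
assert y.min() < 0.0 and y.max() > 6.0   # values outside [0, 6] pass through
```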
"triton_poi_fused__native_batch_norm_legit_no_training_3", 0, cubin_dir_); - } - CUdeviceptr var_17 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_18 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_19 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_20 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_21 = reinterpret_cast(in_ptr3.data_ptr()); - int var_22 = xnumel; - CUdeviceptr global_scratch_23 = 0; - void* kernel_args_[] = {&var_17, &var_18, &var_19, &var_20, &var_21, &var_22, &global_scratch_23}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_3, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 2097152}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 14452224}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 1204224 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = tl.full([XBLOCK], True, tl.int1) - x2 = xindex - x0 = (xindex % 96) - tmp0 = tl.load(in_out_ptr0 + (x2), None) - tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), None, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), None, 
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 524288},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 3614208}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 301056
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 96)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tmp16 = 0.0
-        tmp17 = triton_helpers.maximum(tmp15, tmp16)
-        tmp18 = 6.0
-        tmp19 = triton_helpers.minimum(tmp17, tmp18)
-        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5 = loadKernel("/home/gasoonjia/executorch/c7k3euhriolgsebdxauqyj6p2zdkse6qa6e4ylwbrc7765zcfd3m.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5", 0, cubin_dir_);
-    }
-    CUdeviceptr var_31 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_32 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_33 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_34 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_35 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_36 = xnumel;
-    CUdeviceptr global_scratch_37 = 0;
-    void* kernel_args_[] = {&var_31, &var_32, &var_33, &var_34, &var_35, &var_36, &global_scratch_37};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
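Note the difference between the `_5` kernel and `_2`/`_4`: here `xmask = xindex < xnumel` is a real tail guard, while the earlier kernels replace it with a constant-true vector when the compiler decides no tail handling is needed for the block sizes it will pick. A sketch of what the mask buys (sizes below are illustrative, not from these kernels):

```python
# Sketch: the last program instance may cover indices past xnumel;
# masked loads/stores keep those lanes inert.
import numpy as np

xnumel, XBLOCK = 1000, 512                 # deliberately non-divisible
grid_0 = (xnumel + XBLOCK - 1) // XBLOCK   # ceil-div, as in the C++ wrapper
for pid in range(grid_0):
    xindex = pid * XBLOCK + np.arange(XBLOCK)
    xmask = xindex < xnumel                # False for the 24 overhanging lanes
    assert xmask.all() or pid == grid_0 - 1
```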
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_6(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_6', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 131072},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_6', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 903552}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_6(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 75264
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 24)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6 = loadKernel("/home/gasoonjia/executorch/ckneyyhrfy6dkwkb6gaodbhn3l2khublcfvrwlajocypscgzcbft.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_6", 0, cubin_dir_);
-    }
-    CUdeviceptr var_38 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_39 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_40 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_41 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_42 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_43 = xnumel;
-    CUdeviceptr global_scratch_44 = 0;
-    void* kernel_args_[] = {&var_38, &var_39, &var_40, &var_41, &var_42, &var_43, &global_scratch_44};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_6, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 524288},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 5421312}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 451584
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 144)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tmp16 = 0.0
-        tmp17 = triton_helpers.maximum(tmp15, tmp16)
-        tmp18 = 6.0
-        tmp19 = triton_helpers.minimum(tmp17, tmp18)
-        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7 = loadKernel("/home/gasoonjia/executorch/c656cklj2pms2iadvspxywzssohwg3dtxcy4dlztkpnbgadleo2n.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7", 0, cubin_dir_);
-    }
-    CUdeviceptr var_45 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_46 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_47 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_48 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_49 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_50 = xnumel;
-    CUdeviceptr global_scratch_51 = 0;
-    void* kernel_args_[] = {&var_45, &var_46, &var_47, &var_48, &var_49, &var_50, &global_scratch_51};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_8(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    const in_ptr4_type_& in_ptr4,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_8', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 131072},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_8', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1204608}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_add_8(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 75264
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 24)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x2), xmask)
-        tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tmp1 - tmp2
-        tmp5 = 1e-05
-        tmp6 = tmp4 + tmp5
-        tmp7 = libdevice.sqrt(tmp6)
-        tmp8 = tl.full([1], 1, tl.int32)
-        tmp9 = (tmp8 / tmp7)
-        tmp10 = 1.0
-        tmp11 = tmp9 * tmp10
-        tmp12 = tmp3 * tmp11
-        tmp14 = tmp12 * tmp13
-        tmp16 = tmp14 + tmp15
-        tmp17 = tmp0 + tmp16
-        tl.store(in_out_ptr0 + (x2), tmp17, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8 = loadKernel("/home/gasoonjia/executorch/cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_8", 0, cubin_dir_);
-    }
-    CUdeviceptr var_52 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_53 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_54 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_55 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_56 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    CUdeviceptr var_57 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
-    int var_58 = xnumel;
-    CUdeviceptr global_scratch_59 = 0;
-    void* kernel_args_[] = {&var_52, &var_53, &var_54, &var_55, &var_56, &var_57, &var_58, &global_scratch_59};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_8, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
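The `add_8` kernel is the inverted-residual skip connection fused into the normalization: `in_out_ptr0` already holds the identity branch, `in_ptr0` holds the projection output, and `tmp17 = tmp0 + tmp16` adds the normalized projection in place. A numpy sketch of the same dataflow (placeholder parameter values; 75264 = 3136 x 24 matches `x0 = xindex % 24`):

```python
# Sketch of ..._batch_norm_legit_no_training_add_8: identity + BN(projection).
import numpy as np

def bn(x, mean, var, gamma, beta, eps=1e-05):
    return (x - mean) / np.sqrt(var + eps) * gamma + beta

C = 24
identity = np.random.randn(3136, C).astype(np.float32)   # what in_out_ptr0 holds
proj_out = np.random.randn(3136, C).astype(np.float32)   # what in_ptr0 holds
mean, var = np.zeros(C, np.float32), np.ones(C, np.float32)
gamma, beta = np.ones(C, np.float32), np.zeros(C, np.float32)

identity += bn(proj_out, mean, var, gamma, beta)          # tmp17 = tmp0 + tmp16
```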
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 131072},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1357056}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 112896
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 144)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tmp16 = 0.0
-        tmp17 = triton_helpers.maximum(tmp15, tmp16)
-        tmp18 = 6.0
-        tmp19 = triton_helpers.minimum(tmp17, tmp18)
-        tl.store(in_out_ptr0 + (x2), tmp19, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (512 - 1)) / (512));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9 = loadKernel("/home/gasoonjia/executorch/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9", 0, cubin_dir_);
-    }
-    CUdeviceptr var_60 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_61 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_62 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_63 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_64 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_65 = xnumel;
-    CUdeviceptr global_scratch_66 = 0;
-    void* kernel_args_[] = {&var_60, &var_61, &var_62, &var_63, &var_64, &var_65, &global_scratch_66};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_);
-}
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_10(
-    const in_out_ptr0_type_& in_out_ptr0,
-    const in_ptr0_type_& in_ptr0,
-    const in_ptr1_type_& in_ptr1,
-    const in_ptr2_type_& in_ptr2,
-    const in_ptr3_type_& in_ptr3,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_10', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'x': 32768},
-        filename=__file__,
-        triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 301568}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused__native_batch_norm_legit_no_training_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
-        xnumel = 25088
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[:]
-        xmask = xindex < xnumel
-        x2 = xindex
-        x0 = (xindex % 32)
-        tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
-        tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
-        tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
-        tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
-        tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
-        tmp2 = tmp0 - tmp1
-        tmp4 = 1e-05
-        tmp5 = tmp3 + tmp4
-        tmp6 = libdevice.sqrt(tmp5)
-        tmp7 = tl.full([1], 1, tl.int32)
-        tmp8 = (tmp7 / tmp6)
-        tmp9 = 1.0
-        tmp10 = tmp8 * tmp9
-        tmp11 = tmp2 * tmp10
-        tmp13 = tmp11 * tmp12
-        tmp15 = tmp13 + tmp14
-        tl.store(in_out_ptr0 + (x2), tmp15, xmask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
-    uint32_t grid_1 = 1;
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 == nullptr) {
-        kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 = loadKernel("/home/gasoonjia/executorch/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_10", 0, cubin_dir_);
-    }
-    CUdeviceptr var_67 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
-    CUdeviceptr var_68 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_69 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
-    CUdeviceptr var_70 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
-    CUdeviceptr var_71 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
-    int var_72 = xnumel;
-    CUdeviceptr global_scratch_73 = 0;
-    void* kernel_args_[] = {&var_67, &var_68, &var_69, &var_70, &var_71, &var_72, &global_scratch_73};
-    launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
-}
-
torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_10', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 301568}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_10(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 25088 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 32) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tl.store(in_out_ptr0 + (x2), tmp15, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10 = loadKernel("/home/gasoonjia/executorch/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_10", 0, cubin_dir_); - } - CUdeviceptr var_67 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_68 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_69 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_70 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_71 = reinterpret_cast(in_ptr3.data_ptr()); - int var_72 = xnumel; - CUdeviceptr global_scratch_73 = 0; - void* kernel_args_[] = {&var_67, &var_68, &var_69, &var_70, &var_71, &var_72, 
&global_scratch_73}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_10, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 262144}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1809408}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 150528 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 192) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ 
- uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11 = loadKernel("/home/gasoonjia/executorch/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11", 0, cubin_dir_); - } - CUdeviceptr var_74 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_75 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_76 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_77 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_78 = reinterpret_cast(in_ptr3.data_ptr()); - int var_79 = xnumel; - CUdeviceptr global_scratch_80 = 0; - void* kernel_args_[] = {&var_74, &var_75, &var_76, &var_77, &var_78, &var_79, &global_scratch_80}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_12( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - const in_ptr4_type_& in_ptr4, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_12', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_12', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 401920}}, - min_elem_per_thread=0 - ) - @triton.jit - def 
triton_poi_fused__native_batch_norm_legit_no_training_add_12(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr): - xnumel = 25088 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 32) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x2), xmask) - tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tmp1 - tmp2 - tmp5 = 1e-05 - tmp6 = tmp4 + tmp5 - tmp7 = libdevice.sqrt(tmp6) - tmp8 = tl.full([1], 1, tl.int32) - tmp9 = (tmp8 / tmp7) - tmp10 = 1.0 - tmp11 = tmp9 * tmp10 - tmp12 = tmp3 * tmp11 - tmp14 = tmp12 * tmp13 - tmp16 = tmp14 + tmp15 - tmp17 = tmp0 + tmp16 - tl.store(in_out_ptr0 + (x2), tmp17, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12 = loadKernel("/home/gasoonjia/executorch/c4id4zognxxqwo4qci5zcry3oobj4eoerxfp5yxnlo5pdfcwnqtn.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_12", 0, cubin_dir_); - } - CUdeviceptr var_81 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_82 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_83 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_84 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_85 = reinterpret_cast(in_ptr3.data_ptr()); - CUdeviceptr var_86 = reinterpret_cast(in_ptr4.data_ptr()); - int var_87 = xnumel; - CUdeviceptr global_scratch_88 = 0; - void* kernel_args_[] = {&var_81, &var_82, &var_83, &var_84, &var_85, &var_86, &var_87, &global_scratch_88}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_12, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 65536}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): 
[['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 454656}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 37632 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 192) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13 = loadKernel("/home/gasoonjia/executorch/cxn357cdpjzfyhgfzkziumdqzvax6wmbfva3bo36qlb2w5deusut.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13", 0, cubin_dir_); - } - CUdeviceptr var_89 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_90 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_91 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_92 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_93 = reinterpret_cast(in_ptr3.data_ptr()); - int var_94 = xnumel; - CUdeviceptr global_scratch_95 = 0; - void* kernel_args_[] = {&var_89, &var_90, &var_91, &var_92, &var_93, &var_94, &global_scratch_95}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_14( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& 
kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_14', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 16384}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_14', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 151552}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_14(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 12544 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 64) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tl.store(in_out_ptr0 + (x2), tmp15, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14 = loadKernel("/home/gasoonjia/executorch/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_14", 0, cubin_dir_); - } - CUdeviceptr var_96 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_97 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_98 = 
reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_99 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_100 = reinterpret_cast(in_ptr3.data_ptr()); - int var_101 = xnumel; - CUdeviceptr global_scratch_102 = 0; - void* kernel_args_[] = {&var_96, &var_97, &var_98, &var_99, &var_100, &var_101, &global_scratch_102}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_14, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 131072}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 909312}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 75264 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 384) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) 
- tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15 = loadKernel("/home/gasoonjia/executorch/caqye62oxfgou2x7ke4dl35rberxbjhgbjfnpcgtkr4avrno4ixy.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15", 0, cubin_dir_); - } - CUdeviceptr var_103 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_104 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_105 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_106 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_107 = reinterpret_cast(in_ptr3.data_ptr()); - int var_108 = xnumel; - CUdeviceptr global_scratch_109 = 0; - void* kernel_args_[] = {&var_103, &var_104, &var_105, &var_106, &var_107, &var_108, &global_scratch_109}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_16( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - const in_ptr4_type_& in_ptr4, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_16', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 16384}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_16', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': 
None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 201728}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_add_16(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr): - xnumel = 12544 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 64) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x2), xmask) - tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tmp1 - tmp2 - tmp5 = 1e-05 - tmp6 = tmp4 + tmp5 - tmp7 = libdevice.sqrt(tmp6) - tmp8 = tl.full([1], 1, tl.int32) - tmp9 = (tmp8 / tmp7) - tmp10 = 1.0 - tmp11 = tmp9 * tmp10 - tmp12 = tmp3 * tmp11 - tmp14 = tmp12 * tmp13 - tmp16 = tmp14 + tmp15 - tmp17 = tmp0 + tmp16 - tl.store(in_out_ptr0 + (x2), tmp17, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16 = loadKernel("/home/gasoonjia/executorch/cafig5mi4e5ufzbj47ahikyfz3zcex4yxqvcdqpm27f6d4mtoxbo.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_16", 0, cubin_dir_); - } - CUdeviceptr var_110 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_111 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_112 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_113 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_114 = reinterpret_cast(in_ptr3.data_ptr()); - CUdeviceptr var_115 = reinterpret_cast(in_ptr4.data_ptr()); - int var_116 = xnumel; - CUdeviceptr global_scratch_117 = 0; - void* kernel_args_[] = {&var_110, &var_111, &var_112, &var_113, &var_114, &var_115, &var_116, &global_scratch_117}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_16, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_17( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_17', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': 
'*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_17', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 227328}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_17(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 18816 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 96) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tl.store(in_out_ptr0 + (x2), tmp15, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17 = loadKernel("/home/gasoonjia/executorch/ctc4njxfwewhkkjkreaoqgsbyrr7s3dbfmgdfcunjbmfgrzqksu4.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_17", 0, cubin_dir_); - } - CUdeviceptr var_118 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_119 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_120 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_121 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_122 = reinterpret_cast(in_ptr3.data_ptr()); - int var_123 = xnumel; - CUdeviceptr global_scratch_124 = 0; - void* kernel_args_[] = {&var_118, &var_119, &var_120, &var_121, &var_122, &var_123, &global_scratch_124}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_17, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const 
in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 131072}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 1363968}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 112896 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 576) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (512 - 1)) / (512)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18 = 
loadKernel("/home/gasoonjia/executorch/cklg2ezqvtkbhlekhvyenxwrgnlwt2msvmc7427nuluwqezzy5lx.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18", 0, cubin_dir_); - } - CUdeviceptr var_125 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_126 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_127 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_128 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_129 = reinterpret_cast(in_ptr3.data_ptr()); - int var_130 = xnumel; - CUdeviceptr global_scratch_131 = 0; - void* kernel_args_[] = {&var_125, &var_126, &var_127, &var_128, &var_129, &var_130, &global_scratch_131}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18, grid_0, grid_1, grid_2, 8, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_19( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - const in_ptr4_type_& in_ptr4, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_19', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_19', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 302592}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_add_19(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr): - xnumel = 18816 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 96) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) 
- tmp1 = tl.load(in_ptr0 + (x2), xmask) - tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tmp1 - tmp2 - tmp5 = 1e-05 - tmp6 = tmp4 + tmp5 - tmp7 = libdevice.sqrt(tmp6) - tmp8 = tl.full([1], 1, tl.int32) - tmp9 = (tmp8 / tmp7) - tmp10 = 1.0 - tmp11 = tmp9 * tmp10 - tmp12 = tmp3 * tmp11 - tmp14 = tmp12 * tmp13 - tmp16 = tmp14 + tmp15 - tmp17 = tmp0 + tmp16 - tl.store(in_out_ptr0 + (x2), tmp17, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19 = loadKernel("/home/gasoonjia/executorch/c3sj66uvazrx3drgx5zzvxlffnqf3kezaikukfqbiue2bb2vcbdg.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_19", 0, cubin_dir_); - } - CUdeviceptr var_132 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_133 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_134 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_135 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_136 = reinterpret_cast(in_ptr3.data_ptr()); - CUdeviceptr var_137 = reinterpret_cast(in_ptr4.data_ptr()); - int var_138 = xnumel; - CUdeviceptr global_scratch_139 = 0; - void* kernel_args_[] = {&var_132, &var_133, &var_134, &var_135, &var_136, &var_137, &var_138, &global_scratch_139}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_19, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 32768}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20', 
'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 347904}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 28224 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 576) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (256 - 1)) / (256)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20 = loadKernel("/home/gasoonjia/executorch/c2oewcn4k655ga3vky43nudfhqe4py7nuxkauuy7fcrnhwyg4gsl.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20", 0, cubin_dir_); - } - CUdeviceptr var_140 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_141 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_142 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_143 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_144 = reinterpret_cast(in_ptr3.data_ptr()); - int var_145 = xnumel; - CUdeviceptr global_scratch_146 = 0; - void* kernel_args_[] = {&var_140, &var_141, &var_142, &var_143, &var_144, &var_145, &global_scratch_146}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_21( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_21', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import 
libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 8192}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_21', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 96640}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_21(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 7840 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 160) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tl.store(in_out_ptr0 + (x2), tmp15, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (128 - 1)) / (128)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21 = loadKernel("/home/gasoonjia/executorch/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_21", 0, cubin_dir_); - } - CUdeviceptr var_147 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_148 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_149 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_150 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_151 = reinterpret_cast(in_ptr3.data_ptr()); - int var_152 = xnumel; - CUdeviceptr global_scratch_153 = 0; - void* kernel_args_[] = {&var_147, &var_148, &var_149, &var_150, &var_151, &var_152, &global_scratch_153}; - 
launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_21, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 65536}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 579840}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr): - xnumel = 47040 - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[:] - xmask = xindex < xnumel - x2 = xindex - x0 = (xindex % 960) - tmp0 = tl.load(in_out_ptr0 + (x2), xmask) - tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last') - tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last') - tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last') - tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last') - tmp2 = tmp0 - tmp1 - tmp4 = 1e-05 - tmp5 = tmp3 + tmp4 - tmp6 = libdevice.sqrt(tmp5) - tmp7 = tl.full([1], 1, tl.int32) - tmp8 = (tmp7 / tmp6) - tmp9 = 1.0 - tmp10 = tmp8 * tmp9 - tmp11 = tmp2 * tmp10 - tmp13 = tmp11 * tmp12 - tmp15 = tmp13 + tmp14 - tmp16 = 0.0 - tmp17 = triton_helpers.maximum(tmp15, tmp16) - tmp18 = 6.0 - tmp19 = triton_helpers.minimum(tmp17, tmp18) - tl.store(in_out_ptr0 + (x2), tmp19, xmask) - ''', device_str='cuda') - */ - uint32_t grid_0 = 
((xnumel + (512 - 1)) / (512)); - uint32_t grid_1 = 1; - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22 == nullptr) { - kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22 = loadKernel("/home/gasoonjia/executorch/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22", 0, cubin_dir_); - } - CUdeviceptr var_154 = reinterpret_cast(in_out_ptr0.data_ptr()); - CUdeviceptr var_155 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_156 = reinterpret_cast(in_ptr1.data_ptr()); - CUdeviceptr var_157 = reinterpret_cast(in_ptr2.data_ptr()); - CUdeviceptr var_158 = reinterpret_cast(in_ptr3.data_ptr()); - int var_159 = xnumel; - CUdeviceptr global_scratch_160 = 0; - void* kernel_args_[] = {&var_154, &var_155, &var_156, &var_157, &var_158, &var_159, &global_scratch_160}; - launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_23( - const in_out_ptr0_type_& in_out_ptr0, - const in_ptr0_type_& in_ptr0, - const in_ptr1_type_& in_ptr1, - const in_ptr2_type_& in_ptr2, - const in_ptr3_type_& in_ptr3, - const in_ptr4_type_& in_ptr4, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_23', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'x': 8192}, - filename=__file__, - triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_23', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 128000}}, - min_elem_per_thread=0 - ) - @triton.jit - def 
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(
- const in_out_ptr0_type_& in_out_ptr0,
- const in_ptr0_type_& in_ptr0,
- const in_ptr1_type_& in_ptr1,
- const in_ptr2_type_& in_ptr2,
- const in_ptr3_type_& in_ptr3,
- const in_ptr4_type_& in_ptr4,
- int64_t xnumel,
- int32_t device_idx_,
- cudaStream_t stream_,
- kernels_type_& kernels_,
- const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
- /*
- async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_add_23', '''
- import triton
- import triton.language as tl
-
- from torch._inductor.runtime import triton_helpers, triton_heuristics
- from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
- from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
- triton_helpers.set_driver_to_gpu()
-
- @triton_heuristics.pointwise(
- size_hints={'x': 8192},
- filename=__file__,
- triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
- inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_add_23', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 6, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 128000}},
- min_elem_per_thread=0
- )
- @triton.jit
- def triton_poi_fused__native_batch_norm_legit_no_training_add_23(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, XBLOCK : tl.constexpr):
- xnumel = 7840
- xoffset = tl.program_id(0) * XBLOCK
- xindex = xoffset + tl.arange(0, XBLOCK)[:]
- xmask = xindex < xnumel
- x2 = xindex
- x0 = (xindex % 160)
- tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
- tmp1 = tl.load(in_ptr0 + (x2), xmask)
- tmp2 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
- tmp4 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
- tmp13 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
- tmp15 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
- tmp3 = tmp1 - tmp2
- tmp5 = 1e-05
- tmp6 = tmp4 + tmp5
- tmp7 = libdevice.sqrt(tmp6)
- tmp8 = tl.full([1], 1, tl.int32)
- tmp9 = (tmp8 / tmp7)
- tmp10 = 1.0
- tmp11 = tmp9 * tmp10
- tmp12 = tmp3 * tmp11
- tmp14 = tmp12 * tmp13
- tmp16 = tmp14 + tmp15
- tmp17 = tmp0 + tmp16
- tl.store(in_out_ptr0 + (x2), tmp17, xmask)
- ''', device_str='cuda')
- */
- uint32_t grid_0 = ((xnumel + (128 - 1)) / (128));
- uint32_t grid_1 = 1;
- uint32_t grid_2 = 1;
- if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
- if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23 == nullptr) {
- kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23 = loadKernel("/home/gasoonjia/executorch/c2yybeoyrkfdeh34rwaadbn7z3xbhkdmautjebwjj3cnspt7codl.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_add_23", 0, cubin_dir_);
- }
- CUdeviceptr var_161 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
- CUdeviceptr var_162 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
- CUdeviceptr var_163 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
- CUdeviceptr var_164 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
- CUdeviceptr var_165 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
- CUdeviceptr var_166 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
- int var_167 = xnumel;
- CUdeviceptr global_scratch_168 = 0;
- void* kernel_args_[] = {&var_161, &var_162, &var_163, &var_164, &var_165, &var_166, &var_167, &global_scratch_168};
- launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_add_23, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
-}
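The `*_add_23` variant fuses the same normalization with a residual connection: the batch-normed branch (`in_ptr0`) is added to the skip tensor already held in `in_out_ptr0`, with no clamp. A scalar sketch under the same assumptions as above (names are illustrative):

    // out = skip + ((x - mean) / sqrt(var + eps)) * gamma + beta
    #include <cmath>

    static float bn_residual_add(float skip, float x, float mean, float var,
                                 float gamma, float beta) {
        const float eps = 1e-05f;  // tmp5 in the Triton source
        return skip + (x - mean) / std::sqrt(var + eps) * gamma + beta;
    }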
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename kernels_type_>
-static inline void call_triton_poi_fused__native_batch_norm_legit_no_training_24(
- const in_out_ptr0_type_& in_out_ptr0,
- const in_ptr0_type_& in_ptr0,
- const in_ptr1_type_& in_ptr1,
- const in_ptr2_type_& in_ptr2,
- const in_ptr3_type_& in_ptr3,
- int64_t xnumel,
- int32_t device_idx_,
- cudaStream_t stream_,
- kernels_type_& kernels_,
- const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
- /*
- async_compile.triton('triton_poi_fused__native_batch_norm_legit_no_training_24', '''
- import triton
- import triton.language as tl
-
- from torch._inductor.runtime import triton_helpers, triton_heuristics
- from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
- from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
- triton_helpers.set_driver_to_gpu()
-
- @triton_heuristics.pointwise(
- size_hints={'x': 16384},
- filename=__file__,
- triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]]}]},
- inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__native_batch_norm_legit_no_training_24', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 193280}},
- min_elem_per_thread=0
- )
- @triton.jit
- def triton_poi_fused__native_batch_norm_legit_no_training_24(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, xnumel, XBLOCK : tl.constexpr):
- xnumel = 15680
- xoffset = tl.program_id(0) * XBLOCK
- xindex = xoffset + tl.arange(0, XBLOCK)[:]
- xmask = xindex < xnumel
- x2 = xindex
- x0 = (xindex % 320)
- tmp0 = tl.load(in_out_ptr0 + (x2), xmask)
- tmp1 = tl.load(in_ptr0 + (x0), xmask, eviction_policy='evict_last')
- tmp3 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
- tmp12 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
- tmp14 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
- tmp2 = tmp0 - tmp1
- tmp4 = 1e-05
- tmp5 = tmp3 + tmp4
- tmp6 = libdevice.sqrt(tmp5)
- tmp7 = tl.full([1], 1, tl.int32)
- tmp8 = (tmp7 / tmp6)
- tmp9 = 1.0
- tmp10 = tmp8 * tmp9
- tmp11 = tmp2 * tmp10
- tmp13 = tmp11 * tmp12
- tmp15 = tmp13 + tmp14
- tl.store(in_out_ptr0 + (x2), tmp15, xmask)
- ''', device_str='cuda')
- */
- uint32_t grid_0 = ((xnumel + (256 - 1)) / (256));
- uint32_t grid_1 = 1;
- uint32_t grid_2 = 1;
- if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
- if (kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24 == nullptr) {
- kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24 = loadKernel("/home/gasoonjia/executorch/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin", "triton_poi_fused__native_batch_norm_legit_no_training_24", 0, cubin_dir_);
- }
- CUdeviceptr var_169 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
- CUdeviceptr var_170 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
- CUdeviceptr var_171 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
- CUdeviceptr var_172 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
- CUdeviceptr var_173 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
- int var_174 = xnumel;
- CUdeviceptr global_scratch_175 = 0;
- void* kernel_args_[] = {&var_169, &var_170, &var_171, &var_172, &var_173, &var_174, &global_scratch_175};
- launchKernel(kernels_.triton_poi_fused__native_batch_norm_legit_no_training_24, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
-}
-
-template <typename in_out_ptr0_type_, typename in_ptr0_type_, typename in_ptr1_type_, typename in_ptr2_type_, typename in_ptr3_type_, typename in_ptr4_type_, typename kernels_type_>
-static inline void call_triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(
- const in_out_ptr0_type_& in_out_ptr0,
- const in_ptr0_type_& in_ptr0,
- const in_ptr1_type_& in_ptr1,
- const in_ptr2_type_& in_ptr2,
- const in_ptr3_type_& in_ptr3,
- const in_ptr4_type_& in_ptr4,
- int64_t xnumel,
- int64_t r0_numel,
- int32_t device_idx_,
- cudaStream_t stream_,
- kernels_type_& kernels_,
- const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
- /*
- async_compile.triton('triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25', '''
- import triton
- import triton.language as tl
-
- from torch._inductor.runtime import triton_helpers, triton_heuristics
- from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
- from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
- triton_helpers.set_driver_to_gpu()
-
- @triton_heuristics.persistent_reduction(
- size_hints={'x': 2048, 'r0_': 64},
- reduction_hint=ReductionHint.OUTER,
- filename=__file__,
- triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*fp32', 'in_ptr1': '*fp32', 'in_ptr2': '*fp32', 'in_ptr3': '*fp32', 'in_ptr4': '*fp32', 'xnumel': 'i32', 'r0_numel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]], (4,): [['tt.divisibility', 16]], (5,): [['tt.divisibility', 16]], (6,): [['tt.divisibility', 16]]}]},
- inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 1, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 281600, 'r0_': 0}}
- )
- @triton.jit
- def triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, xnumel, r0_numel, XBLOCK : tl.constexpr):
- xnumel = 1280
- r0_numel = 49
- R0_BLOCK: tl.constexpr = 64
- rnumel = r0_numel
- RBLOCK: tl.constexpr = R0_BLOCK
- xoffset = tl.program_id(0) * XBLOCK
- xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
- xmask = xindex < xnumel
- r0_index = tl.arange(0, R0_BLOCK)[None, :]
- r0_offset = 0
- r0_mask = r0_index < r0_numel
- roffset = r0_offset
- rindex = r0_index
- r0_1 = r0_index
- x0 = xindex
- tmp0 = tl.load(in_ptr0 + (x0 + 1280*r0_1), r0_mask & xmask, other=0.0)
- tmp1 = tl.load(in_ptr1 + (x0), xmask, eviction_policy='evict_last')
- tmp3 = tl.load(in_ptr2 + (x0), xmask, eviction_policy='evict_last')
- tmp12 = tl.load(in_ptr3 + (x0), xmask, eviction_policy='evict_last')
- tmp14 = tl.load(in_ptr4 + (x0), xmask, eviction_policy='evict_last')
- tmp2 = tmp0 - tmp1
- tmp4 = 1e-05
- tmp5 = tmp3 + tmp4
- tmp6 = libdevice.sqrt(tmp5)
- tmp7 = tl.full([1, 1], 1, tl.int32)
- tmp8 = (tmp7 / tmp6)
- tmp9 = 1.0
- tmp10 = tmp8 * tmp9
- tmp11 = tmp2 * tmp10
- tmp13 = tmp11 * tmp12
- tmp15 = tmp13 + tmp14
- tmp16 = 0.0
- tmp17 = triton_helpers.maximum(tmp15, tmp16)
- tmp18 = 6.0
- tmp19 = triton_helpers.minimum(tmp17, tmp18)
- tmp20 = tl.broadcast_to(tmp19, [XBLOCK, R0_BLOCK])
- tmp22 = tl.where(r0_mask & xmask, tmp20, 0)
- tmp23 = tl.sum(tmp22, 1)[:, None]
- tmp24 = 49.0
- tmp25 = (tmp23 / tmp24)
- tl.debug_barrier()
- tl.store(in_out_ptr0 + (x0), tmp25, xmask)
- ''', device_str='cuda')
- */
- uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
- uint32_t grid_1 = 1;
- uint32_t grid_2 = 1;
- if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
- if (kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25 == nullptr) {
- kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25 = loadKernel("/home/gasoonjia/executorch/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin", "triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25", 1024, cubin_dir_);
- }
- CUdeviceptr var_176 = reinterpret_cast<CUdeviceptr>(in_out_ptr0.data_ptr());
- CUdeviceptr var_177 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
- CUdeviceptr var_178 = reinterpret_cast<CUdeviceptr>(in_ptr1.data_ptr());
- CUdeviceptr var_179 = reinterpret_cast<CUdeviceptr>(in_ptr2.data_ptr());
- CUdeviceptr var_180 = reinterpret_cast<CUdeviceptr>(in_ptr3.data_ptr());
- CUdeviceptr var_181 = reinterpret_cast<CUdeviceptr>(in_ptr4.data_ptr());
- int var_182 = xnumel;
- int var_183 = r0_numel;
- CUdeviceptr global_scratch_184 = 0;
- void* kernel_args_[] = {&var_176, &var_177, &var_178, &var_179, &var_180, &var_181, &var_182, &var_183, &global_scratch_184};
- launchKernel(kernels_.triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25, grid_0, grid_1, grid_2, 8, 1024, kernel_args_, stream_);
-}
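Kernel 25 is the one persistent reduction in this file: for each of the 1280 channels it applies the batch-norm transform plus ReLU6 to the 49 values of the final 7x7 feature map and averages them, i.e. MobileNetV2's global average pool before the classifier. A scalar sketch of the same computation, assuming the `x0 + 1280*r0_1` layout from the Triton source (names are illustrative):

    #include <algorithm>
    #include <cmath>

    // out[c] = mean over 49 positions of clamp(bn(in[r][c]), 0, 6)
    static void bn_hardtanh_mean(const float* in /* [49][1280] */, float* out,
                                 const float* mean, const float* var,
                                 const float* gamma, const float* beta) {
        for (int c = 0; c < 1280; ++c) {
            float acc = 0.0f;
            for (int r = 0; r < 49; ++r) {
                const float y = (in[r * 1280 + c] - mean[c])
                                / std::sqrt(var[c] + 1e-05f) * gamma[c] + beta[c];
                acc += std::min(std::max(y, 0.0f), 6.0f);
            }
            out[c] = acc / 49.0f;
        }
    }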
-
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_permute_copy_26(
- const in_ptr0_type_& in_ptr0,
- const out_ptr0_type_& out_ptr0,
- int64_t xnumel,
- int32_t device_idx_,
- cudaStream_t stream_,
- kernels_type_& kernels_,
- const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
- /*
- async_compile.triton('triton_poi_fused_permute_copy_26', '''
- import triton
- import triton.language as tl
-
- from torch._inductor.runtime import triton_helpers, triton_heuristics
- from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
- from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
- triton_helpers.set_driver_to_gpu()
-
- @triton_heuristics.pointwise(
- size_hints={'x': 2097152},
- filename=__file__,
- triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'xnumel': 'i32', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (2,): [['tt.divisibility', 16]]}]},
- inductor_meta={'grid_type': 'Grid1D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_permute_copy_26', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'x': 15360000}},
- min_elem_per_thread=0
- )
- @triton.jit
- def triton_poi_fused_permute_copy_26(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
- xnumel = 1280000
- xoffset = tl.program_id(0) * XBLOCK
- xindex = xoffset + tl.arange(0, XBLOCK)[:]
- xmask = xindex < xnumel
- x0 = xindex
- tmp0 = tl.load(in_ptr0 + (x0), xmask)
- tl.store(out_ptr0 + (x0), tmp0, xmask)
- ''', device_str='cuda')
- */
- uint32_t grid_0 = ((xnumel + (1024 - 1)) / (1024));
- uint32_t grid_1 = 1;
- uint32_t grid_2 = 1;
- if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
- if (kernels_.triton_poi_fused_permute_copy_26 == nullptr) {
- kernels_.triton_poi_fused_permute_copy_26 = loadKernel("/home/gasoonjia/executorch/czj7vvfy745m4rwqvkdetdltbkwsdx6kjaldi7zklwlc3zi37bno.cubin", "triton_poi_fused_permute_copy_26", 0, cubin_dir_);
- }
- CUdeviceptr var_185 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
- CUdeviceptr var_186 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
- int var_187 = xnumel;
- CUdeviceptr global_scratch_188 = 0;
- void* kernel_args_[] = {&var_185, &var_186, &var_187, &global_scratch_188};
- launchKernel(kernels_.triton_poi_fused_permute_copy_26, grid_0, grid_1, grid_2, 4, 0, kernel_args_, stream_);
-}
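Every wrapper above follows the same lazy-loading pattern: the first call loads the kernel's .cubin and caches the function handle in `kernels_`; later calls only compute the 1-D grid, `ceil(xnumel / XBLOCK)`, and launch. A reduced sketch of roughly what the `loadKernel`/`launchKernel` helpers amount to, written directly against the CUDA driver API (error handling elided; the helper names and num-warps values come from the calls above, everything else is illustrative):

    #include <cuda.h>
    #include <cstdint>

    // Load a compiled .cubin once and cache the function handle.
    static CUfunction get_kernel(CUfunction& cached, const char* cubin_path,
                                 const char* name) {
        if (cached == nullptr) {
            CUmodule mod;
            cuModuleLoad(&mod, cubin_path);
            cuModuleGetFunction(&cached, mod, name);
        }
        return cached;
    }

    // 1-D launch: grid_0 = ceil(xnumel / XBLOCK); the block is 32 threads per
    // warp times num_warps (4 for the pointwise kernels, 8 for the reduction).
    static void launch_1d(CUfunction f, uint32_t grid_0, unsigned num_warps,
                          unsigned shared_bytes, void** args, CUstream stream) {
        cuLaunchKernel(f, grid_0, 1, 1, 32u * num_warps, 1, 1,
                       shared_bytes, stream, args, nullptr);
    }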
-
-namespace torch::aot_inductor {
-
-void AOTInductorModel::_const_run_impl(
- std::vector<AtenTensorHandle>& output_handles,
- DeviceStreamType stream,
- AOTIProxyExecutorHandle proxy_executor
-) {}
-
-AOTI_NOINLINE static void check_input_0(
- AtenTensorHandle* input_handles
-) {
- ConstantHandle arg262_1 = ConstantHandle(input_handles[0]);
- int32_t arg262_1_dtype;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg262_1, &arg262_1_dtype));
-
- int32_t arg262_1_expected_dtype = aoti_torch_dtype_float32();
- if (arg262_1_expected_dtype != arg262_1_dtype) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dtype, "
- << "expected: " << arg262_1_expected_dtype << "(at::kFloat), "
- << "but got: " << arg262_1_dtype << "\n";
- throw std::runtime_error(ss.str());
- }
- auto arg262_1_size = arg262_1.sizes();
-
- if (1 != arg262_1_size[0]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dim value at 0, "
- << "expected: 1, " << "but got: " << arg262_1_size[0]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (3 != arg262_1_size[1]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dim value at 1, "
- << "expected: 3, " << "but got: " << arg262_1_size[1]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (224 != arg262_1_size[2]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dim value at 2, "
- << "expected: 224, " << "but got: " << arg262_1_size[2]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (224 != arg262_1_size[3]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched dim value at 3, "
- << "expected: 224, " << "but got: " << arg262_1_size[3]
- << "\n";
- throw std::runtime_error(ss.str());
- }
- auto arg262_1_stride = arg262_1.strides();
-
- if (150528 != arg262_1_stride[0]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched stride value at 0, "
- << "expected: 150528, " << "but got: " << arg262_1_stride[0]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (50176 != arg262_1_stride[1]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched stride value at 1, "
- << "expected: 50176, " << "but got: " << arg262_1_stride[1]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (224 != arg262_1_stride[2]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched stride value at 2, "
- << "expected: 224, " << "but got: " << arg262_1_stride[2]
- << "\n";
- throw std::runtime_error(ss.str());
- }
-
- if (1 != arg262_1_stride[3]) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched stride value at 3, "
- << "expected: 1, " << "but got: " << arg262_1_stride[3]
- << "\n";
- throw std::runtime_error(ss.str());
- }
- int32_t arg262_1_device_type;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg262_1, &arg262_1_device_type));
-
- int32_t arg262_1_expected_device_type = 1;
- if (arg262_1_expected_device_type != arg262_1_device_type) {
- std::stringstream ss;
- ss << "input_handles[0]: unmatched device type, "
- << "expected: " << arg262_1_expected_device_type << "1(cuda), "
- << "but got: " << arg262_1_device_type << "\n";
- throw std::runtime_error(ss.str());
- }
-}
-
-static bool _check_aoti_runtime_check_inputs_env() {
- const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS");
- const static bool result = env_var_value != nullptr && env_var_value[0] != '0';
- return result;
-}
-
-AOTI_NOINLINE static void __check_inputs_outputs(
- AtenTensorHandle* input_handles,
- AtenTensorHandle* output_handles) {
- if (!_check_aoti_runtime_check_inputs_env()){
- return;
- }
- check_input_0(input_handles);
-}
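The shape/stride/dtype/device validation above is compiled in but off by default: `__check_inputs_outputs` returns immediately unless the `AOTI_RUNTIME_CHECK_INPUTS` environment variable is set to a value whose first character is not '0'. A hypothetical host-side snippet to opt in:

    #include <cstdlib>

    int main() {
        // Enable the generated input checks; any value not starting with '0' works.
        setenv("AOTI_RUNTIME_CHECK_INPUTS", "1", /*overwrite=*/1);
        // ... load and run the AOTI model; input 0 must now be a contiguous
        // float32 CUDA tensor of shape [1, 3, 224, 224] or run_impl throws.
        return 0;
    }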
-
-void AOTInductorModel::run_impl(
- AtenTensorHandle*
- input_handles, // array of input AtenTensorHandle; handles
- // are stolen; the array itself is borrowed
- AtenTensorHandle*
- output_handles, // array for writing output AtenTensorHandle; handles
- // will be stolen by the caller; the array itself is
- // borrowed
- DeviceStreamType stream,
- AOTIProxyExecutorHandle proxy_executor
-) {
- __check_inputs_outputs(input_handles, output_handles);
-
- auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1);
- auto arg262_1 = std::move(inputs[0]);
- [[maybe_unused]] auto& mv2_features_0_0_weight = constants_->at(0);
- [[maybe_unused]] auto& mv2_features_0_1_weight = constants_->at(1);
- [[maybe_unused]] auto& mv2_features_0_1_bias = constants_->at(2);
- [[maybe_unused]] auto& mv2_features_1_conv_0_0_weight = constants_->at(3);
- [[maybe_unused]] auto& mv2_features_1_conv_0_1_weight = constants_->at(4);
- [[maybe_unused]] auto& mv2_features_1_conv_0_1_bias = constants_->at(5);
- [[maybe_unused]] auto& mv2_features_1_conv_1_weight = constants_->at(6);
- [[maybe_unused]] auto& mv2_features_1_conv_2_weight = constants_->at(7);
- [[maybe_unused]] auto& mv2_features_1_conv_2_bias = constants_->at(8);
- [[maybe_unused]] auto& mv2_features_2_conv_0_0_weight = constants_->at(9);
- [[maybe_unused]] auto& mv2_features_2_conv_0_1_weight = constants_->at(10);
- [[maybe_unused]] auto& mv2_features_2_conv_0_1_bias = constants_->at(11);
- [[maybe_unused]] auto& mv2_features_2_conv_1_0_weight = constants_->at(12);
- [[maybe_unused]] auto& mv2_features_2_conv_1_1_weight = constants_->at(13);
- [[maybe_unused]] auto& mv2_features_2_conv_1_1_bias = constants_->at(14);
- [[maybe_unused]] auto& mv2_features_2_conv_2_weight = constants_->at(15);
- [[maybe_unused]] auto& mv2_features_2_conv_3_weight = constants_->at(16);
- [[maybe_unused]] auto& mv2_features_2_conv_3_bias = constants_->at(17);
- [[maybe_unused]] auto& mv2_features_3_conv_0_0_weight = constants_->at(18);
- [[maybe_unused]] auto& mv2_features_3_conv_0_1_weight = constants_->at(19);
- [[maybe_unused]] auto& mv2_features_3_conv_0_1_bias = constants_->at(20);
- [[maybe_unused]] auto& mv2_features_3_conv_1_0_weight = constants_->at(21);
- [[maybe_unused]] auto& mv2_features_3_conv_1_1_weight = constants_->at(22);
- [[maybe_unused]] auto& mv2_features_3_conv_1_1_bias = constants_->at(23);
- [[maybe_unused]] auto& mv2_features_3_conv_2_weight = constants_->at(24);
- [[maybe_unused]] auto& mv2_features_3_conv_3_weight = constants_->at(25);
- [[maybe_unused]] auto& mv2_features_3_conv_3_bias = constants_->at(26);
- [[maybe_unused]] auto& mv2_features_4_conv_0_0_weight = constants_->at(27);
- [[maybe_unused]] auto& mv2_features_4_conv_0_1_weight = constants_->at(28);
- [[maybe_unused]] auto& mv2_features_4_conv_0_1_bias = constants_->at(29);
- [[maybe_unused]] auto& mv2_features_4_conv_1_0_weight = constants_->at(30);
- [[maybe_unused]] auto& mv2_features_4_conv_1_1_weight = constants_->at(31);
- [[maybe_unused]] auto& mv2_features_4_conv_1_1_bias = constants_->at(32);
- [[maybe_unused]] auto& mv2_features_4_conv_2_weight = constants_->at(33);
- [[maybe_unused]] auto& mv2_features_4_conv_3_weight = constants_->at(34);
- [[maybe_unused]] auto& mv2_features_4_conv_3_bias = constants_->at(35);
- [[maybe_unused]] auto& mv2_features_5_conv_0_0_weight = constants_->at(36);
- [[maybe_unused]] auto& mv2_features_5_conv_0_1_weight = constants_->at(37);
- [[maybe_unused]] auto& mv2_features_5_conv_0_1_bias = constants_->at(38);
- [[maybe_unused]] auto& mv2_features_5_conv_1_0_weight = constants_->at(39);
- [[maybe_unused]] auto& mv2_features_5_conv_1_1_weight = constants_->at(40);
- [[maybe_unused]] auto& mv2_features_5_conv_1_1_bias = constants_->at(41);
- [[maybe_unused]] auto& mv2_features_5_conv_2_weight = constants_->at(42);
- [[maybe_unused]] auto& mv2_features_5_conv_3_weight = constants_->at(43);
- [[maybe_unused]] auto& mv2_features_5_conv_3_bias = constants_->at(44);
- [[maybe_unused]] auto& mv2_features_6_conv_0_0_weight = constants_->at(45);
- [[maybe_unused]] auto& mv2_features_6_conv_0_1_weight = constants_->at(46);
- [[maybe_unused]] auto& mv2_features_6_conv_0_1_bias = constants_->at(47);
- [[maybe_unused]] auto& mv2_features_6_conv_1_0_weight = constants_->at(48);
- [[maybe_unused]] auto& mv2_features_6_conv_1_1_weight = constants_->at(49);
- [[maybe_unused]] auto& mv2_features_6_conv_1_1_bias = constants_->at(50);
- [[maybe_unused]] auto& mv2_features_6_conv_2_weight = constants_->at(51);
- [[maybe_unused]] auto& mv2_features_6_conv_3_weight = constants_->at(52);
- [[maybe_unused]] auto& mv2_features_6_conv_3_bias = constants_->at(53);
- [[maybe_unused]] auto& mv2_features_7_conv_0_0_weight = constants_->at(54);
- [[maybe_unused]] auto& mv2_features_7_conv_0_1_weight = constants_->at(55);
- [[maybe_unused]] auto& mv2_features_7_conv_0_1_bias = constants_->at(56);
- [[maybe_unused]] auto& mv2_features_7_conv_1_0_weight = constants_->at(57);
- [[maybe_unused]] auto& mv2_features_7_conv_1_1_weight = constants_->at(58);
- [[maybe_unused]] auto& mv2_features_7_conv_1_1_bias = constants_->at(59);
- [[maybe_unused]] auto& mv2_features_7_conv_2_weight = constants_->at(60);
- [[maybe_unused]] auto& mv2_features_7_conv_3_weight = constants_->at(61);
- [[maybe_unused]] auto& mv2_features_7_conv_3_bias = constants_->at(62);
- [[maybe_unused]] auto& mv2_features_8_conv_0_0_weight = constants_->at(63);
- [[maybe_unused]] auto& mv2_features_8_conv_0_1_weight = constants_->at(64);
- [[maybe_unused]] auto& mv2_features_8_conv_0_1_bias = constants_->at(65);
- [[maybe_unused]] auto& mv2_features_8_conv_1_0_weight = constants_->at(66);
- [[maybe_unused]] auto& mv2_features_8_conv_1_1_weight = constants_->at(67);
- [[maybe_unused]] auto& mv2_features_8_conv_1_1_bias = constants_->at(68);
- [[maybe_unused]] auto& mv2_features_8_conv_2_weight = constants_->at(69);
- [[maybe_unused]] auto& mv2_features_8_conv_3_weight = constants_->at(70);
- [[maybe_unused]] auto& mv2_features_8_conv_3_bias = constants_->at(71);
- [[maybe_unused]] auto& mv2_features_9_conv_0_0_weight = constants_->at(72);
- [[maybe_unused]] auto& mv2_features_9_conv_0_1_weight = constants_->at(73);
- [[maybe_unused]] auto& mv2_features_9_conv_0_1_bias = constants_->at(74);
- [[maybe_unused]] auto& mv2_features_9_conv_1_0_weight = constants_->at(75);
- [[maybe_unused]] auto& mv2_features_9_conv_1_1_weight = constants_->at(76);
- [[maybe_unused]] auto& mv2_features_9_conv_1_1_bias = constants_->at(77);
- [[maybe_unused]] auto& mv2_features_9_conv_2_weight = constants_->at(78);
- [[maybe_unused]] auto& mv2_features_9_conv_3_weight = constants_->at(79);
- [[maybe_unused]] auto& mv2_features_9_conv_3_bias = constants_->at(80);
- [[maybe_unused]] auto& mv2_features_10_conv_0_0_weight = constants_->at(81);
- [[maybe_unused]] auto& mv2_features_10_conv_0_1_weight = constants_->at(82);
- [[maybe_unused]] auto& mv2_features_10_conv_0_1_bias = constants_->at(83);
- [[maybe_unused]] auto& mv2_features_10_conv_1_0_weight = constants_->at(84);
- [[maybe_unused]] auto& mv2_features_10_conv_1_1_weight = constants_->at(85);
- [[maybe_unused]] auto& mv2_features_10_conv_1_1_bias = constants_->at(86);
- [[maybe_unused]] auto& mv2_features_10_conv_2_weight = constants_->at(87);
- [[maybe_unused]] auto& mv2_features_10_conv_3_weight = constants_->at(88);
- [[maybe_unused]] auto& mv2_features_10_conv_3_bias = constants_->at(89);
- [[maybe_unused]] auto& mv2_features_11_conv_0_0_weight = constants_->at(90);
- [[maybe_unused]] auto& mv2_features_11_conv_0_1_weight = constants_->at(91);
- [[maybe_unused]] auto& mv2_features_11_conv_0_1_bias = constants_->at(92);
- [[maybe_unused]] auto& mv2_features_11_conv_1_0_weight = constants_->at(93);
- [[maybe_unused]] auto& mv2_features_11_conv_1_1_weight = constants_->at(94);
- [[maybe_unused]] auto& mv2_features_11_conv_1_1_bias = constants_->at(95);
- [[maybe_unused]] auto& mv2_features_11_conv_2_weight = constants_->at(96);
- [[maybe_unused]] auto& mv2_features_11_conv_3_weight = constants_->at(97);
- [[maybe_unused]] auto& mv2_features_11_conv_3_bias = constants_->at(98);
- [[maybe_unused]] auto& mv2_features_12_conv_0_0_weight = constants_->at(99);
- [[maybe_unused]] auto& mv2_features_12_conv_0_1_weight = constants_->at(100);
- [[maybe_unused]] auto& mv2_features_12_conv_0_1_bias = constants_->at(101);
- [[maybe_unused]] auto& mv2_features_12_conv_1_0_weight = constants_->at(102);
- [[maybe_unused]] auto& mv2_features_12_conv_1_1_weight = constants_->at(103);
- [[maybe_unused]] auto& mv2_features_12_conv_1_1_bias = constants_->at(104);
- [[maybe_unused]] auto& mv2_features_12_conv_2_weight = constants_->at(105);
- [[maybe_unused]] auto& mv2_features_12_conv_3_weight = constants_->at(106);
- [[maybe_unused]] auto& mv2_features_12_conv_3_bias = constants_->at(107);
- [[maybe_unused]] auto& mv2_features_13_conv_0_0_weight = constants_->at(108);
- [[maybe_unused]] auto& mv2_features_13_conv_0_1_weight = constants_->at(109);
- [[maybe_unused]] auto& mv2_features_13_conv_0_1_bias = constants_->at(110);
- [[maybe_unused]] auto& mv2_features_13_conv_1_0_weight = constants_->at(111);
- [[maybe_unused]] auto& mv2_features_13_conv_1_1_weight = constants_->at(112);
- [[maybe_unused]] auto& mv2_features_13_conv_1_1_bias = constants_->at(113);
- [[maybe_unused]] auto& mv2_features_13_conv_2_weight = constants_->at(114);
- [[maybe_unused]] auto& mv2_features_13_conv_3_weight = constants_->at(115);
- [[maybe_unused]] auto& mv2_features_13_conv_3_bias = constants_->at(116);
- [[maybe_unused]] auto& mv2_features_14_conv_0_0_weight = constants_->at(117);
- [[maybe_unused]] auto& mv2_features_14_conv_0_1_weight = constants_->at(118);
- [[maybe_unused]] auto& mv2_features_14_conv_0_1_bias = constants_->at(119);
- [[maybe_unused]] auto& mv2_features_14_conv_1_0_weight = constants_->at(120);
- [[maybe_unused]] auto& mv2_features_14_conv_1_1_weight = constants_->at(121);
- [[maybe_unused]] auto& mv2_features_14_conv_1_1_bias = constants_->at(122);
- [[maybe_unused]] auto& mv2_features_14_conv_2_weight = constants_->at(123);
- [[maybe_unused]] auto& mv2_features_14_conv_3_weight = constants_->at(124);
- [[maybe_unused]] auto& mv2_features_14_conv_3_bias = constants_->at(125);
- [[maybe_unused]] auto& mv2_features_15_conv_0_0_weight = constants_->at(126);
- [[maybe_unused]] auto& mv2_features_15_conv_0_1_weight = constants_->at(127);
- [[maybe_unused]] auto& mv2_features_15_conv_0_1_bias = constants_->at(128);
- [[maybe_unused]] auto& mv2_features_15_conv_1_0_weight = constants_->at(129);
- [[maybe_unused]] auto& mv2_features_15_conv_1_1_weight = constants_->at(130);
- [[maybe_unused]] auto& mv2_features_15_conv_1_1_bias = constants_->at(131);
- [[maybe_unused]] auto& mv2_features_15_conv_2_weight = constants_->at(132);
- [[maybe_unused]] auto& mv2_features_15_conv_3_weight = constants_->at(133);
- [[maybe_unused]] auto& mv2_features_15_conv_3_bias = constants_->at(134);
- [[maybe_unused]] auto& mv2_features_16_conv_0_0_weight = constants_->at(135);
- [[maybe_unused]] auto& mv2_features_16_conv_0_1_weight = constants_->at(136);
- [[maybe_unused]] auto& mv2_features_16_conv_0_1_bias = constants_->at(137);
- [[maybe_unused]] auto& mv2_features_16_conv_1_0_weight = constants_->at(138);
- [[maybe_unused]] auto& mv2_features_16_conv_1_1_weight = constants_->at(139);
- [[maybe_unused]] auto& mv2_features_16_conv_1_1_bias = constants_->at(140);
- [[maybe_unused]] auto& mv2_features_16_conv_2_weight = constants_->at(141);
- [[maybe_unused]] auto& mv2_features_16_conv_3_weight = constants_->at(142);
- [[maybe_unused]] auto& mv2_features_16_conv_3_bias = constants_->at(143);
- [[maybe_unused]] auto& mv2_features_17_conv_0_0_weight = constants_->at(144);
- [[maybe_unused]] auto& mv2_features_17_conv_0_1_weight = constants_->at(145);
- [[maybe_unused]] auto& mv2_features_17_conv_0_1_bias = constants_->at(146);
- [[maybe_unused]] auto& mv2_features_17_conv_1_0_weight = constants_->at(147);
- [[maybe_unused]] auto& mv2_features_17_conv_1_1_weight = constants_->at(148);
- [[maybe_unused]] auto& mv2_features_17_conv_1_1_bias = constants_->at(149);
- [[maybe_unused]] auto& mv2_features_17_conv_2_weight = constants_->at(150);
- [[maybe_unused]] auto& mv2_features_17_conv_3_weight = constants_->at(151);
- [[maybe_unused]] auto& mv2_features_17_conv_3_bias = constants_->at(152);
- [[maybe_unused]] auto& mv2_features_18_0_weight = constants_->at(153);
- [[maybe_unused]] auto& mv2_features_18_1_weight = constants_->at(154);
- [[maybe_unused]] auto& mv2_features_18_1_bias = constants_->at(155);
- [[maybe_unused]] auto& mv2_classifier_1_weight = constants_->at(156);
- [[maybe_unused]] auto& mv2_classifier_1_bias = constants_->at(157);
- [[maybe_unused]] auto& mv2_features_0_1_running_mean = constants_->at(158);
- [[maybe_unused]] auto& mv2_features_0_1_running_var = constants_->at(159);
- [[maybe_unused]] auto& mv2_features_1_conv_0_1_running_mean = constants_->at(160);
- [[maybe_unused]] auto& mv2_features_1_conv_0_1_running_var = constants_->at(161);
- [[maybe_unused]] auto& mv2_features_1_conv_2_running_mean = constants_->at(162);
- [[maybe_unused]] auto& mv2_features_1_conv_2_running_var = constants_->at(163);
- [[maybe_unused]] auto& mv2_features_2_conv_0_1_running_mean = constants_->at(164);
- [[maybe_unused]] auto& mv2_features_2_conv_0_1_running_var = constants_->at(165);
- [[maybe_unused]] auto& mv2_features_2_conv_1_1_running_mean = constants_->at(166);
- [[maybe_unused]] auto& mv2_features_2_conv_1_1_running_var = constants_->at(167);
- [[maybe_unused]] auto& mv2_features_2_conv_3_running_mean = constants_->at(168);
- [[maybe_unused]] auto& mv2_features_2_conv_3_running_var = constants_->at(169);
- [[maybe_unused]] auto& mv2_features_3_conv_0_1_running_mean = constants_->at(170);
- [[maybe_unused]] auto& mv2_features_3_conv_0_1_running_var = constants_->at(171);
- [[maybe_unused]] auto& mv2_features_3_conv_1_1_running_mean = constants_->at(172);
- [[maybe_unused]] auto& mv2_features_3_conv_1_1_running_var = constants_->at(173);
- [[maybe_unused]] auto& mv2_features_3_conv_3_running_mean = constants_->at(174);
- [[maybe_unused]] auto& mv2_features_3_conv_3_running_var = constants_->at(175);
- [[maybe_unused]] auto& mv2_features_4_conv_0_1_running_mean = constants_->at(176);
- [[maybe_unused]] auto& mv2_features_4_conv_0_1_running_var = constants_->at(177);
- [[maybe_unused]] auto& mv2_features_4_conv_1_1_running_mean = constants_->at(178);
- [[maybe_unused]] auto& mv2_features_4_conv_1_1_running_var = constants_->at(179);
- [[maybe_unused]] auto& mv2_features_4_conv_3_running_mean = constants_->at(180);
- [[maybe_unused]] auto& mv2_features_4_conv_3_running_var = constants_->at(181);
- [[maybe_unused]] auto& mv2_features_5_conv_0_1_running_mean = constants_->at(182);
- [[maybe_unused]] auto& mv2_features_5_conv_0_1_running_var = constants_->at(183);
- [[maybe_unused]] auto& mv2_features_5_conv_1_1_running_mean = constants_->at(184);
- [[maybe_unused]] auto& mv2_features_5_conv_1_1_running_var = constants_->at(185);
- [[maybe_unused]] auto& mv2_features_5_conv_3_running_mean = constants_->at(186);
- [[maybe_unused]] auto& mv2_features_5_conv_3_running_var = constants_->at(187);
- [[maybe_unused]] auto& mv2_features_6_conv_0_1_running_mean = constants_->at(188);
- [[maybe_unused]] auto& mv2_features_6_conv_0_1_running_var = constants_->at(189);
- [[maybe_unused]] auto& mv2_features_6_conv_1_1_running_mean = constants_->at(190);
- [[maybe_unused]] auto& mv2_features_6_conv_1_1_running_var = constants_->at(191);
- [[maybe_unused]] auto& mv2_features_6_conv_3_running_mean = constants_->at(192);
- [[maybe_unused]] auto& mv2_features_6_conv_3_running_var = constants_->at(193);
- [[maybe_unused]] auto& mv2_features_7_conv_0_1_running_mean = constants_->at(194);
- [[maybe_unused]] auto& mv2_features_7_conv_0_1_running_var = constants_->at(195);
- [[maybe_unused]] auto& mv2_features_7_conv_1_1_running_mean = constants_->at(196);
- [[maybe_unused]] auto& mv2_features_7_conv_1_1_running_var = constants_->at(197);
- [[maybe_unused]] auto& mv2_features_7_conv_3_running_mean = constants_->at(198);
- [[maybe_unused]] auto& mv2_features_7_conv_3_running_var = constants_->at(199);
- [[maybe_unused]] auto& mv2_features_8_conv_0_1_running_mean = constants_->at(200);
- [[maybe_unused]] auto& mv2_features_8_conv_0_1_running_var = constants_->at(201);
- [[maybe_unused]] auto& mv2_features_8_conv_1_1_running_mean = constants_->at(202);
- [[maybe_unused]] auto& mv2_features_8_conv_1_1_running_var = constants_->at(203);
- [[maybe_unused]] auto& mv2_features_8_conv_3_running_mean = constants_->at(204);
- [[maybe_unused]] auto& mv2_features_8_conv_3_running_var = constants_->at(205);
- [[maybe_unused]] auto& mv2_features_9_conv_0_1_running_mean = constants_->at(206);
- [[maybe_unused]] auto& mv2_features_9_conv_0_1_running_var = constants_->at(207);
- [[maybe_unused]] auto& mv2_features_9_conv_1_1_running_mean = constants_->at(208);
- [[maybe_unused]] auto& mv2_features_9_conv_1_1_running_var = constants_->at(209);
- [[maybe_unused]] auto& mv2_features_9_conv_3_running_mean = constants_->at(210);
- [[maybe_unused]] auto& mv2_features_9_conv_3_running_var = constants_->at(211);
- [[maybe_unused]] auto& mv2_features_10_conv_0_1_running_mean = constants_->at(212);
- [[maybe_unused]] auto& mv2_features_10_conv_0_1_running_var = constants_->at(213);
- [[maybe_unused]] auto& mv2_features_10_conv_1_1_running_mean = constants_->at(214);
- [[maybe_unused]] auto& mv2_features_10_conv_1_1_running_var = constants_->at(215);
- [[maybe_unused]] auto& mv2_features_10_conv_3_running_mean = constants_->at(216);
- [[maybe_unused]] auto& mv2_features_10_conv_3_running_var = constants_->at(217);
- [[maybe_unused]] auto& mv2_features_11_conv_0_1_running_mean = constants_->at(218);
- [[maybe_unused]] auto& mv2_features_11_conv_0_1_running_var = constants_->at(219);
- [[maybe_unused]] auto& mv2_features_11_conv_1_1_running_mean = constants_->at(220);
- [[maybe_unused]] auto& mv2_features_11_conv_1_1_running_var = constants_->at(221);
- [[maybe_unused]] auto& mv2_features_11_conv_3_running_mean = constants_->at(222);
- [[maybe_unused]] auto& mv2_features_11_conv_3_running_var = constants_->at(223);
- [[maybe_unused]] auto& mv2_features_12_conv_0_1_running_mean = constants_->at(224);
- [[maybe_unused]] auto& mv2_features_12_conv_0_1_running_var = constants_->at(225);
- [[maybe_unused]] auto& mv2_features_12_conv_1_1_running_mean = constants_->at(226);
- [[maybe_unused]] auto& mv2_features_12_conv_1_1_running_var = constants_->at(227);
- [[maybe_unused]] auto& mv2_features_12_conv_3_running_mean = constants_->at(228);
- [[maybe_unused]] auto& mv2_features_12_conv_3_running_var = constants_->at(229);
- [[maybe_unused]] auto& mv2_features_13_conv_0_1_running_mean = constants_->at(230);
- [[maybe_unused]] auto& mv2_features_13_conv_0_1_running_var = constants_->at(231);
- [[maybe_unused]] auto& mv2_features_13_conv_1_1_running_mean = constants_->at(232);
- [[maybe_unused]] auto& mv2_features_13_conv_1_1_running_var = constants_->at(233);
- [[maybe_unused]] auto& mv2_features_13_conv_3_running_mean = constants_->at(234);
- [[maybe_unused]] auto& mv2_features_13_conv_3_running_var = constants_->at(235);
- [[maybe_unused]] auto& mv2_features_14_conv_0_1_running_mean = constants_->at(236);
- [[maybe_unused]] auto& mv2_features_14_conv_0_1_running_var = constants_->at(237);
- [[maybe_unused]] auto& mv2_features_14_conv_1_1_running_mean = constants_->at(238);
- [[maybe_unused]] auto& mv2_features_14_conv_1_1_running_var = constants_->at(239);
- [[maybe_unused]] auto& mv2_features_14_conv_3_running_mean = constants_->at(240);
- [[maybe_unused]] auto& mv2_features_14_conv_3_running_var = constants_->at(241);
- [[maybe_unused]] auto& mv2_features_15_conv_0_1_running_mean = constants_->at(242);
- [[maybe_unused]] auto& mv2_features_15_conv_0_1_running_var = constants_->at(243);
- [[maybe_unused]] auto& mv2_features_15_conv_1_1_running_mean = constants_->at(244);
- [[maybe_unused]] auto& mv2_features_15_conv_1_1_running_var = constants_->at(245);
- [[maybe_unused]] auto& mv2_features_15_conv_3_running_mean = constants_->at(246);
- [[maybe_unused]] auto& mv2_features_15_conv_3_running_var = constants_->at(247);
- [[maybe_unused]] auto& mv2_features_16_conv_0_1_running_mean = constants_->at(248);
- [[maybe_unused]] auto& mv2_features_16_conv_0_1_running_var = constants_->at(249);
- [[maybe_unused]] auto& mv2_features_16_conv_1_1_running_mean = constants_->at(250);
- [[maybe_unused]] auto& mv2_features_16_conv_1_1_running_var = constants_->at(251);
- [[maybe_unused]] auto& mv2_features_16_conv_3_running_mean = constants_->at(252);
- [[maybe_unused]] auto& mv2_features_16_conv_3_running_var = constants_->at(253);
- [[maybe_unused]] auto& mv2_features_17_conv_0_1_running_mean = constants_->at(254);
- [[maybe_unused]] auto& mv2_features_17_conv_0_1_running_var = constants_->at(255);
- [[maybe_unused]] auto& mv2_features_17_conv_1_1_running_mean = constants_->at(256);
- [[maybe_unused]] auto& mv2_features_17_conv_1_1_running_var = constants_->at(257);
- [[maybe_unused]] auto& mv2_features_17_conv_3_running_mean = constants_->at(258);
- [[maybe_unused]] auto& mv2_features_17_conv_3_running_var = constants_->at(259);
- [[maybe_unused]] auto& mv2_features_18_1_running_mean = constants_->at(260);
- [[maybe_unused]] auto& mv2_features_18_1_running_var = constants_->at(261);
-
- if ((long(arg262_1.data_ptr()) & (16 -1)) != 0) {
- AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit.");
- AtenTensorHandle arg262_1_aligned;
- aoti_torch_clone_preserve_strides(arg262_1, &arg262_1_aligned);
- arg262_1 = std::move(RAIIAtenTensorHandle(arg262_1_aligned));
- }
- inputs.clear();
- [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
-
- AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
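`run_impl` also re-checks the 16-byte alignment the kernels were compiled to assume; a misaligned input is cloned (strides preserved) instead of rejected, trading one copy for correctness. The test is just a pointer mask, as in this sketch:

    #include <cstdint>

    // True when p meets the 16-byte alignment the compiled kernels assume.
    static bool is_16_byte_aligned(const void* p) {
        return (reinterpret_cast<std::uintptr_t>(p) & (16 - 1)) == 0;
    }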
- static constexpr int64_t int_array_0[] = {1L, 3L, 224L, 224L};
- static constexpr int64_t int_array_1[] = {150528L, 1L, 672L, 3L};
- AtenTensorHandle buf0_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
- RAIIAtenTensorHandle buf0(buf0_handle);
- // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
- call_triton_poi_fused_convolution_0(arg262_1, buf0, 3L, 50176L, this->device_idx_, stream, kernels, this->cubin_dir_);
- arg262_1.reset();
- static constexpr int64_t int_array_2[] = {32L, 3L, 3L, 3L};
- static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L};
- AtenTensorHandle buf1_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle));
- RAIIAtenTensorHandle buf1(buf1_handle);
- // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
- call_triton_poi_fused_convolution_1(mv2_features_0_0_weight, buf1, 96L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
- AtenTensorHandle buf2_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle));
- RAIIAtenTensorHandle buf2(buf2_handle);
- buf0.reset();
- buf1.reset();
- auto buf3 = std::move(buf2); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default, aten_hardtanh_default], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(buf3, mv2_features_0_1_running_mean, mv2_features_0_1_running_var, mv2_features_0_1_weight, mv2_features_0_1_bias, 401408L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default, aten_hardtanh_default, aten_convolution_default_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf4_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf3, mv2_features_1_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 32L, &buf4_handle));
- RAIIAtenTensorHandle buf4(buf4_handle);
- buf3.reset();
- auto buf5 = std::move(buf4); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_1, aten_hardtanh_default_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_2(buf5, mv2_features_1_conv_0_1_running_mean, mv2_features_1_conv_0_1_running_var, mv2_features_1_conv_0_1_weight, mv2_features_1_conv_0_1_bias, 401408L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_1, aten_hardtanh_default_1, aten_convolution_default_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf6_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf5, mv2_features_1_conv_1_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf6_handle));
- RAIIAtenTensorHandle buf6(buf6_handle);
- buf5.reset();
- auto buf7 = std::move(buf6); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_2], Original ATen: [aten._native_batch_norm_legit_no_training]
- call_triton_poi_fused__native_batch_norm_legit_no_training_3(buf7, mv2_features_1_conv_2_running_mean, mv2_features_1_conv_2_running_var, mv2_features_1_conv_2_weight, mv2_features_1_conv_2_bias, 200704L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_2, aten_convolution_default_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.convolution]
- AtenTensorHandle buf8_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf7, mv2_features_2_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf8_handle));
- RAIIAtenTensorHandle buf8(buf8_handle);
- buf7.reset();
- auto buf9 = std::move(buf8); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_4(buf9, mv2_features_2_conv_0_1_running_mean, mv2_features_2_conv_0_1_running_var, mv2_features_2_conv_0_1_weight, mv2_features_2_conv_0_1_bias, 1204224L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_3, aten_hardtanh_default_2, aten_convolution_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf10_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf9, mv2_features_2_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 96L, &buf10_handle));
- RAIIAtenTensorHandle buf10(buf10_handle);
- buf9.reset();
- auto buf11 = std::move(buf10); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_4, aten_hardtanh_default_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_5(buf11, mv2_features_2_conv_1_1_running_mean, mv2_features_2_conv_1_1_running_var, mv2_features_2_conv_1_1_weight, mv2_features_2_conv_1_1_bias, 301056L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_4, aten_hardtanh_default_3, aten_convolution_default_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf12_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf11, mv2_features_2_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf12_handle));
- RAIIAtenTensorHandle buf12(buf12_handle);
- buf11.reset();
- auto buf13 = std::move(buf12); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_5], Original ATen: [aten._native_batch_norm_legit_no_training]
- call_triton_poi_fused__native_batch_norm_legit_no_training_6(buf13, mv2_features_2_conv_3_running_mean, mv2_features_2_conv_3_running_var, mv2_features_2_conv_3_weight, mv2_features_2_conv_3_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten_convolution_default_6], Original ATen: [aten.convolution]
- AtenTensorHandle buf14_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf13, mv2_features_3_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf14_handle));
- RAIIAtenTensorHandle buf14(buf14_handle);
- auto buf15 = std::move(buf14); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_6, aten_hardtanh_default_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf15, mv2_features_3_conv_0_1_running_mean, mv2_features_3_conv_0_1_running_var, mv2_features_3_conv_0_1_weight, mv2_features_3_conv_0_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_6, aten_hardtanh_default_4, aten_convolution_default_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf16_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf15, mv2_features_3_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 144L, &buf16_handle));
- RAIIAtenTensorHandle buf16(buf16_handle);
- buf15.reset();
- auto buf17 = std::move(buf16); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_7, aten_hardtanh_default_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf17, mv2_features_3_conv_1_1_running_mean, mv2_features_3_conv_1_1_running_var, mv2_features_3_conv_1_1_weight, mv2_features_3_conv_1_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_7, aten_hardtanh_default_5, aten_convolution_default_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf18_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf17, mv2_features_3_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf18_handle));
- RAIIAtenTensorHandle buf18(buf18_handle);
- buf17.reset();
- auto buf19 = std::move(buf13); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_8, aten_add_tensor], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add]
- call_triton_poi_fused__native_batch_norm_legit_no_training_add_8(buf19, buf18, mv2_features_3_conv_3_running_mean, mv2_features_3_conv_3_running_var, mv2_features_3_conv_3_weight, mv2_features_3_conv_3_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_);
- buf18.reset();
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_8, aten_add_tensor, aten_convolution_default_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution]
- AtenTensorHandle buf20_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf19, mv2_features_4_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf20_handle));
- RAIIAtenTensorHandle buf20(buf20_handle);
- buf19.reset();
- auto buf21 = std::move(buf20); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_9, aten_hardtanh_default_6], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_7(buf21, mv2_features_4_conv_0_1_running_mean, mv2_features_4_conv_0_1_running_var, mv2_features_4_conv_0_1_weight, mv2_features_4_conv_0_1_bias, 451584L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_9, aten_hardtanh_default_6, aten_convolution_default_10], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf22_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf21, mv2_features_4_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 144L, &buf22_handle));
- RAIIAtenTensorHandle buf22(buf22_handle);
- buf21.reset();
- auto buf23 = std::move(buf22); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_10, aten_hardtanh_default_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_9(buf23, mv2_features_4_conv_1_1_running_mean, mv2_features_4_conv_1_1_running_var, mv2_features_4_conv_1_1_weight, mv2_features_4_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_10, aten_hardtanh_default_7, aten_convolution_default_11], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf24_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf23, mv2_features_4_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf24_handle));
- RAIIAtenTensorHandle buf24(buf24_handle);
- buf23.reset();
- auto buf25 = std::move(buf24); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_11], Original ATen: [aten._native_batch_norm_legit_no_training]
- call_triton_poi_fused__native_batch_norm_legit_no_training_10(buf25, mv2_features_4_conv_3_running_mean, mv2_features_4_conv_3_running_var, mv2_features_4_conv_3_weight, mv2_features_4_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten_convolution_default_12], Original ATen: [aten.convolution]
- AtenTensorHandle buf26_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf25, mv2_features_5_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf26_handle));
- RAIIAtenTensorHandle buf26(buf26_handle);
- auto buf27 = std::move(buf26); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_12, aten_hardtanh_default_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf27, mv2_features_5_conv_0_1_running_mean, mv2_features_5_conv_0_1_running_var, mv2_features_5_conv_0_1_weight, mv2_features_5_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_12, aten_hardtanh_default_8, aten_convolution_default_13], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf28_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf27, mv2_features_5_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf28_handle));
- RAIIAtenTensorHandle buf28(buf28_handle);
- buf27.reset();
- auto buf29 = std::move(buf28); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_13, aten_hardtanh_default_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf29, mv2_features_5_conv_1_1_running_mean, mv2_features_5_conv_1_1_running_var, mv2_features_5_conv_1_1_weight, mv2_features_5_conv_1_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_13, aten_hardtanh_default_9, aten_convolution_default_14], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf30_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf29, mv2_features_5_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf30_handle));
- RAIIAtenTensorHandle buf30(buf30_handle);
- buf29.reset();
- auto buf31 = std::move(buf25); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_14, aten_add_tensor_1], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add]
- call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(buf31, buf30, mv2_features_5_conv_3_running_mean, mv2_features_5_conv_3_running_var, mv2_features_5_conv_3_weight, mv2_features_5_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_);
- buf30.reset();
- // Topologically Sorted Source Nodes: [aten_convolution_default_15], Original ATen: [aten.convolution]
- AtenTensorHandle buf32_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf31, mv2_features_6_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf32_handle));
- RAIIAtenTensorHandle buf32(buf32_handle);
- auto buf33 = std::move(buf32); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_15, aten_hardtanh_default_10], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf33, mv2_features_6_conv_0_1_running_mean, mv2_features_6_conv_0_1_running_var, mv2_features_6_conv_0_1_weight, mv2_features_6_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_15, aten_hardtanh_default_10, aten_convolution_default_16], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf34_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf33, mv2_features_6_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf34_handle));
- RAIIAtenTensorHandle buf34(buf34_handle);
- buf33.reset();
- auto buf35 = std::move(buf34); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_16, aten_hardtanh_default_11], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf35, mv2_features_6_conv_1_1_running_mean, mv2_features_6_conv_1_1_running_var, mv2_features_6_conv_1_1_weight, mv2_features_6_conv_1_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_16, aten_hardtanh_default_11, aten_convolution_default_17], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf36_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf35, mv2_features_6_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf36_handle));
- RAIIAtenTensorHandle buf36(buf36_handle);
- buf35.reset();
- auto buf37 = std::move(buf31); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_17, aten_add_tensor_2], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add]
- call_triton_poi_fused__native_batch_norm_legit_no_training_add_12(buf37, buf36, mv2_features_6_conv_3_running_mean, mv2_features_6_conv_3_running_var, mv2_features_6_conv_3_weight, mv2_features_6_conv_3_bias, 25088L, this->device_idx_, stream, kernels, this->cubin_dir_);
- buf36.reset();
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_17, aten_add_tensor_2, aten_convolution_default_18], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution]
- AtenTensorHandle buf38_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf37, mv2_features_7_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf38_handle));
- RAIIAtenTensorHandle buf38(buf38_handle);
- buf37.reset();
- auto buf39 = std::move(buf38); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_18, aten_hardtanh_default_12], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_11(buf39, mv2_features_7_conv_0_1_running_mean, mv2_features_7_conv_0_1_running_var, mv2_features_7_conv_0_1_weight, mv2_features_7_conv_0_1_bias, 150528L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_18, aten_hardtanh_default_12, aten_convolution_default_19], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf40_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf39, mv2_features_7_conv_1_0_weight, nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 192L, &buf40_handle));
- RAIIAtenTensorHandle buf40(buf40_handle);
- buf39.reset();
- auto buf41 = std::move(buf40); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_19, aten_hardtanh_default_13], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_13(buf41, mv2_features_7_conv_1_1_running_mean, mv2_features_7_conv_1_1_running_var, mv2_features_7_conv_1_1_weight, mv2_features_7_conv_1_1_bias, 37632L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_19, aten_hardtanh_default_13, aten_convolution_default_20], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf42_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf41, mv2_features_7_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf42_handle));
- RAIIAtenTensorHandle buf42(buf42_handle);
- buf41.reset();
- auto buf43 = std::move(buf42); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_20], Original ATen: [aten._native_batch_norm_legit_no_training]
- call_triton_poi_fused__native_batch_norm_legit_no_training_14(buf43, mv2_features_7_conv_3_running_mean, mv2_features_7_conv_3_running_var, mv2_features_7_conv_3_weight, mv2_features_7_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten_convolution_default_21], Original ATen: [aten.convolution]
- AtenTensorHandle buf44_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf43, mv2_features_8_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf44_handle));
- RAIIAtenTensorHandle buf44(buf44_handle);
- auto buf45 = std::move(buf44); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_21, aten_hardtanh_default_14], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf45, mv2_features_8_conv_0_1_running_mean, mv2_features_8_conv_0_1_running_var, mv2_features_8_conv_0_1_weight, mv2_features_8_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_21, aten_hardtanh_default_14, aten_convolution_default_22], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf46_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf45, mv2_features_8_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf46_handle));
- RAIIAtenTensorHandle buf46(buf46_handle);
- buf45.reset();
- auto buf47 = std::move(buf46); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_22, aten_hardtanh_default_15], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh]
- call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf47, mv2_features_8_conv_1_1_running_mean, mv2_features_8_conv_1_1_running_var, mv2_features_8_conv_1_1_weight, mv2_features_8_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_);
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_22, aten_hardtanh_default_15, aten_convolution_default_23], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
- AtenTensorHandle buf48_handle;
- AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf47, mv2_features_8_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf48_handle));
- RAIIAtenTensorHandle buf48(buf48_handle);
- buf47.reset();
- auto buf49 = std::move(buf43); // reuse
- // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_23, aten_add_tensor_3], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add]
- call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf49, buf48, mv2_features_8_conv_3_running_mean, mv2_features_8_conv_3_running_var, 
mv2_features_8_conv_3_weight, mv2_features_8_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf48.reset(); - // Topologically Sorted Source Nodes: [aten_convolution_default_24], Original ATen: [aten.convolution] - AtenTensorHandle buf50_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf49, mv2_features_9_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf50_handle)); - RAIIAtenTensorHandle buf50(buf50_handle); - auto buf51 = std::move(buf50); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_24, aten_hardtanh_default_16], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf51, mv2_features_9_conv_0_1_running_mean, mv2_features_9_conv_0_1_running_var, mv2_features_9_conv_0_1_weight, mv2_features_9_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_24, aten_hardtanh_default_16, aten_convolution_default_25], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf52_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf51, mv2_features_9_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf52_handle)); - RAIIAtenTensorHandle buf52(buf52_handle); - buf51.reset(); - auto buf53 = std::move(buf52); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_25, aten_hardtanh_default_17], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf53, mv2_features_9_conv_1_1_running_mean, mv2_features_9_conv_1_1_running_var, mv2_features_9_conv_1_1_weight, mv2_features_9_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_25, aten_hardtanh_default_17, aten_convolution_default_26], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf54_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf53, mv2_features_9_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf54_handle)); - RAIIAtenTensorHandle buf54(buf54_handle); - buf53.reset(); - auto buf55 = std::move(buf49); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_26, aten_add_tensor_4], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf55, buf54, mv2_features_9_conv_3_running_mean, mv2_features_9_conv_3_running_var, mv2_features_9_conv_3_weight, mv2_features_9_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf54.reset(); - // Topologically Sorted Source Nodes: [aten_convolution_default_27], Original ATen: [aten.convolution] - AtenTensorHandle buf56_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf55, 
mv2_features_10_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf56_handle)); - RAIIAtenTensorHandle buf56(buf56_handle); - auto buf57 = std::move(buf56); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_27, aten_hardtanh_default_18], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf57, mv2_features_10_conv_0_1_running_mean, mv2_features_10_conv_0_1_running_var, mv2_features_10_conv_0_1_weight, mv2_features_10_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_27, aten_hardtanh_default_18, aten_convolution_default_28], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf58_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf57, mv2_features_10_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf58_handle)); - RAIIAtenTensorHandle buf58(buf58_handle); - buf57.reset(); - auto buf59 = std::move(buf58); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_28, aten_hardtanh_default_19], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf59, mv2_features_10_conv_1_1_running_mean, mv2_features_10_conv_1_1_running_var, mv2_features_10_conv_1_1_weight, mv2_features_10_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_28, aten_hardtanh_default_19, aten_convolution_default_29], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf60_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf59, mv2_features_10_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf60_handle)); - RAIIAtenTensorHandle buf60(buf60_handle); - buf59.reset(); - auto buf61 = std::move(buf55); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_29, aten_add_tensor_5], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_16(buf61, buf60, mv2_features_10_conv_3_running_mean, mv2_features_10_conv_3_running_var, mv2_features_10_conv_3_weight, mv2_features_10_conv_3_bias, 12544L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf60.reset(); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_29, aten_add_tensor_5, aten_convolution_default_30], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] - AtenTensorHandle buf62_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf61, mv2_features_11_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf62_handle)); - RAIIAtenTensorHandle 
buf62(buf62_handle); - buf61.reset(); - auto buf63 = std::move(buf62); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_30, aten_hardtanh_default_20], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf63, mv2_features_11_conv_0_1_running_mean, mv2_features_11_conv_0_1_running_var, mv2_features_11_conv_0_1_weight, mv2_features_11_conv_0_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_30, aten_hardtanh_default_20, aten_convolution_default_31], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf64_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf63, mv2_features_11_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 384L, &buf64_handle)); - RAIIAtenTensorHandle buf64(buf64_handle); - buf63.reset(); - auto buf65 = std::move(buf64); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_31, aten_hardtanh_default_21], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_15(buf65, mv2_features_11_conv_1_1_running_mean, mv2_features_11_conv_1_1_running_var, mv2_features_11_conv_1_1_weight, mv2_features_11_conv_1_1_bias, 75264L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_31, aten_hardtanh_default_21, aten_convolution_default_32], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf66_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf65, mv2_features_11_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf66_handle)); - RAIIAtenTensorHandle buf66(buf66_handle); - buf65.reset(); - auto buf67 = std::move(buf66); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_32], Original ATen: [aten._native_batch_norm_legit_no_training] - call_triton_poi_fused__native_batch_norm_legit_no_training_17(buf67, mv2_features_11_conv_3_running_mean, mv2_features_11_conv_3_running_var, mv2_features_11_conv_3_weight, mv2_features_11_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten_convolution_default_33], Original ATen: [aten.convolution] - AtenTensorHandle buf68_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf67, mv2_features_12_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf68_handle)); - RAIIAtenTensorHandle buf68(buf68_handle); - auto buf69 = std::move(buf68); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_33, aten_hardtanh_default_22], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf69, mv2_features_12_conv_0_1_running_mean, 
mv2_features_12_conv_0_1_running_var, mv2_features_12_conv_0_1_weight, mv2_features_12_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_33, aten_hardtanh_default_22, aten_convolution_default_34], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf70_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf69, mv2_features_12_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf70_handle)); - RAIIAtenTensorHandle buf70(buf70_handle); - buf69.reset(); - auto buf71 = std::move(buf70); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_34, aten_hardtanh_default_23], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf71, mv2_features_12_conv_1_1_running_mean, mv2_features_12_conv_1_1_running_var, mv2_features_12_conv_1_1_weight, mv2_features_12_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_34, aten_hardtanh_default_23, aten_convolution_default_35], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf72_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf71, mv2_features_12_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf72_handle)); - RAIIAtenTensorHandle buf72(buf72_handle); - buf71.reset(); - auto buf73 = std::move(buf67); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_35, aten_add_tensor_6], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_19(buf73, buf72, mv2_features_12_conv_3_running_mean, mv2_features_12_conv_3_running_var, mv2_features_12_conv_3_weight, mv2_features_12_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf72.reset(); - // Topologically Sorted Source Nodes: [aten_convolution_default_36], Original ATen: [aten.convolution] - AtenTensorHandle buf74_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf73, mv2_features_13_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf74_handle)); - RAIIAtenTensorHandle buf74(buf74_handle); - auto buf75 = std::move(buf74); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_36, aten_hardtanh_default_24], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf75, mv2_features_13_conv_0_1_running_mean, mv2_features_13_conv_0_1_running_var, mv2_features_13_conv_0_1_weight, mv2_features_13_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_36, aten_hardtanh_default_24, aten_convolution_default_37], Original ATen: 
[aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf76_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf75, mv2_features_13_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf76_handle)); - RAIIAtenTensorHandle buf76(buf76_handle); - buf75.reset(); - auto buf77 = std::move(buf76); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_37, aten_hardtanh_default_25], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf77, mv2_features_13_conv_1_1_running_mean, mv2_features_13_conv_1_1_running_var, mv2_features_13_conv_1_1_weight, mv2_features_13_conv_1_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_37, aten_hardtanh_default_25, aten_convolution_default_38], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf78_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf77, mv2_features_13_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf78_handle)); - RAIIAtenTensorHandle buf78(buf78_handle); - buf77.reset(); - auto buf79 = std::move(buf73); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_38, aten_add_tensor_7], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_19(buf79, buf78, mv2_features_13_conv_3_running_mean, mv2_features_13_conv_3_running_var, mv2_features_13_conv_3_weight, mv2_features_13_conv_3_bias, 18816L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf78.reset(); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_38, aten_add_tensor_7, aten_convolution_default_39], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] - AtenTensorHandle buf80_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf79, mv2_features_14_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf80_handle)); - RAIIAtenTensorHandle buf80(buf80_handle); - buf79.reset(); - auto buf81 = std::move(buf80); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_39, aten_hardtanh_default_26], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_18(buf81, mv2_features_14_conv_0_1_running_mean, mv2_features_14_conv_0_1_running_var, mv2_features_14_conv_0_1_weight, mv2_features_14_conv_0_1_bias, 112896L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_39, aten_hardtanh_default_26, aten_convolution_default_40], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf82_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf81, mv2_features_14_conv_1_0_weight, 
nullptr, std::array{2L, 2L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 576L, &buf82_handle)); - RAIIAtenTensorHandle buf82(buf82_handle); - buf81.reset(); - auto buf83 = std::move(buf82); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_40, aten_hardtanh_default_27], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_20(buf83, mv2_features_14_conv_1_1_running_mean, mv2_features_14_conv_1_1_running_var, mv2_features_14_conv_1_1_weight, mv2_features_14_conv_1_1_bias, 28224L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_40, aten_hardtanh_default_27, aten_convolution_default_41], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf84_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf83, mv2_features_14_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf84_handle)); - RAIIAtenTensorHandle buf84(buf84_handle); - buf83.reset(); - auto buf85 = std::move(buf84); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_41], Original ATen: [aten._native_batch_norm_legit_no_training] - call_triton_poi_fused__native_batch_norm_legit_no_training_21(buf85, mv2_features_14_conv_3_running_mean, mv2_features_14_conv_3_running_var, mv2_features_14_conv_3_weight, mv2_features_14_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten_convolution_default_42], Original ATen: [aten.convolution] - AtenTensorHandle buf86_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf85, mv2_features_15_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf86_handle)); - RAIIAtenTensorHandle buf86(buf86_handle); - auto buf87 = std::move(buf86); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_42, aten_hardtanh_default_28], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf87, mv2_features_15_conv_0_1_running_mean, mv2_features_15_conv_0_1_running_var, mv2_features_15_conv_0_1_weight, mv2_features_15_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_42, aten_hardtanh_default_28, aten_convolution_default_43], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf88_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf87, mv2_features_15_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf88_handle)); - RAIIAtenTensorHandle buf88(buf88_handle); - buf87.reset(); - auto buf89 = std::move(buf88); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_43, aten_hardtanh_default_29], Original ATen: 
[aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf89, mv2_features_15_conv_1_1_running_mean, mv2_features_15_conv_1_1_running_var, mv2_features_15_conv_1_1_weight, mv2_features_15_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_43, aten_hardtanh_default_29, aten_convolution_default_44], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf90_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf89, mv2_features_15_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf90_handle)); - RAIIAtenTensorHandle buf90(buf90_handle); - buf89.reset(); - auto buf91 = std::move(buf85); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_44, aten_add_tensor_8], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(buf91, buf90, mv2_features_15_conv_3_running_mean, mv2_features_15_conv_3_running_var, mv2_features_15_conv_3_weight, mv2_features_15_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf90.reset(); - // Topologically Sorted Source Nodes: [aten_convolution_default_45], Original ATen: [aten.convolution] - AtenTensorHandle buf92_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf91, mv2_features_16_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf92_handle)); - RAIIAtenTensorHandle buf92(buf92_handle); - auto buf93 = std::move(buf92); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_45, aten_hardtanh_default_30], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf93, mv2_features_16_conv_0_1_running_mean, mv2_features_16_conv_0_1_running_var, mv2_features_16_conv_0_1_weight, mv2_features_16_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_45, aten_hardtanh_default_30, aten_convolution_default_46], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf94_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf93, mv2_features_16_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf94_handle)); - RAIIAtenTensorHandle buf94(buf94_handle); - buf93.reset(); - auto buf95 = std::move(buf94); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_46, aten_hardtanh_default_31], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf95, mv2_features_16_conv_1_1_running_mean, mv2_features_16_conv_1_1_running_var, mv2_features_16_conv_1_1_weight, mv2_features_16_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // 
Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_46, aten_hardtanh_default_31, aten_convolution_default_47], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf96_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf95, mv2_features_16_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf96_handle)); - RAIIAtenTensorHandle buf96(buf96_handle); - buf95.reset(); - auto buf97 = std::move(buf91); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_47, aten_add_tensor_9], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add] - call_triton_poi_fused__native_batch_norm_legit_no_training_add_23(buf97, buf96, mv2_features_16_conv_3_running_mean, mv2_features_16_conv_3_running_var, mv2_features_16_conv_3_weight, mv2_features_16_conv_3_bias, 7840L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf96.reset(); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_47, aten_add_tensor_9, aten_convolution_default_48], Original ATen: [aten._native_batch_norm_legit_no_training, aten.add, aten.convolution] - AtenTensorHandle buf98_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf97, mv2_features_17_conv_0_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf98_handle)); - RAIIAtenTensorHandle buf98(buf98_handle); - buf97.reset(); - auto buf99 = std::move(buf98); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_48, aten_hardtanh_default_32], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf99, mv2_features_17_conv_0_1_running_mean, mv2_features_17_conv_0_1_running_var, mv2_features_17_conv_0_1_weight, mv2_features_17_conv_0_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_48, aten_hardtanh_default_32, aten_convolution_default_49], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution] - AtenTensorHandle buf100_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf99, mv2_features_17_conv_1_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 960L, &buf100_handle)); - RAIIAtenTensorHandle buf100(buf100_handle); - buf99.reset(); - auto buf101 = std::move(buf100); // reuse - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh] - call_triton_poi_fused__native_batch_norm_legit_no_training_hardtanh_22(buf101, mv2_features_17_conv_1_1_running_mean, mv2_features_17_conv_1_1_running_var, mv2_features_17_conv_1_1_weight, mv2_features_17_conv_1_1_bias, 47040L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33, aten_convolution_default_50], Original ATen: [aten._native_batch_norm_legit_no_training, 
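[Editor's note: every inverted-residual block deleted above follows the same generated pattern: a convolution through the libtorch C shim, a fused batch-norm/hardtanh Triton kernel applied in place, then storage recycling via RAII handles and std::move. A minimal hand-written sketch of that pattern, with hypothetical buf_in/weight names and assuming the AOTI runtime headers that declare AtenTensorHandle, RAIIAtenTensorHandle, and aoti_torch_cuda_convolution are in scope:

    // Sketch only; mirrors the codegen pattern, not part of this patch.
    AtenTensorHandle out_handle;                    // raw handle filled by the C shim
    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(
        buf_in, weight, /*bias=*/nullptr,
        std::array{1L, 1L}.cbegin(), 2,             // stride
        std::array{0L, 0L}.cbegin(), 2,             // padding
        std::array{1L, 1L}.cbegin(), 2,             // dilation
        /*transposed=*/0, std::array{0L, 0L}.cbegin(), 2,
        /*groups=*/1L, &out_handle));
    RAIIAtenTensorHandle buf_out(out_handle);       // frees the tensor on scope exit
    buf_in.reset();                                 // drop the conv input immediately
    auto buf_next = std::move(buf_out);             // reuse storage for the in-place fused kernel
]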
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_49, aten_hardtanh_default_33, aten_convolution_default_50], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.convolution]
-    AtenTensorHandle buf102_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf101, mv2_features_17_conv_2_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf102_handle));
-    RAIIAtenTensorHandle buf102(buf102_handle);
-    buf101.reset();
-    auto buf103 = std::move(buf102); // reuse
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_50], Original ATen: [aten._native_batch_norm_legit_no_training]
-    call_triton_poi_fused__native_batch_norm_legit_no_training_24(buf103, mv2_features_17_conv_3_running_mean, mv2_features_17_conv_3_running_var, mv2_features_17_conv_3_weight, mv2_features_17_conv_3_bias, 15680L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_50, aten_convolution_default_51], Original ATen: [aten._native_batch_norm_legit_no_training, aten.convolution]
-    AtenTensorHandle buf104_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf103, mv2_features_18_0_weight, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{0L, 0L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf104_handle));
-    RAIIAtenTensorHandle buf104(buf104_handle);
-    buf103.reset();
-    static constexpr int64_t int_array_4[] = {1L, 1280L, 1L, 1L};
-    static constexpr int64_t int_array_5[] = {1280L, 1L, 1280L, 1280L};
-    AtenTensorHandle buf105_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf105_handle));
-    RAIIAtenTensorHandle buf105(buf105_handle);
-    static constexpr int64_t int_array_6[] = {1280L, 1L, 1L, 1L};
-    auto buf106 = wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(buf105, 4, int_array_4, int_array_6, 0L)); buf105.reset(); // reuse
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_51, aten_hardtanh_default_34, aten_mean_dim], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.mean]
-    call_triton_per_fused__native_batch_norm_legit_no_training_hardtanh_mean_25(buf106, buf104, mv2_features_18_1_running_mean, mv2_features_18_1_running_var, mv2_features_18_1_weight, mv2_features_18_1_bias, 1280L, 49L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    buf104.reset();
-    static constexpr int64_t int_array_7[] = {1280L, 1000L};
-    static constexpr int64_t int_array_8[] = {1L, 1280L};
-    AtenTensorHandle buf107_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_7, int_array_8, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf107_handle));
-    RAIIAtenTensorHandle buf107(buf107_handle);
-    // Topologically Sorted Source Nodes: [aten_permute_copy_default], Original ATen: [aten.permute_copy]
-    call_triton_poi_fused_permute_copy_26(mv2_classifier_1_weight, buf107, 1280000L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    static constexpr int64_t int_array_9[] = {1L, 1000L};
-    static constexpr int64_t int_array_10[] = {1000L, 1L};
-    AtenTensorHandle buf108_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(2, int_array_9, int_array_10, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf108_handle));
-    RAIIAtenTensorHandle buf108(buf108_handle);
-    // Topologically Sorted Source Nodes: [aten__native_batch_norm_legit_no_training_default_51, aten_hardtanh_default_34, aten_mean_dim, aten_view_copy_default, aten_permute_copy_default, aten_addmm_default], Original ATen: [aten._native_batch_norm_legit_no_training, aten.hardtanh, aten.mean, aten.view_copy, aten.permute_copy, aten.addmm]
-    static constexpr int64_t int_array_11[] = {0L, 1L};
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_addmm_out(buf108, mv2_classifier_1_bias, wrap_with_raii_handle_if_needed(reinterpret_tensor_wrapper(buf106, 2, int_array_8, int_array_11, 0L)), buf107, 1L, 1L));
-    buf106.reset();
-    buf107.reset();
-    output_handles[0] = buf108.release();
-} // AOTInductorModel::run_impl
-} // namespace torch::aot_inductor
-
-
-
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper.o /home/gasoonjia/executorch/c5cna3r6nfys2tflf6chfc3l6l6rv4a3am2yslkkhyp4e7oaf7ej.kernel.o /home/gasoonjia/executorch/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7/clbguuj2vb7nlf7qm72hrkynyiorwc3udkaj656f3v5xcdaoib67.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
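[Editor's note: a minimal sketch of how a host program could drive the aoti.so produced by the link command above, using the container C API that the deleted wrapper defines. Assumes a single-input/single-output model, that `input` is an AtenTensorHandle created through the same aoti_torch_* shim, and that the AOTI runtime interface header providing these declarations is included:

    // Sketch only, not part of this patch.
    AOTInductorModelContainerHandle container = nullptr;
    if (AOTInductorModelContainerCreateWithDevice(
            &container, /*num_models=*/1, "cuda", /*cubin_dir=*/nullptr) !=
        AOTI_RUNTIME_SUCCESS) {
      return; // creation failed; error already printed to stderr
    }
    AtenTensorHandle inputs[1] = {input};    // input handles are stolen by the call
    AtenTensorHandle outputs[1] = {nullptr}; // filled with a handle the caller owns
    AOTInductorModelContainerRun(
        container, inputs, 1, outputs, 1,
        /*stream_handle=*/nullptr, /*proxy_executor_handle=*/nullptr);
    AOTInductorModelContainerDelete(container);
]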
diff --git a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json b/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/ce5v7wqyagkbtsmdw5kshpjd2t6vrjvl6ndtpaca5r3ct3piucq7.wrapper_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin b/cedahkafk34ku7ldx6xjj5g7kdphvxc3vywwrxoqogx6xqos4uft.cubin
deleted file mode 100644
index 5098c505ebb138fa361a0971a8c7d89086e12f3b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11320
[11320 bytes of base85-encoded cubin payload omitted: binary build artifact deleted by this patch]

diff --git a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
deleted file mode 100644
index 90c865f5f5e..00000000000
--- a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
+++ /dev/null
@@ -1,965 +0,0 @@
-
-#include
-// Definition of AOTI runtime interface functions
-
-#include
-#include
-
-#include
-#include
-
-#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)      \
-  try {                                           \
-    __VA_ARGS__                                   \
-  } catch (const std::exception& e) {             \
-    std::cerr << "Error: " << e.what() << '\n';   \
-    return AOTI_RUNTIME_FAILURE;                  \
-  } catch (...) {                                 \
-    std::cerr << "Unknown exception occurred.\n"; \
-    return AOTI_RUNTIME_FAILURE;                  \
-  }                                               \
-  return AOTI_RUNTIME_SUCCESS;
-
-#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name)  \
-  do {                                                            \
-    AOTI_RUNTIME_CHECK(                                           \
-        actual_size == expected_size,                             \
-        "expected " + std::string(name) + " vector size to be " + \
-            std::to_string(expected_size) + ", but got " +        \
-            std::to_string(actual_size));                         \
-  } while (0)
-
-// AOTInductor uses at::addmm_out, which doesn't supports
-// arguments that requires gradient. For this reason, we
-// enforce no_grad context for run APIs.
-//
-// A RAII, thread local (!) guard that enables or disables grad mode upon
-// construction, and sets it back to the original value upon destruction.
-struct AOTINoGradGuard {
-  AOTINoGradGuard() {
-    aoti_torch_grad_mode_set_enabled(false);
-  }
-  AOTINoGradGuard(const AOTINoGradGuard&) = delete;
-  AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete;
-  ~AOTINoGradGuard() {
-    aoti_torch_grad_mode_set_enabled(prev_mode);
-  }
-  AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete;
-  AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete;
-  bool prev_mode{aoti_torch_grad_mode_is_enabled()};
-};
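[Editor's note: the guard plus the CONVERT_EXCEPTION_TO_ERROR_CODE macro are what turn C++ exceptions into the flat AOTIRuntimeError codes used across this C ABI. A trimmed illustration of how the two compose, with a hypothetical do_work():

    // Sketch only, not part of this patch.
    AOTIRuntimeError example_entry_point() {
      CONVERT_EXCEPTION_TO_ERROR_CODE({
        AOTINoGradGuard guard; // autograd disabled for the duration of the call
        do_work();             // any throw here becomes AOTI_RUNTIME_FAILURE
      })
    }
]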
"cpu" : "cuda", - cubin_dir); -} - -AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir) { - if (num_models == 0) { - std::cerr << "Error: num_models must be positive, but got 0\n"; - return AOTI_RUNTIME_FAILURE; - } - CONVERT_EXCEPTION_TO_ERROR_CODE({ - std::optional cubin_dir_opt; - if (cubin_dir != nullptr) { - cubin_dir_opt.emplace(cubin_dir); - } - auto* container = new torch::aot_inductor::AOTInductorModelContainer( - num_models, std::string(device_str), cubin_dir_opt); - *container_handle = - reinterpret_cast(container); - }) -} - -AOTIRuntimeError AOTInductorModelContainerDelete( - AOTInductorModelContainerHandle container_handle) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto* container = - reinterpret_cast( - container_handle); - delete container; - }); -} - -AOTIRuntimeError AOTInductorModelContainerRun( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_single_threaded( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumConstants( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *num_constants = container->num_constants(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantName( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** name) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *name = container->constant_name(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** 
    original_fqn) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *original_fqn = container->constant_original_fqn(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded(
-    AOTInductorModelContainerHandle container_handle,
-    size_t idx,
-    bool* from_folded) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetConstantType(
-    AOTInductorModelContainerHandle container_handle,
-    size_t idx,
-    int32_t* type) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetConstantDtype(
-    AOTInductorModelContainerHandle container_handle,
-    size_t idx,
-    int32_t* dtype) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *dtype = container->constant_dtype(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize(
-    AOTInductorModelContainerHandle container_handle,
-    size_t idx,
-    size_t* data_size) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *data_size = container->constant_data_size(idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle,
-    bool use_inactive) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto constants_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { const auto ret = container->extract_constants_map(use_inactive);
-        for (const auto& pair: ret) {
-          constants_map->emplace(pair.first, pair.second);
-        }
-      })
-}
-
-AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle,
-    bool use_inactive,
-    bool validate_full_update) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->update_constant_buffer(
-        *input_map, use_inactive, validate_full_update, /* user_managed = */ true);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle,
-    bool use_inactive,
-    bool validate_full_update) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->update_constant_buffer(
-        *input_map, use_inactive, validate_full_update);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle) {
-  return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
-                                                       constant_map_handle,
-                                                       /*use_inactive*/ true,
-                                                       /*validate_full_update*/ true);
-}
-
-AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
-    AOTInductorModelContainerHandle container_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->free_inactive_constant_buffer();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
-    AOTInductorModelContainerHandle container_handle,
-    bool use_inactive,
-    AOTInductorStreamHandle stream_handle,
-    AOTIProxyExecutorHandle proxy_executor_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto stream =
-      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    AOTINoGradGuard guard;
-    container->run_const_fold(use_inactive, stream, proxy_executor_handle);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
-    AOTInductorModelContainerHandle container_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->swap_constant_buffer();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
-    AOTInductorModelContainerHandle container_handle,
-    size_t* ret_num_inputs) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_num_inputs = container->num_inputs(); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetInputName(
-    AOTInductorModelContainerHandle container_handle,
-    size_t input_idx,
-    const char** ret_input_names) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_input_names = container->input_name(input_idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
-    AOTInductorModelContainerHandle container_handle,
-    size_t* ret_num_outputs) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_num_outputs = container->num_outputs(); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetOutputName(
-    AOTInductorModelContainerHandle container_handle,
-    size_t output_idx,
-    const char** ret_output_names) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_output_names = container->output_name(output_idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
-    AOTInductorModelContainerHandle container_handle,
-    const char** in_spec,
-    const char** out_spec) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    *in_spec = container->get_in_spec();
-    *out_spec = container->get_out_spec();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelCreate(
-    AOTInductorModelHandle* model_handle,
-    AOTInductorConstantMapHandle constant_map_handle){
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
-    auto constant_array = std::make_shared<std::vector<torch::aot_inductor::ConstantHandle>>();
-    auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-
-    auto model = new torch::aot_inductor::AOTInductorModel(
-        constant_map,
-        constant_array,
-        "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only used for CPU models
-        ""
-    );
-
-    if (input_map) {
-      for (auto const& kv : *input_map) {
-        constant_map->emplace(kv.first, kv.second);
-      }
-    } else {
-      model->load_constants();
-    }
-
-    *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
-  })}
-
-AOTIRuntimeError AOTInductorModelRun(
-    AOTInductorModelHandle model_handle,
-    AtenTensorHandle* input_handles,
-    AtenTensorHandle* output_handles) {
-  auto model =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    AOTINoGradGuard guard;
-    model->run_impl(
-        input_handles,
-        output_handles,
-        (torch::aot_inductor::DeviceStreamType) nullptr,
-        nullptr);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto model =
-        reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
-            model_handle);
-    delete model;
-  })}
-
-AOTIRuntimeError AOTInductorModelGetNumOutputs(
-    AOTInductorModelHandle model_handle,
-    size_t* ret_num_outputs) {
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-    *ret_num_outputs = model->num_outputs();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
-    AOTInductorModelHandle model_handle,
-    AOTInductorConstantMapHandle constant_map_handle) {
-  auto model =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
-    auto input_map =
-        reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
-            constant_map_handle);
-
-    for (auto const& kv : *input_map) {
-      constant_map->emplace(kv.first, kv.second);
-    }
-    model->update_constants_map(std::move(constant_map));
-  })
-}
-
-} // extern "C"
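A minimal, self-contained sketch (not the AOTI runtime's actual definition) of the CONVERT_EXCEPTION_TO_ERROR_CODE pattern the extern "C" entry points above rely on: every C++ exception is caught at the C ABI boundary and mapped to an error code, since exceptions must never propagate across extern "C". The enum values and entry-point name here are illustrative stand-ins.

#include <exception>
#include <iostream>

enum AOTIRuntimeError {
  AOTI_RUNTIME_SUCCESS = 0,  // stand-in value
  AOTI_RUNTIME_FAILURE = 1,  // stand-in value
};

// Wrap a statement block: run it, return SUCCESS, or translate any throw into FAILURE.
#define CONVERT_EXCEPTION_TO_ERROR_CODE(...)        \
  try {                                             \
    __VA_ARGS__                                     \
    return AOTI_RUNTIME_SUCCESS;                    \
  } catch (const std::exception& e) {               \
    std::cerr << "Error: " << e.what() << '\n';     \
    return AOTI_RUNTIME_FAILURE;                    \
  } catch (...) {                                   \
    std::cerr << "Unknown exception occurred.\n";   \
    return AOTI_RUNTIME_FAILURE;                    \
  }

// Hypothetical entry point showing the usage shape of the real functions above.
extern "C" AOTIRuntimeError example_entry_point(int* out) {
  CONVERT_EXCEPTION_TO_ERROR_CODE({ *out = 42; })
}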
-
-
-#define CUDA_DRIVER_CHECK(EXPR)                      \
-do {                                                 \
-    CUresult code = EXPR;                            \
-    const char *msg;                                 \
-    CUresult code_get_error = cuGetErrorString(code, &msg); \
-    if (code_get_error != CUDA_SUCCESS) {            \
-        throw std::runtime_error(                    \
-            std::string("CUDA driver error: ") +     \
-            std::string("invalid error code!"));     \
-    }                                                \
-    if (code != CUDA_SUCCESS) {                      \
-        throw std::runtime_error(                    \
-            std::string("CUDA driver error: ") +     \
-            std::string(msg));                       \
-    }                                                \
-} while (0);
-
-static inline CUfunction loadKernel(
-    std::string filePath,
-    const std::string &funcName,
-    uint32_t sharedMemBytes,
-    const std::optional<std::string> &cubinDir = std::nullopt) {
-  if (cubinDir) {
-    std::filesystem::path p1{*cubinDir};
-    std::filesystem::path p2{filePath};
-    filePath = (p1 / p2.filename()).string();
-  }
-
-  CUmodule mod;
-  CUfunction func;
-  CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
-  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
-  if (sharedMemBytes > 0) {
-    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        sharedMemBytes
-    ))
-  }
-  return func;
-}
-
-static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
-  CUmodule mod;
-  CUfunction func;
-  CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start));
-  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
-  if (sharedMemBytes > 0) {
-    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        sharedMemBytes
-    ))
-  }
-  return func;
-}
-
-static inline void launchKernel(
-    CUfunction func,
-    uint32_t gridX,
-    uint32_t gridY,
-    uint32_t gridZ,
-    uint32_t numWarps,
-    uint32_t sharedMemBytes,
-    void* args[],
-    cudaStream_t stream) {
-  CUDA_DRIVER_CHECK(cuLaunchKernel(
-      func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
-  ));
-}
-CACHE_TORCH_DTYPE(float32);
-CACHE_TORCH_DEVICE(cuda);
-CACHE_TORCH_LAYOUT(strided);
-namespace torch::aot_inductor {
-namespace {
-class AOTInductorModelKernels : public AOTInductorModelKernelsBase {
-  public:
-    CUfunction triton_poi_fused_convolution_0{nullptr};
-    CUfunction triton_poi_fused_convolution_1{nullptr};
-    CUfunction triton_poi_fused_convolution_2{nullptr};
-};
-}  // namespace
-
-
-
-AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
-                                   std::shared_ptr<std::vector<ConstantHandle>> constants_array,
-                                   const std::string& device_str,
-                                   std::optional<std::string> cubin_dir)
-    : AOTInductorModelBase(1,
-                           1,
-                           1,
-                           device_str,
-                           std::move(cubin_dir),
-                           true) {
-  inputs_info_[0].name = "arg2_1";
-  constants_info_[0].name = "conv_weight";
-  constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
-  constants_info_[0].offset = 0;
-  constants_info_[0].data_size = 540;
-  constants_info_[0].from_folded = false;
-  constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
-  constants_info_[0].shape = {5, 3, 3, 3};
-  constants_info_[0].stride = {27, 9, 3, 1};
-  constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided);
-  constants_info_[0].original_fqn = "conv.weight";
-  update_constants_map(std::move(constants_map));
-  update_constants_array(std::move(constants_array));
-  in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
-  out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])";
-  outputs_info_[0].name = "output0";
-  this->kernels_ = std::make_unique<AOTInductorModelKernels>();
-}
-
-std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
-    DeviceStreamType stream,
-    AOTIProxyExecutorHandle proxy_executor,
-    bool initialization
-) {
-
-  if (!initialization) {
-    std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
-              << "aot_inductor.use_runtime_constant_folding=False\n";
-  }
-  return {};
-}
-} // namespace torch::aot_inductor
-using namespace torch::aot_inductor;
-
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_0(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_0', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 12
-        xnumel = 64
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x2 = xindex
-        y3 = yindex
-        y0 = (yindex % 3)
-        y1 = yindex // 3
-        tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last')
-        tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (64 - 1)) / (64));
-    uint32_t grid_1 = ((ynumel + (16 - 1)) / (16));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_0 == nullptr) {
-        kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_);
-    }
-    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_2 = ynumel;
-    int var_3 = xnumel;
-    CUdeviceptr global_scratch_4 = 0;
-    void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4};
-    launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_);
-}
-
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_1(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_1', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 15
-        xnumel = 9
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x2 = xindex
-        y3 = yindex
-        y0 = (yindex % 3)
-        y1 = yindex // 3
-        tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last')
-        tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (16 - 1)) / (16));
-    uint32_t grid_1 = ((ynumel + (16 - 1)) / (16));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_1 == nullptr) {
-        kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_);
-    }
-    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_6 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_7 = ynumel;
-    int var_8 = xnumel;
-    CUdeviceptr global_scratch_9 = 0;
-    void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9};
-    launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_);
-}
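The launchers above all size their grids with the same ceil-division idiom, grid = (numel + BLOCK - 1) / BLOCK, so every element is covered even when numel is not a multiple of the block size. A small standalone check (not from the patch) of the values the two launchers above would produce:

#include <cstdint>

// Ceil-division used by the generated launchers: number of blocks needed to cover numel.
static constexpr uint32_t ceil_div(uint32_t numel, uint32_t block) {
  return (numel + block - 1) / block;
}

int main() {
  // call_triton_poi_fused_convolution_0: xnumel=64 with XBLOCK=64, ynumel=12 with YBLOCK=16.
  static_assert(ceil_div(64, 64) == 1, "grid_0 for kernel 0");
  static_assert(ceil_div(12, 16) == 1, "grid_1 for kernel 0");
  // call_triton_poi_fused_convolution_1: xnumel=9 with XBLOCK=16, ynumel=15 with YBLOCK=16.
  static_assert(ceil_div(9, 16) == 1, "grid_0 for kernel 1");
  static_assert(ceil_div(15, 16) == 1, "grid_1 for kernel 1");
  return 0;
}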
-
-template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
-static inline void call_triton_poi_fused_convolution_2(
-    const in_ptr0_type_& in_ptr0,
-    const out_ptr0_type_& out_ptr0,
-    int64_t ynumel,
-    int64_t xnumel,
-    int32_t device_idx_,
-    cudaStream_t stream_,
-    kernels_type_& kernels_,
-    const std::optional<std::string>& cubin_dir_ = std::nullopt
-){
-    /*
-    async_compile.triton('triton_poi_fused_convolution_2', '''
-    import triton
-    import triton.language as tl
-
-    from torch._inductor.runtime import triton_helpers, triton_heuristics
-    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
-    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
-    triton_helpers.set_driver_to_gpu()
-
-    @triton_heuristics.pointwise(
-        size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE,
-        filename=__file__,
-        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
-        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}},
-        min_elem_per_thread=0
-    )
-    @triton.jit
-    def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
-        ynumel = 20
-        xnumel = 64
-        yoffset = tl.program_id(1) * YBLOCK
-        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
-        ymask = yindex < ynumel
-        xoffset = tl.program_id(0) * XBLOCK
-        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
-        xmask = xindex < xnumel
-        x2 = xindex
-        y0 = (yindex % 5)
-        y1 = yindex // 5
-        y3 = yindex
-        tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last')
-        tmp1 = y0
-        tmp2 = tl.full([1, 1], 2, tl.int64)
-        tmp3 = tmp1 < tmp2
-        tmp4 = tl.full([1, 1], 1, tl.int64)
-        tmp5 = tmp1 < tmp4
-        tmp6 = -0.0312186349183321
-        tmp7 = -0.18273277580738068
-        tmp8 = tl.where(tmp5, tmp6, tmp7)
-        tmp9 = tl.full([1, 1], 3, tl.int64)
-        tmp10 = tmp1 < tmp9
-        tmp11 = tl.full([1, 1], 4, tl.int64)
-        tmp12 = tmp1 < tmp11
-        tmp13 = -0.12337345629930496
-        tmp14 = 0.12138354778289795
-        tmp15 = tl.where(tmp12, tmp13, tmp14)
-        tmp16 = 0.05455135554075241
-        tmp17 = tl.where(tmp10, tmp16, tmp15)
-        tmp18 = tl.where(tmp3, tmp8, tmp17)
-        tmp19 = tmp0 + tmp18
-        tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask)
-    ''', device_str='cuda')
-    */
-    uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
-    uint32_t grid_1 = ((ynumel + (32 - 1)) / (32));
-    uint32_t grid_2 = 1;
-    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
-    if (kernels_.triton_poi_fused_convolution_2 == nullptr) {
-        kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_);
-    }
-    CUdeviceptr var_10 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
-    CUdeviceptr var_11 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
-    int var_12 = ynumel;
-    int var_13 = xnumel;
-    CUdeviceptr global_scratch_14 = 0;
-    void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14};
-    launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_);
-}
-
-namespace torch::aot_inductor {
-
-void AOTInductorModel::_const_run_impl(
-    std::vector<AtenTensorHandle>& output_handles,
-    DeviceStreamType stream,
-    AOTIProxyExecutorHandle proxy_executor
-) {}
-
-AOTI_NOINLINE static void check_input_0(
-    AtenTensorHandle* input_handles
-) {
-    ConstantHandle arg2_1 = ConstantHandle(input_handles[0]);
-    int32_t arg2_1_dtype;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype));
-
-    int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32();
-    if (arg2_1_expected_dtype != arg2_1_dtype) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dtype, "
-           << "expected: " << arg2_1_expected_dtype << "(at::kFloat), "
-           << "but got: " << arg2_1_dtype << "\n";
-        throw std::runtime_error(ss.str());
-    }
-    auto arg2_1_size = arg2_1.sizes();
-
-    if (4 != arg2_1_size[0]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dim value at 0, "
-           << "expected: 4, " << "but got: " << arg2_1_size[0]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (3 != arg2_1_size[1]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dim value at 1, "
-           << "expected: 3, " << "but got: " << arg2_1_size[1]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (8 != arg2_1_size[2]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dim value at 2, "
-           << "expected: 8, " << "but got: " << arg2_1_size[2]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (8 != arg2_1_size[3]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched dim value at 3, "
-           << "expected: 8, " << "but got: " << arg2_1_size[3]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-    auto arg2_1_stride = arg2_1.strides();
-
-    if (192 != arg2_1_stride[0]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched stride value at 0, "
-           << "expected: 192, " << "but got: " << arg2_1_stride[0]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (64 != arg2_1_stride[1]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched stride value at 1, "
-           << "expected: 64, " << "but got: " << arg2_1_stride[1]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (8 != arg2_1_stride[2]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched stride value at 2, "
-           << "expected: 8, " << "but got: " << arg2_1_stride[2]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-
-    if (1 != arg2_1_stride[3]) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched stride value at 3, "
-           << "expected: 1, " << "but got: " << arg2_1_stride[3]
-           << "\n";
-        throw std::runtime_error(ss.str());
-    }
-    int32_t arg2_1_device_type;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type));
-
-    int32_t arg2_1_expected_device_type = 1;
-    if (arg2_1_expected_device_type != arg2_1_device_type) {
-        std::stringstream ss;
-        ss << "input_handles[0]: unmatched device type, "
-           << "expected: " << arg2_1_expected_device_type << "(cuda), "
-           << "but got: " << arg2_1_device_type << "\n";
-        throw std::runtime_error(ss.str());
-    }
-}
-
-static bool _check_aoti_runtime_check_inputs_env() {
-    const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS");
-    const static bool result = env_var_value != nullptr && env_var_value[0] != '0';
-    return result;
-}
-
-AOTI_NOINLINE static void __check_inputs_outputs(
-    AtenTensorHandle* input_handles,
-    AtenTensorHandle* output_handles) {
-    if (!_check_aoti_runtime_check_inputs_env()){
-        return;
-    }
-    check_input_0(input_handles);
-}
-
-void AOTInductorModel::run_impl(
-    AtenTensorHandle*
-        input_handles, // array of input AtenTensorHandle; handles
-                       // are stolen; the array itself is borrowed
-    AtenTensorHandle*
-        output_handles, // array for writing output AtenTensorHandle; handles
-                        // will be stolen by the caller; the array itself is
-                        // borrowed
-    DeviceStreamType stream,
-    AOTIProxyExecutorHandle proxy_executor
-) {
-    __check_inputs_outputs(input_handles, output_handles);
-
-    auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1);
-    auto arg2_1 = std::move(inputs[0]);
-    [[maybe_unused]] auto& conv_weight = constants_->at(0);
-
-    if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) {
-        AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. Copying to an aligned tensor to guarantee correctness, but expect a performance hit.");
-        AtenTensorHandle arg2_1_aligned;
-        aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned);
-        arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned));
-    }
-    inputs.clear();
-    [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
-
-    AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
-    static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L};
-    static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L};
-    AtenTensorHandle buf0_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
-    RAIIAtenTensorHandle buf0(buf0_handle);
-    // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
-    call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    arg2_1.reset();
-    static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L};
-    static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L};
-    AtenTensorHandle buf1_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle));
-    RAIIAtenTensorHandle buf1(buf1_handle);
-    // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
-    call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
-    AtenTensorHandle buf2_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf2_handle));
-    RAIIAtenTensorHandle buf2(buf2_handle);
-    buf0.reset();
-    buf1.reset();
-    static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L};
-    static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L};
-    AtenTensorHandle buf3_handle;
-    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle));
-    RAIIAtenTensorHandle buf3(buf3_handle);
-    // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
-    call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_);
-    buf2.reset();
-    output_handles[0] = buf3.release();
-} // AOTInductorModel::run_impl
-} // namespace torch::aot_inductor
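run_impl above guards a compile-time alignment assumption at run time: input 0 was compiled as 16-byte aligned, so a misaligned tensor is cloned into an aligned buffer rather than risking a misaligned vectorized access. A minimal sketch of the same check, assuming nothing beyond the standard library (uintptr_t is the portable spelling of the pointer-to-integer cast used above):

#include <cstdint>

// True when ptr is aligned to `alignment` bytes; alignment must be a power of two.
static inline bool is_aligned(const void* ptr, std::uintptr_t alignment) {
  return (reinterpret_cast<std::uintptr_t>(ptr) & (alignment - 1)) == 0;
}

// Usage mirroring the wrapper: if (!is_aligned(data_ptr, 16)) { /* clone to an aligned tensor */ }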
-
-
-
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf/c2axxg3k6hizo5jukgeoinhgbqdavmur6jy4bqwkwu6iqb3x3hb2.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json b/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin b/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
deleted file mode 100644
index 000ca4c1209b77cdaec3c8757e532677b79ccc0f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8968
[base85-encoded cubin payload omitted]
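For reference, a hedged sketch of how a standalone program might consume the linked aoti.so through the model-level C entry points shown above (AOTInductorModelCreate / AOTInductorModelDelete). Handle types are opaque pointers here; real callers include the AOTI runtime headers, and a CUDA model like this one is normally driven through the container API instead, so treat this purely as an illustration of the symbol-loading mechanics. Error handling is minimal.

#include <dlfcn.h>
#include <cstdio>

using AOTIRuntimeError = int;        // stand-in for the real error enum
using AOTInductorModelHandle = void*;
using AOTInductorConstantMapHandle = void*;

int main() {
  void* lib = dlopen("./aoti.so", RTLD_NOW | RTLD_LOCAL);
  if (!lib) { std::fprintf(stderr, "dlopen: %s\n", dlerror()); return 1; }

  auto create = reinterpret_cast<AOTIRuntimeError (*)(
      AOTInductorModelHandle*, AOTInductorConstantMapHandle)>(
      dlsym(lib, "AOTInductorModelCreate"));
  auto destroy = reinterpret_cast<AOTIRuntimeError (*)(AOTInductorModelHandle)>(
      dlsym(lib, "AOTInductorModelDelete"));
  if (!create || !destroy) { std::fprintf(stderr, "missing symbol\n"); return 1; }

  AOTInductorModelHandle model = nullptr;
  // A null constant map makes the model load its own constants (see
  // AOTInductorModelCreate above); note that entry point hardcodes device "cpu".
  if (create(&model, nullptr) != 0) { std::fprintf(stderr, "create failed\n"); return 1; }
  destroy(model);
  dlclose(lib);
  return 0;
}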
diff --git a/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin b/cguqxqtxyno4btxkugwlps3lbm56okihdtohl53vad3fobxqjmuc.cubin
deleted file mode 100644
index 88b88a29bf7f3c8af0026294261be1288801c901..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11320
[base85-encoded cubin payload omitted]

diff --git a/ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin b/ck3qqmftbtrn5fy62j3h5u27se3qnktzxry4gir7xw4kwvpbmfut.cubin
deleted file mode 100644
index cd3b21f44c86f0181124fbc89ca5c63a37d6c9ea..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13832
[base85-encoded cubin payload omitted]

diff --git a/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin b/cluvzszdtr4ykyrpkxlp2moyesdw57fomp6qblpztzjs77ltlqpm.cubin
deleted file mode 100644
index e8cdf9d03a89109e7d9e93424697209d3422b085..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 13240
[base85-encoded cubin payload omitted]
diff --git a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
deleted file mode 100644
index f283030cd98..00000000000
--- a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// Triton kernels are embedded as comments in /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.cpp
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf.wrapper.o /home/gasoonjia/executorch/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel.o /home/gasoonjia/executorch/cfopb6sps4l7mnj7js6b7hf6edzvuowrx5fugystcn2wozmmxwaf/c2axxg3k6hizo5jukgeoinhgbqdavmur6jy4bqwkwu6iqb3x3hb2.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json b/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/cmrb62ufziylboaeypntxlgevb45gi5uw6cqsbpvx756nf43k7mq.kernel_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin b/cmwzm6zpgnuflon4ux22vbg463wrhvpwsjsryjid3yzwslq5jy6j.cubin
deleted file mode 100644
index d2228db77f98247a3ac53ee64ff6864a708d7d4f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 9528
[base85-encoded cubin payload omitted]
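The kernel.cpp stub deleted above is intentionally empty of device code: the Triton kernels live in the .cubin files, which the wrapper loads by path at run time via cuModuleLoad. The driver API equally supports embedding the image in the host binary and loading it from memory, which is what the second loadKernel overload earlier in this diff does. A minimal sketch of that variant, assuming a CUDA context already exists and that `image` points at a valid cubin/fatbin blob (both names are placeholders):

#include <cuda.h>
#include <stdexcept>
#include <string>

static CUfunction loadKernelFromMemory(const void* image, const std::string& name) {
  CUmodule mod;
  CUfunction func;
  // cuModuleLoadData reads the module image from memory instead of a file path.
  if (cuModuleLoadData(&mod, image) != CUDA_SUCCESS)
    throw std::runtime_error("cuModuleLoadData failed");
  if (cuModuleGetFunction(&func, mod, name.c_str()) != CUDA_SUCCESS)
    throw std::runtime_error("cuModuleGetFunction failed: " + name);
  return func;
}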
diff --git a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
deleted file mode 100644
index bbe94294805..00000000000
--- a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp
+++ /dev/null
@@ -1,6 +0,0 @@
-// Triton kernels are embedded as comments in /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.cpp
-
-// Compile cmd
-// g++ /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms.wrapper.o /home/gasoonjia/executorch/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel.o /home/gasoonjia/executorch/ccycbuuk3dge36oeokrv5t2cslsgxpcukmcvkgx3hnjvysymcgms/clxvzwn2a5v7ypw7eq6fysn2555bpqqp3ckvq4a6v5o6aba2rxov.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json b/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/cmysbw6incvor2r6jhqnsksxjislopx4gsgpsu6fl2igjbgwzdn6.kernel_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin b/crikv76bp356w3xfrsl6v7yjgadifnrrfofduf4qs74u5yah7y3u.cubin
deleted file mode 100644
index 9b7c06c6f791df59d0650fd339ed10f850f64651..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 9528
[base85-encoded cubin payload omitted]
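Both metadata files deleted here record only the device key; whether the generated run_impl validates its inputs is decided at run time instead, through the AOTI_RUNTIME_CHECK_INPUTS environment variable read by _check_aoti_runtime_check_inputs_env earlier in this diff. The idiom there is worth noting: the getenv result and the derived flag are cached in function-local statics, so after the first call the per-invocation cost is a single branch. A standalone sketch of the same pattern (variable name kept from the source; everything else illustrative):

#include <cstdlib>

static bool runtime_input_checks_enabled() {
  // Evaluated once on first call; later calls just read the cached bool.
  static const char* env = std::getenv("AOTI_RUNTIME_CHECK_INPUTS");
  static const bool enabled = env != nullptr && env[0] != '0';
  return enabled;
}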
iIjenAb5f3DRQu5-Mk>u}PRen7Dv#ZADz9o*a{mB2)l@wI diff --git a/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin b/csitc2tbez7ytfakpudstbhsobm3wlczsly46p5oeax43spr3eab.cubin deleted file mode 100644 index 6e21efafc59f39c347cd5fc3fb16d696fc03742f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 21056 zcmeHPZEPIJd7isHl6NFUQnD?Pl58z)u~87^cza(YV>ec!*hw2Za^e&X;-u#zc_hyv zkLSG;a}+I`R*b+dir8&|A}NX>4T`oY`XdGUBQ1hPMV-Jk+BE2o!f5{ls*oD7B#@S3 zyNw(6dES}b+ubAW#-W9x@Y>wnnfG&^dFP#(-CI3&*FAT~W3gP1(nnnVjaoPNylmX3 z3-yoc>!J_&%ce+uTEtP8JW_6`fnl3kI7D}~ccd4{knXfi+TWY%HYOPwEFD;ba z*=D)6P+F>#Yo%<#*Seef8n9BCeH%2ol}5!~f$EI-J-@g-U0wtz)uxnZmTL_(u9+v& zX}7YtyGTJ{d3QC=R;EwSm*(8^Qbnb8VX<1nJ$;Xq%b|+eN~ujRHOh^K7TTe{jNs`M z@ayW$R;k$nh_s6^^MFj(t20tbdB3&|Z^(avCsae3^5WvWTdpri2i4u>=5lkjjur;= zWp*o4^Zdd!;K;L$OXk+@b$Uitoe@4C07`AeBn zd9hkXU?Wz7L%npgghUw%Aqh!ck8^hW5O^meRN9$hS2Kd2Crpkq8SUrF2)j6+dFkuA zBo+pnrS^7&+$g5@(Lim&+~0?K`)fA_DEjtc)LUv_NB!fdj|J-c49AngKh(kh6~piG zg**c-dVD5MyPkJ#YE|mH+7;&R_oTkNqyFEd-dhE=$+uh7-`lP*cfXGMeM1571~uWU z!hcf-{}zs;rk3|S;P)fE4)#C3zJ5$4w;o?VmXuqiSbw}ji>WZS5|6-`mQ5rL$wgIM zog|8}pAPim93yJ6@#a!}yt(YoEL3Z=r)O|inlCq&munAI%j3BjHNLRCR2gqqak`u> zk5_6djq&nwvpT+5ogS~RHW!v_xwMlRZ&aI=_twiZpDWK-8d~vq$x}V{KCwc&(({k)Adyq`}U81_q*NSJ7fmwLS+%_&IT#*pMB?Fp6&kYzl?F^HH+0H zE3M0{{PLqpr4SYRNW`qYv7cD|@uAq24eK;6RxeMn_zzUL{S+Jn|RGinC_4K0xqODMJ+- zhRToVx=CI7)c{98g4D}apXTw@7!jjj3dBpo0_Q~Yhk1!xZCN2)(RwVRwsFa_aPiQ9 z!F%bcW&L9;z8;HNw@53egwFZUJnB#p2q9&S#H>Bk zeJAi+Dq32GwTM@23h?e z?`)nP;)#+kfr8ufisxs<4UlJfMS-h9i6|2hI3Qt9xHmo_EFInOxZ>0fOFI(TB9j{T0 zI&=T>2H<)1)mKGv@VY6DV-m1_pDnz4#hPJ3G}9<#m_e>yy}EgIb8}Ot6Y_6vUfX=t z`h!Qwy}{s>Bm+AyU(@)zsM|0M|G{ka$|Wr=9nCI!Ild7?6f^6kSCA#E%Qbs4J%Md| z*`2R6?Z#4RDpOYJ^6V^Dy^T_%dZuC*xq?k&Ute)+HZ~aCK-+1of9GqqQ?%35leS(0 zml{}6Kk(Vxm01$|Ov0m=zz;aiXYF6M*QN@`ZT=M}j@y&GPU6qhO=M7)2R(E44Rvuj z{eUy^rurtSFAM!s+8*sr=J4m}`pG=(0FY{-(RWOx>t$@B?eytl z-cHw>uHAvo?pP{`ol@rb^|t6-cZ=EUZZUV=E#|Mg#X`VhF1t(dZ=$mJHtL=T=)OK< zY*M|OkDeaJI(XL37K&<(#d!>GrpDA*{f@lzexxp`n+BGtG4~FqSZLS5Pt?ucku9Q5 z9lfW72c@-{(lj25)-vy7eqO|Cm5C!gZFrsHar=SYO%+Jhoo0}bch`&qGi)I?DQw!( z@=E%&#w3p>jmp{$J)A}r?v)y$o54W&G^1v^^0L;UQ?BsU#F>UUy{Q_ZGc76LWFyY9wRlby^++_+vu6)??fA8wukLljjOGpKbVW9zIpyvR# zZ%RjK-O*H>^HZQMm1on7%f=74fOUixr6Y3h0La-{pMmi$Z28Pok*+SCUQB&Q3&*X`SgN}CF>5fK%Yz9@i@&h>H=sk&Wtc+1gf#vFX0i++5L zxf*EAL_o$|?QO6|I}xnWkln6Mz>JOjVxEt#h#s|oP{IV?Hv)-TAL#ZoUHFlHsDJ~ujw{IBm%wE&pI5|?rauB z(-3qjDu{3k7|*zsariq0(-WJ`Hec$^kE7h51X|ddBO6J#6PCD8% zhzn5_00l;)RcK$NipfK&BE55u6EaOtX38@Ql@cB)HPgYqlZezUbOgU_#>XtLF_YG# z#}IfN3?`y=C~KKcY&`FOK5QV>flU_O5^Xg?AzFpdi&mlcej#jV8Dm7^E-Rs(&HLC) z1h7F`4P&EYaF)HZUEnY{#j~Fe9g20r<3(XRbh5}HQ7k$GEbDdATMeBoECU`%idl(< z?4%EmlL_GCATvb(5;$iy!ky8>JpEPxf>6sRbG_6Wq@Qn(b$+v z{W!Ptk&_DoB&mg?Yfdkv3K~y3HJ6*nPMj=!;G~{VJO=chj?JAI^kC+&*j__X{K^vP z0E22g>^9ToI^LhQ(sRu-o6K-X*(tiL5J7U8bwyCAz$+^GK+$we^Br`-(a!J;IJbC6oF1$BSjV$ zCQv2>h6#-=z~_?1BME|9Ymw`rxutU2t@&gDX+n>0J=Vb#;gN=F8zvus$7~NAY!7s& zRE*9#cRge7grF=C3(8R@Z5ZdwT##w8?z@JjwrQf0GIfCkZwx#>F5@D6bQo^%Tw^_An@alTvn-E9ZOq#1!_CT!bDL^kLxw;iCLArD&)c_XuK_lqC zPt44DHt2p9ffwSFpwUYB6buc7st|{SPPj>;Nn8p5CUIiirA*zyh&KJ16Q}~r5)^#2 z7s9xXun*amKIap=FdK(+AciNU*QF)28Kg*mm~|cJ+CU6X3UZUeF@eSeNK-NtwOW!Q zp*tnuY9>92<-imU?l7skng@1!5zO!?3#vR+hqPPUn(Z?z+BCf$@^_)6bW7d09! 
zLArD*)qu{eOUV6gY=E(zWpWD)gDGu9E1^t1i+msA&}KqS5wKIV|mLvIGFVKuPZ7zfl25%su zbGobO3q56~igK9#l9+_TNYpzS1B$fUv&3fPBE#D;DUS%z7Uk;fykJN>W~Zyc38{`i|Bj;@La|T>b<0SVi0oGcq0niQI7{7$s!0FdQQ32o!Wl z3q`qpZsM5L1KJfF0%r57bx(j84Q@SEJDQK%9xlW?hJc=_9c`-3Wx#RxtTU3xb9(H_ z9z3*A38nj-9)NVB4NjyFWsmEtklq%j>$ymxb2!7a)d;6JUeC1?nbTf)wFiNLSH!(2 zF|mtBab{7I(~+j#X>)akxQ>23-N7fLC^$@Xkn0Ve!v#w?YW02^1UARBB=H>`^?JGz zPDHK9)pL;gZ`)IH^vilU#c_=;-xeP{3dk%5?NpS{-CGpvWxb zty{^6#(Po3B~)OW=VS$@N$H$P^mUNt)^7@2NALE92ZJ}y0n>iZ;c!2X=ksVh z7X*x8BWHr#lV;bR7^_d%cqnUrB+TtUA6e%DPi2@XKIrJ83AipX*3K2K^8; zsG&&RJ81|!TFlYcpauy&YM5fM@1$X>?L(e(>2%qZv2Ul?4YD0+ns+I6y?&5qAqD6R z7&Fl%ul4Z$NZWJdB|rc+Je)xU6TE!;8T(EVCGL|Sp|xFlaLj8WGbY|(*c3Di`O)Fq z>(Zk^t8@x)FmCw12m!^=f!ghq<@H$5FQl9Tb6_y^3PL7`6BCF^fw(;g%)*OI><#o9 zUQBcbpNAiPk+Br&HoSTXppMtg4qr1pz;Q5vS1j9*F_JX-JvsA2G^zmcXgG>|X}FCJ zw8(Hw6~W=nW8C|OybHfda|+VYME<>I63TPR_Q#uf1CgTc!qjJv+iRvhyy!%%AAvfB z$+LL%7mD9N8qp6i`HT>GED*YmeqQNi3O-cqY+MoDO$Um6r5(&cMT7%ZCE0wQ(|G4Y zI-R^6%2S-d5b{u#r{VX4k~{b?5(8iS^xg{!VV2pNZwfdc2IH#e$5oa))ks{K^h+jq zNjdMWpe)_%La#ei%9zVB=FICmFX7EPe!k+XynMk1+)BBXX1lZT41xVFKWV7cX1NkR z3m$w#R*_d-Df3(W^7OP@S;4O--qLZlL!W$t_7=6hB^Ta)q0UAwm1o$_)LKd~O^hei z0Q|7=m)NWC=~iN2;C>>Z)>(I;pCpyUH-75cs9)|;_=(@fUccO}GNE#pIu_zj&{{y= zQp2IL{8~0ZkHKP)Uz`Bl!uN;Pp)Hko_i_16JbxGezc~`ej;nS2Z2uYjeGPx#LcLGg)5P#3$FL5}jez*s80F@X*1%3g^A3^fZ8abg7y-CHN-4DfaUrf+j zIp_;-erB%!ZLY7IE5DcE!5@;d54_`Ma}%xi^s>J5YXq$)_*KJ)cF^P5d9&-ewb+(? z-ax~*4ych;BQb_+Z9tt{OKz#(#69u*I?Ant+Au8Y!l3GE4e8Pr8jAABDn3BqF^azn z1L`ZSWFmpvJNE-J(9g9}>V&~WeQ!|xPAionuF2QFG^m~%1b-^UuS@RiHvEqy)txBg z5c#f_`NqW9JGVBxrM?I%{mVfA+Wsx|q$y7is-rEXY!tWDmwME*<0#)`(H|*4|FqiI zimSsY-}D;ju;1T8R{`Mq^o9k06JxDpoczf+=r5d7k3Th(ls>vS-LJkLS0k-He8+(p z*^fqDBp&3(LI0#|zBMua{?!?j&w);VfycpCU!UKf&(q&SeY78^J=os|x=nSjqYS)? zt<)j)HB|7pFiwB_9NgB2+ywQFwGJMl2g6!^Z&wSk%)|G`M4?}%BZe;W!>NEg+G8Tp z`2YA*iG$Ljn{Vk?7vr$EU+~#ad(RHWA0O!-_UQ{L;Fa9(<8=YQV{6`s@v{DY!5{Ya zwQg$TkMdVqiAcP3MdGEai}8~19lWnw3jLjI99B1gr2Xk4{g9UbFH2oKl1R3O(Y$pGf%O>T zDQ4)9e~s~}?8$?)_kLtEi8uSjVIK*bV(GLmvLppXY?%*Pm=0ARtJb`1_>+Wgpu|f3!Y~ zH~U*s4)E_M0Gj^-W|4)%)ZgZRLCQY=xwV4`CGZp9Kl)g^KWOh(^rzdElJSM%$niAl z?nxod>%=boJ0kHX{!LrTc2frcAdVB@KLR`s7(5j5M}OStl}&qZK=&EEG2ZQO9XJpw zzaad+zoTv_3h2@IELs{tU*(T8Cx44axW(59eR>Uo=13 z?#Kun_T=Np4>!^NL6JWNeobxXhyB7WW%MWV-C^;U`R>mKP`>e?U&eee>c;zIp7QOjOOV8Wqn7_fFO!6y1`6BSb;84o#+Df5}NuY=N zF@B)_{0a3#j3@BhZja8N8XwZ(FzL)E)Yso1&HhNMyW@;Mi8(FbC5Wx-d~=)I*B8{ET%daBM%>&#&u}U+8aK>qGlRU*A?AjCuGaKSg%YCv>#8U)n=|VGr_P zr$1hMd$)8vX#7)d_m;$)*B>sXUaB|tlgFq|^v6y6AWnTgo)Bjl_TXP~lH$Ve`O_!< zYJCa#*DLndb$gPZB00q0E&i;-U*dI)?4(QnSILi%EM@U0=_V4g09ES}uSvJ7OP3K> z^at@0ruSe$Cr3i`qd&v+c-*Ipd~a`ip7=NF zrh0Y!qZ0I|9>efD!lg0(KQ@4JcaL9QU>V{b{ZCo$Px)ocx5)1S`$tet_UQhk{&AFh z<2qhgK813>$xkRh`!wx!K_60*XTLA$m&N{`UcW5%d;0w{?2oh(?e!n}3+u@Nt|zz7 zV?6}?_O-Sw&L>;v*G)N*2-5dHl9GJBb&UCi_ziaX{y%pF`6-cNs!K3mL*J;|mu$C( z?c39<^C_65eL?)6R!@FGB_v-mNxV2HUc-dSbPE z00W-#N!?#?_sBZN2=F_43;am}P;)$g1o(CVPwuopN4_N z^s#^YdV~6&d`b4Kj=G(_4-BZy;3s25&wSc2>7KxFyEhBLpVQy3J$_s zZ$Wz!IvfoiPpo#Olnt6+?#6~CSnf%w2Tgf;pu_&&NBaAT&(;w7Q{rLE`XS5gf2`l1 zY$!edqlN0?Z5{IaS{A|!`-%2`#FG8UzBTddK3S?K7jRiT;{J{Y44G+zH5W( z-lw2Uj&)1@cwE`7uKup^Ke zf~C$6s$riFq5lWvQt9Ncedq(FyFdA9{7W8kd*3JSI$65^VzqMG(seQ@L=wYun2k6R9=k`qw-v<`SW=|@r!2C$0|h*@*v|&nqr0CQz`XR z(4#s@$D;x{7WBnGM|qP_9IYes4~+cEoA_VYLH=$tiT-n>aZy`mc90+DCIH9Zze``e zb5#D_no#%X-u0q>LZObD3a=TQOM=H^x==q6ti*kCQdi`SLCSNV5S9N|?SF^;JEpX} ztuKCzc=DVMd3*!>s@Th{)NdY_l;qf^w|#a1>(TljFrCNvX>A++guKOB3L@L>pnRt+KrDpV*PWout_UgRTW5WABxlm)F(tg9x5KFR8^=gsPqBx&_4i0s6r@BZI=KQlnY(%H*zw!Kp-_52$QBmA5luCx;f-tL;*Gs>yl?A$Vp?QG 
zKl;l>NC>MoSMyxY$zrC*#d*2r(XP#v^9^B@>sGNmQxTR|Xn2v*|2yIgVS?NVX3=wWJmu2U=*%d>WJ%9pb_3AH)LE#yBE8E&CgaO)73 zVB2O(m8?^Ot3{93xk|a_Ic0Bx;%DUx*@anq#&zZkdQmEtF?S0kh4j}>2cTTHB{Sje znp3Murgh*B;@!*O!>V4x_8PE|^kQeUl~%QwV<7SSP-%PHL}fn!+C~K-9;Z~Ab)D)Q zdtY@cUd3Ci!oz^P#Kr?Jv+xr1|G#;x6Zr1=!n{?PnW+^#nab*rXgm@er?hVC_1$+I zyTg|BcEWZ_MF%N~T@IcAd!LP}(6RZPOcIVhrH{dX+nMlBO6QsWx6Z_-il9ESm*y~~ zaKsET(sR(>0Yp6t7#C3#*WgiLK0Vq)e>>oY*wzET7w}L3zC}@do$2E}^c6*~>9oXC zOs}nIvwfT4eQki6J;(6=9{ArGuAhQ!^jE`V|4Pg=6(@vooVim@>B?dALx-H0a{mNH;k6N5gmcQf+Q@e=tH6RVngUd zAQGW*Z@xP2RovWMv7BGXVP?-dwMwP@P|+EmFA9;HFHS7fAI?@z<{OK(iTq-j0is+CO4rp%&FSxGQa_ogt0wjTt>RDp^mNi8qM}mS0k;) zAWe5rcPDi>PON21Zt5OnsXV=c)~s(AX6KDz zT6@&(q^^!`d1d*c^!G9HmqLWl1!v1|F0ZcsWp!nF6`?aj;U>9HA-C!Uw^po_&5VUN zB+qeY3!Yhs-A21#ABnfZI)~&4qSR2$&W!t>SNH6 z{up#j1azcR8_pcHy9Wc(E>#9iitFe#{AAfH=FMa(BbI16i*&_ji#a$}y?*LOoIM6* z;nLukdwqJc4T4<&<*!er02BL;+xW^V=j<%LsLF}MR5D6=c?_(qqD|t|-M`tyf8fX( zf+sgGI0?bz1)gGL{(PlwEl9VMC=_Ek&nf1u23)Ppl`IvIBudnnOZd!MR-v$M*4@;a z?xIp|R&W}EJDkMcj3E`s#h{0XEIb4yGht8@l(lznbcG!lO|7Hy~`;) zB7g@~8mGDs@?(*+=vO^?k$T_*Wf{pi5&kA_YmQVQJ zr=|Do2c?(XYZZFwo=m=8FOicQ@Df}-USe;%yo6?kY}8B6mPy~s$$*zr-Cpt$$HAscX$*n0fb5o>2Fly1sTt_vC8&qDz}lblL?boEK%vzwPSp?5#2qHn}gu=RL~ zn6jw0r+yqax~05L$%OA~G8J@|Y6|D>jC_2hI0GoLX3|rssiOyPJ}T>&#&d2{ng#Hp zcGC4Mr;4w#hBf1zvNSP=Idu>vZ5;wx*r@(e(bH^fG6x+;D}YGLs!It6yD3KM(VC`u zHj}twj^&oM)+Fnb{2YoyN4hDbT`Y!UA+wM78Wpay2nWmV0h3EIE5MGPhHl{&JpZjE zdAR+SRVe4_CFL}}S!xBYrGx5{$jN5iLLD!2-R*P_Dc_a%5_U6wRO?;?=A9hbFt{`b zM?_>iDz;+WPV;SYPP!wbf%#NKGzs>pVM5_If`kxn?e*7vV!sf}+x&Gzm>qaS?C4nY zM}2@lB%&R7Rsy_n0(UgKS|YMLDjvi|NE3gNkzuxlFF!zN?97PBP*lV)9|>bV86j!h z=&pq(AE@ykYJ49f`A&`QHpqY7u2hq5xRy!q-rFQ8CwMD$>pF6@o=ZKRT?)1MdW#Id zDkgR=Dvf(FmSf`V(ojo$2Xndt`w7;Kh*(x6;#?f`%F8hT&&9-3jVRwm-LxIy1^#Sf zP)ze31@JG$#rGS7gLJzWefNpD_(L4@gM*i2eN(@peCBCcHWPgM16>)z?SA4IG13%gl-&6Q(OpF5m0pVYGVu18-a|hUg)--+hg(;%Vg30P%xH;@lpw*9(V#_Tsfu#3If;iM3()Ys5cVlV?ttz$ao@ z$9TVZjeOXGe1N=1#VyN*czsVqts(0zL;OBYc$5snuL3_#cv=Jhwjo}L3zMB|-YCR0 z=Y#nS%li_qu^;A+7}2-!&-2>HlYQJq&?A4N4FlWJlt%#a$zG~ie7^~Mqm8)2H(>+h zKVu;Mno=I(XFkt(U;bX?_aO0eyyBs~Zht?UEZ4$x4H*x%e-EwuDW84&T@j=Fkp7JW zuglsr>%ZC^GH3^R*I&9*T6IFR1ReTHiFp3vsc-$AjsAaBbU1zjEu%_HFmvhacB?dg88QNA0`6eAhiHZl4UF zMBqOk0V)@=E_B*5t(H~w*wa{|4>qa?Yw<@F@ZyK?(=TdWBvQP3JkBYFL__asT;nj} zjgp%Me78-6m{YGWbCS&OyJ?TC!~d${|C`S#C*AyHUpI^>f2Rz^L#?BA)Bb!N{@-$O z#|IccB-$dn`LC0xYMVmN_NxHxQwY6B{LBpypOo|JbO1)vA_U|dY?%@yQfN}(Qg{0O`mKD|F$j~?}Y=`pB}zYo>EXnH;BebQr)k9^eoq3QLg V_eYOGKJrh;QPb;D$5oG-@1J5L-jDzQ diff --git a/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin b/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin deleted file mode 100644 index bbc7d301593f72433c5b7626638d0e6f0f4a0813..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11656 zcmeHNUu;`f89(-Qnm59nx<+SDBYiKg0b-XzH{#N z^|iB=G9e*$q?~)c|9{SRzI%Od9=q$vr^BI8s#nMn7QYtjYE75g0SWx#R(ZXD=zcLL zW<(F>Q4tbCuP@gdb|XKJm3wWjNSLp=`f{b<2)(kVmnw@@p*M<7LvPrnMzxZwRZF?W z)q1gzTc}pns^!&23D~5^eJyQZje z&aV`OE`f5Xf;HI+!1*>rWi8jD=j!=-T?%c3&m*Uu0v@i`aB>Z&;kT=t;#k_;#O5sI z%jG3IUt8wn*X(Me+Gy6`Hqok;dLv(HBtIz2vPiCb{$DC4w~FpvDX!?%#l?EDA?sGv zqf1AX<#N~*7ryuCT}N}0KbOqq%cXq1d%ixZO?iagr#7#li+o6>YhsTy5$o8gr-8ZI zhmEN9pKwiF9Pv?nOSH}4$OESdSGq;5FutdSP^<4Ven$uXMaH|`yhR=llm3AgLahwo zzd zY-Ct*X+VWV4NxfbkLUpA5HKP%-B_tjH>&o+a;Z{SUBD{0l&@E-m4{0CX$SlZPR+Dy z4?nWHP|Q>+=5lGZR4ixf*+=tBYpKOXvz{tEWLD?v)_it3W3A2G4Xdu#nj&=d>)-zE zjz43^`VqPuk&reLdPD0E4~Mj7I2pRCt%Y9{p*LQ;^4Gt;@!A`&N=hhvF%$wN5z+>T zb1i%cuv)f|pfK(c@Lbix#NH&ytE^h<4gXOP25i0IO%V!T>Os&y>@;srYKD|rdPGyw0xMR8)#ahX+KRNM6T_n)iS0l 
z8cXjac$Fp{)2rGySn+!Tz$2pizRe+MUnGWUn(n13Nz)f-GHCidO(v$-U)M^sK1LH~n$r7v_{lJ~h{{G^ zE82BD#Uy6*EFO09_ENEtsITN^lX;=%3kBQ<>bZL9(PF|%6FZ)UigqP|TOU2fBy`*n zmMRIuO6c=5iKTLNK3~q&agTZ6!Gol{R4>h!iwV70Y1qxg`!~S-La*h~L=*a|l}_lj zhMmx*M$gO#XvndAE_vX?(q#BFrOa!yZeE8>$|v)m#hwds-FZ;q<`Gl>lzk|hZlPcvsKOyL3!IodlEV{kWg$0TTq;$PcTmZc3v!?I zdBtmF4kR9U|Gjm023651Ze6Rm0devOU6ou}sjlg(G6W-gAThPj$d?Lw6K>a+%eo5A zm;uEmdKv`3$SerRlI|P?zwK%=lgtky_~#dk9^UZq2NJ1+qH4PqOp_=k=radJzEbe; zDObYCz_d>w?Fzs#$oMG8Bvi4&00l`RZl=S$zyn-OFE^BXHG|zVq1iAIPk<;rgz|1g zcGHA@BKk9Awvvh*`IBl&KR~wv1WKxHV1_PVE>fIT6L>2LiB`lZ_FIwLa*^`Vij1^V zq{oupPT|awqyfi#RyePOH?gZeJZF{gDfsKdQ{*<`({R^^=eQZ)ZkNNBrPJg+VS(TDRVrL#H6^gEGfqMz7fUhj6`W$;vpl*i-X=D#}5#`*b z!4VKuGU|fb;lttfWJ-Xn1)O7fZd?-CbV=$+xg-j{X2=;0hEAJHqVQc3QsXC)3oZ#E z_(@b!E(tFBNmNiSX||n2AuE!J!{V2sVsc4=xC}ZME{*z*Y&^&`a*LRf4slX>$8MT1f}SwbtT6 z`q=&)JWSwzC7(j{BSxE9@?s&suw2aH>As<_rMdB+Kmf9!5ui=GW2Y0RaEGJ7%*5@9 zWa2?W7O08)gGTnRBWOA`!Jx6ltf>`{tGk84h6Z@Bk8aTB^#skeO(kS=_nGm6&iI0+ zb1GKy1-)E#&s&;hs0kP|vW*f`fQMCbpXP*uBf!81FNJi=j8oD3&OGZ)LrI$Ix<$xv zcP2uX4wsL>TS$c2c2yZ}0z@#l+I=F@GEvev{q6o`WJ7{E&%Z9+qvfV@Ml~MjM8O!G zsDgp>M-8;CXUW!_Miv5%hx6i!lEx1F+_ zhzyJeW!WFkE{;j3EN7FpA}Grad9t8(%5t4%U_B_y4tcWZsGYLB1GJYxS$4>ir2?it zs2)KxY?2;+GD*~*?*^7(h4i>YRpyi69VhB6JumqwyrCpv(Bm2%Xx9pGz8_$1R{!MY@Lij z!4C`WbYLaFKtA-J=tm%t={|nzoPfQ^h+LEDNZhy1M);d!WcMJE@D_ZJ5O3ZrTv!uh zLTrwBa9H4ls1FX&i)0@^D*>z>!#nT2TOx8}pSXxu<(v463=gs`ewhqLZ6YBe1ASsA z*27_}t%&+~AN-5kg88@J1^iM>3_7Bh^ws7!HG<<#Sfh+Y-y{3v&xjy;Z$nlNg!-(Mc zVUOy;`)q{$oN!`WRFh%Kx6H?24KV9tcsNG&BlS;%4A;gOT0 z?Eco~g&6Tep5alsU#O4Kaj!lko%1aeJ?_Lcs^3AX5BPiLM0iV_1A@x55T*Kv^Zp)3 zePDlYb;jdgy6?XGRDFqGJ&I_~D1PMgiMrUu@srTy3j~k#Ar)LS8+VC)US~yqUyF&e zP4$hBitch0@F+g~VNLLUqQkp61uqkf>OQXE{lpJz>wXyZaM2H={w^sP{UL7mMd^gg z{l5qK>K%U;aGdgi{YSigbAW4c1jh9|-beXU^^g7NkJ9-XqVw@mYT8)SViUf}%2 zoq-{8ozD-~%V0Fo;*UUo{~X|29Ono5nsWNaDPLjUk7?-N8QtUZ|AzKI9^LQ4um|~p zIl#3z8VlT3jBdQ_aBuXVL3zNWSBWnFWH}S#$6NG> z?z1AncRQ zIiD;oU3r&}+oVD*^JK z>VMtGL(9kTM=YNtz&F;X6yPQtAnjkf6(ODmj^s(de|YkKB-Z4se{29jxKH?zSiAB` z-{J07Km2R~e)s>118T$`WzfC4`6D!ic0F=b2VQ=I23-T{>VjmCe0Adrig~cgw_lD> N+sV24aj&l2zW{T-tULe! diff --git a/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin b/cwmiqau7t5rssvjroylm2qwtew7tkyixr7l2y5x22afsem5iac72.cubin deleted file mode 100644 index c5783aaf76e89eda9066225eae34d4470b74a950..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10296 zcmeHN+ix7z89%$TzQ);g45UdDnhC@Z)O5TTU)JGf2!U1&A_Xc`C>>^R_C|Zl?ATq~ z1U6|!tEwVW+lM0c0riQ8wlC#@N>zp0f(j3ahyDRogerv6)OHC_LD~Mk@0>H9-L;Eq z6VwYw@wt5W?>pc5X69_3JbvQta43}O6S9THZ$wkgX?XJnxp?!S93S5Ku$U2P(To0S z5fZ|x&(%HGb26Ceaj`7dJlgfSO0FTS%7Rs_6sp4V@(s`O+@e>l*tKfWF3i{SIoqx{ zUU4CBXB;m(XIH9j*)HX0iyo%7=Q_nou`+8X$9*~5lTe>?+Td+d3OQA;%wV& zshV+0aJA_1I$N#OJ*VQ0Q~a!4J~KaS7hI>D*NalIg1MV7DWt!4+5nXWTQU>gt~>R* zWZDGoAl}^!KCI?7Y_9_axHLfhtZGD$f4q&^1!ZD+ziDeY(a-#Qc9DuVjRUYWy`#1S*Z zaMwY57ZCL*U|d8|+<-@c`Rqs+{XKviVpkXZA;3cc_zp$!4W^HE(N`6{rqdFam|k14 z&h{OK_pAfd?0JUwcESJ7aQzfqM}I9$_7AQD)NCH`k>U0DG?l7fF#T0s^ewbQ3QBWo zhKM{)V{GVbvw2wbx6U>X_w!T;f*ArzFw&LqFJ49Wbv>F36 z-9z1d)ZI$madfMzt1$n1TH4ex2@_{%IYM2Xx`(Ly0(JYTdyJ*>^cq^TzI~WoG=^yH zQMZ@61#~N`E0?9ePm;eBB7`nDTX}0`ZS5~>t1D{=of!%@$$bjBwUBq~#cIV&TT^(! 
za@^UxXV%O1blee^lgpuG*X??7DQ`|v`Bf#_v{1BXD`vv9GHFwm`qd zCjCuL9Wm21rtzm>Q*pq?fsdcL`CrMR^8QF->LcZ)NnR52r=>p9n@-^`A>rw9=z)*L zdNEVVn^wN!xr^pji|1is)f~Klnb!Q|xM|fq*X)YNR%P2P*^VE%@;VZqfR5xRpdNcD`24&#V zz^HpmYH}R}y8z1Fl1u_74xO;^l~u{w8GKPy;zy}ulydSISQ$l|z^Qv=yNUnk%^L`w z+`ixh1d|tdijn!{>Vh>d-Aa>R3I0F9wM^v5Xi(v9-^FX*N|}-;#6{eND(wBgxci9I83a%J_cR` zBk_d7S47P!c~+gbsfJPxG>8%tlj%5lH(#>miVE!}V5`DUC8nm+lhcW5a@Ez_oWwH% z1RzVnS3*d^Zc<7HVhX0_ay}dnlu?CELLuoZRedaFjhh5bijejBjE|97!|o(g%Fu5X zb9qgWQ>K6dezHb=f=FhLwgUr+UEf2Pe#DG7$PSgd(5@geqqT&Il06tkXPa_Sz}3dxnQI{bs44*#_54*yOh+C-9utdpI~+juketbq4y z`MB?WN_x+JP=d9EUb-jKuiHyx)8sW+wzkuL)!OqI^bu5Q>pDF@!`$Ms?fckRMWTODcHI^ zMNC=Lx~F~|x4NagO^LYgYa$tRmTC&;Zb3f4lAHmQScTM7a_ZQ`9mix@(Rj{nO0xi7 z)K9yf<|pt+bq*QjvqML1Y?kCZ~ ziKFZ9S@}wiUQ^ECtEHajS~{SviJVNv%`f0}uCtx#BIVoiZo*E+k89m+K-tNX4Fk&q zXGCPIU+jdc-S~^_l5^4=841j%BBDvK&kPX?zY)ZRc>AEg?h%KDSlQ*TBf@OM8)8q} znm_6T{2|fbhG!+f8z*r`v#%v0`}@UXxCm+DFETvDw(#W#2#vjI5gF_kQOt+Km`_GX z8aKLYp~;78{D&Gpz=$p6Mt2+Jzi!v6Nhe&(BzX625?xdX5xkYUYZEyNe8q=z%b^xu zZ;|2GMaAAlrSc%gN>rR%9&Cy4Voq0JKgGHc5i5#BT!?{Qc{vK;g{XL@(a(2Lx9>)H zfj`$65Hoy70sPA`@q@;|0Nw8OzxQ-Z{2>PVfq|>BzP(q`Ki)5H#~MxYFGpJ9+gOqQ zIOHuyTjD9TuF-n9CBCQDWPgM16>)z?SA4IH0YB0b-&godREz-sA>m(qx{vhla{JhU z)(oxBi=l?-o4HkpTIM|i_D62SdbUs~w8Fr@3V$0$>j3!0ZfIeCx*vqRSI_r}H)5iv zK|4$U;NkC~#!m4f6048+K|k6U+==uQgDmeparTukvJ!2dH6$)XfRFSMK0^2lu~1J# z-Bpr2maj65)`;(~mViG;0`Z6SXd}wA{CB5nI| z_Lcq!TeRJ(3PlF6WV|&TJ-l*^2bs^A?F#QPAk0D;NRxmx{bM}?N z5%Qno@mv(^p<#cG{G0G%IIGRuNW8dkKpgbK;h(*9;}o%o3+J&m41bOI_t)i_6DIKS zDArNlFJ2=bb|4=h?+J0|iXq-O5K(K$dfO1cj}aavgYav>j}e~Mz`tXN*J8qC=bE<( zF~j*_KF9LD%xmn2xhG2W>-ZOV?c>QlZo}x2zx@pZ+tHLq0P@LRs#$!$348k+F@9VMJjBm@f$_flgUIgz;^%n9LI>U6UN%{-h3OhH9&G;sTK7^u`}VscO8FuE z8v|aKwHelby*p^o4)U(Qe4n`9-NAKo2eH7ud%r{zYE462jEScfQHHVV-G>5AcwKJ@ z|5i-gv-p9ay4z}f%MdTd#2z0HrvJgUZ6EvUU3b`bKlm_yT;u7<`;Q;9ANb1s4^q%R zZJ$QqKOzEDE@WM3H)lpIYwEG5u|ywiR1Y@d?^nRfAHh$*sCAG?@#^t7rxX$my{B=F z!-zLZZWi#JHW6Y@y}ryzGQaPny?GP<*A@TYd`>y(GxuS%_urOn2KNTGwG-IYt5(OKiY&py#YU^R44x*H{n02 z_~XhT(ofIJPX6~d;csrBpZx6PzZU%X*L4PdtoSMB!KV2&jDF82esx?lScuQcss2rw z?5Fe7lwiN#sq_;**qZz@EX4jG5OdNMZ1n#I$fi#IpD6wyEzLXmZ<6^V?(87c{9goK zw~g|T{HD0-|Iemr^>|V#)T53cfw#k__ebl|quwt)2KDjxq1qQsuSdO4dJOWBk9t2e Yy&m=c=rPDg{^>YsdOhm6>QVFk6SbnqFmd`Kl z_@X!|Dk6&eoCpb#?JxBQy+M5uBmEq%NSL3s{!*uDi)?2t+wPoM71=?{9%Ki-_F%PB z>#nwIXU_Fo&01r%v$nc?ZqNqSbhz(^ufJ69wVL<9*=zM%y)}sBM7GyjzB`A&Sid{Y zW^3`>+1i<2eWfL`5?F3`FeY08xb8!A)@m-j*01;bQs_4LI&wNH;NZFgyEd=~Uc1UE zj>YFDiqohsFQ4tzyGxw>Zf|w4Iv950Hqll){XxAm$bDFrWs_|C{?pVLS$W4;eo(f`5zgdNp^+ zyThb^(nY9|34ChSjca{c5xbh!6oT-_ zMzlzTwiS5;#-waypW>ZG9AVJ~6bk)lW)k-hFd}q(u+lv~SnV~I+MVXP21d2B_5SK= z=jnF+c-|7Ck+;6ww-)WL)m{5?FTd94_DlJ~({^F0_;h1%wp@Is(JnVyE9ag)V+|LJ z)-zUrCEFc}(6_(;d;c);T~sUqZ{!sT89PHaJ}{mQ=R!szJR36B!f%Ms_5Zp4?&c5P z1qSG637rUMoDt)$MnlFVQHJ4{L!l4w$lwm?NN7`X55q5oLPm@Phv7{TdjA~&kR67v z3J3p^A&23Y!pOi991DwCN`o&aKzMu>#$3pFkZeChw@={q1LGUd(fGT9?ieEo<31X% z(Cr}Ip1|#f6!{Xt5%O%FMqj{f(>Sb$?-@BFW$E^7;kTeEBZp)sM_i}r_K$4qucfX3 zWZ=&wP$Kd#<973gQK4~{#>Oe!-!uM>ISB*hxE1L}%*H2iyLr=i=Qj!Xs}S9fkl|m& z=(=%?U>U~|DN$ToZ0^-*G9@sbtGIw$*!QTZ2r0rB=<=MYi5- zVn^%O`t7f@GFA~in8mrS)$3%`2AIiWZ#&z`m{umcSjn7SUR|s&*ZSBkpLi07%IsRZ z-(Fm9WwNc#pf}9ic_}|EvfVnigG~0ERm^0&gI*>pH9BTKL_>|`Yq?_|l_t}psgS=r z>-lZS6g)Ejzu1f72ptBW=?vP9^8}tnI&pr^Bo?}l6mp+Dp>T_cDY4Ld#5D6R z4)%d-K2kD)6Z5BPIJ|cnwZ+!ib|?2REt< z&oMuUX30-5HNh&JfQ6p6ozH7R@q}1B({k`g3oJtu0)&WI?Kwt(0f}A9nfI} zwBn;!OaZ-+_u(n@ULN>j&WD%U3GdfKarczdK1zW@&x2l~T;a6a%`OjAI9*P0FAvYF zpazLfmf>5FCJn;2BTqfnko6}VE8!C#RASBmJqLKV1Bgobgq7Ydv@Lc_EN%~>O~uSP|# 
zk#yH@waCdLy6Mwjlri%?j}CFl)90zvmAyuWs&45tjl-vS#?TKSv@e_xiUx9G<=k?1sja74k;_NDX%-8nRVtM&?umLcyS$o}3VKBj zh9-*A1U1;=95Tm4Ol6eKOW1>CFNH%PO5P-UUJ^&bqzVe?rAX@#Dp*tw52?h( zC7~kKcE#s@iR;V#PKhsqfwS;zyV=q)G^vcBkp^W{t1V^+yPE_Uz3Id`ghwI29hofd?CZQoHi}1fd}BeHTH6`>(0 z%du!#T2e=4*)=LwKvphhjx5%TQCTho)iNl{E$qlrb!k`9`!6-`4~eRAZDrlD>=8f910sJFfLm_#-0=8<+UiP{KAW(bg|^Aw3z zD6bS%Lz9qCFG*Qp21=LQ0?_YNdaNKeR5g07`k>WaU2ZpqS^PBCpdTOW%h^-;haYaO zwHt%la=ky`lZ=!yvot52-2{7r(sU9bXK;3rN7C#m)2D#+HR_F}Rt+z$(1)e- z7jd>Bhw_=DnOx>cI1A?-3({GHzYS+0HR3EP%9^(2Y~FFU4SsS#&& zjke`%!Ev_Yah7%lud~$R+whjeM%+c$Z$Gz1$6eF(_-h9H*2FB?j>8ZeaagNDeoOb< zxh2=(qSs-pA^r?w7H-F3h>bX`=K&nH?%83>b-3(x7z>oo;k=&E_qH^U*oebeoODZ+ z@7dw9<8VIbahP5Q{0{5JyGMs1HsY`@7BXD9XNN1U!v(Ly*mB^o|Exk=OOH-vbSA^w z2Jr+~Xou<1d4&fh%mnXAh{Z}E(Z=K+(n{c8=nLk_0;J_yV)!2@TEQcs*v1mo(h9MSTck@0;?Sp;^Q}FnT_af#V*O{70uZ6bkpCEb2r1tcrKs`A;6TshmMsZe1-~qSF8!I@GTT7Hg(=dNYr3Jijwo zEXQyA=q8T@Hv>`4TnU%E&Yt)$#+>A0zKd#r;eyqe}c*~ zaZ-#=5xggdx)NPJ*+x8wCEMhlfx?Kkm&v#Ryl`6H3Ki?RZk8B4N&DADu%HhR*Of` zW;yl{fqOi3a#*%V(*3=#$~ml#m^jBP$V@IL8QavEIw}hn+?y;jg@zncew;f!XmkMC`rR!1;@b`LO%foK~kve?YlN-*&b5H<5(; zN~ylM*lVrf&ql%P;s`5$>NUz36#S^*Q3qD)4e}vzK7l}lh#Zge*AE#Q@04T4r#$0w zM645iulxuiBJefB`-dFZ5WDa}+)fAPzr}mtkcj*ENk7ROpT)Nq_ic&D{c(|-65?h2 zi_A{5E&iwhY(_dGB9n0el9&x+Y(+>LU-S(H4SuYi|Er$2@Z{h?qi+_xUw0fJ0bD`+ z&r9?OUZH>DkLS;aw%BnB_JJ9Z9x4|W@a)Wp=g&`Xi5D=Y;JyLaj)+agA~upDYSZ@+ zxVDf0^!3Zc4_{1**OTCn z#ZYDO=!D|GFd-f#ULoQ1P<&bvqu&eDiRS z|3D1?PVVA$it^he{ri3T z7wpL?N1pY?_d4)K68iQ8^zFwgh)d~Pup7ji z3~?#3Z<^?=Zz(A*eP$2mm*K?_)(iEG+ad65yeR&DsJ^xPD#3ESaX!mK{VjXKSl1JF z<2dG5#CQ&@TURmPUOUM1pXMjef8;Z5@7f}Gg4P4TDXd59fYp44Jf;swdcJ+F7m1R; zk6y)kv~dONML6ZIFQhNEMe(P2eIb2wVXPF=KJvTsLYU{n*5;KY>_zt5v6L>av_BbH zckM->pZUXby?{QH3l6*fO-I&!a4Lci2_^kCxnA5)@lgC6Z`_R+`GWo!#iRA7?F7>k zDj#WklHo}Qo?>{)fp;)GO)%`G?Ky@yeyYcX80QNnczwtEwd~}j&Exi@ulotbKlc2$0zrf808Oi_!H-TBYzl9@cIZ51y4=-={sb9 z5*_xe3v);?Uy6VCE-yT9$GCr$J=9OGuL)@%@D$Uh9C(u99Rx%FygkkEoI~Hu@NNmi zzU)8t7yUGECk(RB`w8NC{(NLhTn2*nW9-){duE3AN8aDSkM()xDZt&Q(3Qx~Uw#GB z&jUgHfWO)o6Z};yP4c%NBRDoo4acw%0vu1$gkv}v20XQ2pyL^i68~hH07)l!dM_z= z;obWL&TlT9*dwq*61));2b?o0`U(D|?HJD|xnCa_m@hGBzJUJ1a2K4zMe<8CfbS3U znBc=ipOc?tGM(VPVLCRt{OPdBcwu~rd)N!_3=7(DN&W#|A7X6q$EX+fqlfINna@A| z_~Y`t@EFbm3I4d3=jUrzDBlx&UPvL)s1Mc`o^L|RqkgkfrK32NItkU+7{u zm6(?hufN#OI6vxqG<*3oVw(2PNE!$7?;nExIPX^u{R?8|eur*^u%GdMxMVlDqgJ^0Zb3SXlF{I`-;!5#pgOo7Z|g^%_fu$)Hc9Ag4BgB?3HGkoVwi z^sg2naEwAg#Dp!^X@>v~vt`1XSmqxTwJ zBigIH*RM;Ud(w-xhOd*DFhi+7uC;iMTl2`BYD#AyR^IyIXc%i;5_;N zsQ{l)@;OickLw@GDY>{wzW|T9_K|X2`l<6r=NvFOCkC RZ?7DqZX>7jqn}#t{{RQ)El~gf diff --git a/cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin b/cx6i7mlkzaxbh5vk47jvftmw7ls63iczwax45psdovflgeuxo4z5.cubin deleted file mode 100644 index c7bbcfbc8082bb2f81a0bf73c6c8c5e63be6fada..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10816 zcmeHN-)~#h9Y6MUg3|`)2T=(tTW>})%9h>uM-scGD}h4Uv?4mnUO?z_?dv24$9Au8 zT&JPbDHB4|RHp4^NSlOsLTt|yn_%h#RCEt$|G*?vp=s(m^d__HKR~kC^N&&*ImPP3U0Y%R>}o)VWpb4O|xXV zg<9UsT5fL1ER~&QvzT8jxR{!*V--q;(xPeEwwdu%ZCgoo$#U}cPN{M7)x1-KzBu=A zu~^PpMZ{ciIh-q(s;*UXXDE?|ozJc;nhTD#oL4}xP{Q2F7bVeytqw$~W(r}Vn^mh? z6+&C!EhN5Mz{6GCy6M*8A=$+eDKCvmA;*y5xMYel5s!et?bh?#o6TCnewV*ubfWS zo~oV66>NL1=-RHme7auF)t9Y&dd8@%vd|lU`sv&KZ|-4i0HKQ(327st&5u6NR>Q-g zkF;~)Kd?}9L#qSQ2B6MT%iON{zA&Y|Db25DYHx6$0#(C#9LhsOh5B=BbHO4M<^b2Pi1x+l5v zNt$wzQ#9L49T#?KMj}_WBQ$-GI$@jaH8+}n6CqQIdd-?wo12?|-GpP!O--Y`yi9qa zMKWr6r&=hN^o)_lBb((c=3TwIY#xkT%)nz94*RNEEu6{gb98Xa!(BIU+%J~&gl=Rr zx;V@qPb8kuzooAooSo9?GnbyyGc;!Kk+^gmxEavnXYc-3y69qfJdysl`ZA<13H=9! 
zJ+hlg;gb;f%na-x#zM7_E#`G2UviyQeH+K_0cKPzJk06F%G``@R9r{zO360Vn_RgW zpSmGt5;w+7^2V4+-54`7H^$7Y&rB-0osyL`HRn@!9ZNUO?xL>~r%GmK0 zY!!S1(+4Iihcid8^*Jhwzy>Cr!!z+V3a$ZVA5P8y$MzpL@wKs(Gqd=LSc)H_6RT*8 zEofvVZ(?pre|!fv-4l2B(2YZE2XrT(n_|J`OF=A`YsQKQK7kW!GUr+a+o&Vp>Qd2= zX`!$t9ikuXk%d?4iFRQpecEIW^WuWXRs}u{Y3fptE+IhEP6kVgrYoP+8lMXSZ@z6Cx z$y+HJO9e@Gl5KioHj_+eGAT-=qe>}>ixBeQN+FYe0&YY>NFoci?J2}f47{L%MCOWR zk4ly4B*Bw}FV&T-N8p7if=SLwH%}MrykfAWCx`@++@RX3q|z3Z4KB!f1%kv(>G3*N zL+uV2mzep&tkc_X8jv_T&JvenB|GFsl;GCXcplI%pzot z+1e~m*^Fm4*)~f#?97EoFLD;h1y8Vq|l#MrKz1(_|hxBAW!Jyy#~{^gIqKhmO>XoQs<8juOHD2T?+*D@xemn%Yq+-GNp7%u#aBGhUQwUzC|( zlnK7GZLuf1!%V2s{(P%RVqJma^aun>#<#Ve;y{Xfp(f(KSQ9u)I+rxTE7}&vQIBBN z_W7K_LaZxTFLCWA)}eCBl}oj5p>?Qm26aOvFT6}%uW=p?TLDzAl9I+;E?d+$+k zrYSn#z*KBLI;);?T*Ip1ZD`$CaL*WunWs9L#(CPCA;k+RNd-nHt`tjgm-&H#Yr{|A zCmUZ6>^w@6e0WF!2ZxSj;?$tcF4T;Lm15CT$+Zz1j#)flbkX@H7xWYk->3LH7jRBGSC5zZ6#maX&=_ zC6Wvw#`ivs$Igqxe^pHB{2JkTNZ>>#DJSoGk86oT?T65O$)=~bv-n0?&GUnJK)(F5 zvRNlz!?SL%o$X@fzm0S=1_gHt?sj0=%8?HPYXb;`MW&&my~`EQa}T81uOZY2%IFrP1V5IsQYApJ1e1 zP@{KOln>RuEGI#{(n!F$36uYI zvED6uh5PO_=wmH*LDI7^HVXQu@TYb0!cAm<#KCJ_wAQx)UwRSz(H8rG?}Fn5|z95Aqt?YYeh05%3cgjriXudHHUdVCb8y#~5+Mg#OTP z&VYU%2+9}NKg{(H3wrpadL*zueS!7q>!bCFcsw1%e%%r{#RpwV^PmPtmmvdg1z22Lh9L-eQd%Fhky0be@dH5!^|4TUG1NI%c7fSK-SA|KrUeFR6u zi7+qpJ8W3|76bU5Y0#NG{sjFFN7JZc_^MxenBC!sb9OgyRNoU$Op_~)dVUe`5c7Wh z0sW!x5F@pQ#;y&q=T}*j`1|+b0wB6N&xn3?h&{Oa38Qnp)p|~27l+umM~BdV0XEH} z-@pG}^Vp-0;YUWlV82L7iqK-*sI?@qVoCHRcI-V~Mi{f+i@ zFZo*~QXU?jNKl@BISrWLyt=#=m!tsk(DNEUUNMeyF2>TJ2XzwZ1^KYHB>B`3e!YeKn7lT_{IFNvcgK2{AXVBNVmAIIG47WHwMP7` z_c6O-UYYmdXnI)6za{Y`Pp$;z_h4ZVLYIBER|x82@M=WKt95)5!i1Zbd}`38yjo|n zPZy1nQ}|zOA)o0XPbC$U|Kk?&C!~B_Iz;yQbFPGy{Rrak<$t4xec}t+$D=5(U#f3^ z`fPu@8AUxluwHKVfO_XmHY``VtUmd#Z7rt|7#4WVf23#YHC#ZBlLFU)cSDczD})| r8vXY8gHkY|QdX^#8vSwZQ?4q=ltK#bI$d(PuzXro?sx5*}%jVWWQ$fa!$kBw+Z0wVKF{@&BJVh6hHE!n!*$ATt!mb5Wpi?-VHZua zYPscE+ss>Tp=4HT&a_#vr^+s-rt4VcYPmXPmMo{}TGf)7%6JObNmT>7?c(|>bL@ug z%tC9DdplLB<*f>`PQ|3D}oC5B|IN@dEi z>Lnh1-Kn`Xcdm{IefpB?Pyx}YSMvTz)32}?RkuGlZBHAu$;pQ8ioTaiOVP0!u-a!$ z1wVM--S?S--%OfTrEH;p(GvdUVs1CFyL5C4?>vFWJ+0Qv|JnlbmR08l@E<=8@*Mq+>0%qov)u`2~hssu>e267VwP`!iM$y4hv=Vl8wvn)E zZaGmY=M(igw^XZUj8rnwD7*IUb*pgLnz9>0al%xp6V6Q4E#rJ-h4P_8Gl%W0Ggq6d z9hrNq<~%l&ZDwZ=S10Qkw>&m?sBok>b#$(1OB{SXU2A4I-2UIFTi&I$nGbC_wH1KPEu;&JGE?a=9eJi9~B(U`-(#ATDfje?#$zWbl)qEqORRQ6x$%aOh` z^zRh*$Zjr!|CGSzMqvjrmK)`K#nuhG>N<1!W$*DL%&1#DsI@BNCCq9yDBVt#Z+5BKStBV#pMw zu=@==M9;0{9`?+2OF(+X1YkE_Hwzh9=m;D28%DoS(EgZ_bi@o|YNeR5cdwgAqwvV8 z7QJjEpRf%3>9JAxT5~)K@&JmU6A2%&>PE#i8oYva93UWrDU%(~q$$jqicu;{vXg?- zf}Tx{<z~t;IEuxibnZgoTLzpDi6Q{~pyIW!Ovf|_$L(YqHr!USFZ>5fv z#20T-Yzn>gq}_fkkwUB|Qsk7YYDenjajhlgWlANzSW{_#u(YS}-klUjTbg$OHP&P% zo6hbZ+qYkAD~is0o7&7rXN@C{Ygl!>sB9XO?omTA^E9Wi6|Tl0Hnwp%L7Scu<1X{0 zV`=#a8L{nB!@+OLkvw)Yw4Y68?#U(Tg0^CL)|i~BRFujL*G6pE>CnkNbiTYL9Y;6c z(xf{Xz6T52bXL$&p-4PiY3^@^QaL$o8BWzB^JEEmo{F8jyE!;I!`1LyQ1{7)juaj` z#Ia>dg>}Krv(F>t!8Y*G+_OjvV^Yz0qR59jLhe$pmD*^v&W zJqKOopvVMwVl=s*`=Km*Jk-%BAXndJ68=>!mArb27b_$Dgi-FlpDs!&EF!VOdD#%7 z!Ulv>$sA7j(QJn2Aw|bE1*x9bd`7rT9h&?Mqk;iex)jkl?N{XUhRf7_mCVrdFkino zwK@-0|53U_=f^C!h*f$VF)_z>y?l@EYCmV&)gnEV9tVrju=xQQmG4EZeBQBV@j$<( z>v)eo{$11>wXMm8DpS2ez2Ei82Uw&7=0-r*4&vSDmKF=$7-8q=MI&CT zh6V??FZ7;={hFK=6*Bi`-JFq=R>^piaJhntZwwOWMhyPutW zIvC-#-(n1~g%Id5PLC0NAs!Al#hPHG5Bd1TQ3Q4h3Me12H{2ZL{DVS&aJV_d;UO-+ z75g1%Es06%JFqLSDVX;m&%6I_h=u=}^pz=cWMclcTuE zKF<%Q3V4CyAL!$+{kME(fQ`ffZye+vQ$Bta0~{IhVD#UZ8^@ZPzk}4Vg{^Fx8w~#B z)myUM#=>d9n&!dCf21MS4%5Mri~)}EdU1Fm$hJA55YI5z7Z3EKKcN2!cIPtczcsYY z35RL_;+p~TFX^!TfyQ2e{-%cX%%f%J5nmz%I<^XuzofCZPAeGv9|9)+xa5aM_@`s* 
z0{E0{FmA=r?=se80i=M=qHMgU)b_!{px4Ae!&la^|PE`z^@a3jPGUq zD=@4F-O)ro$o|=_1c#`fxqXs%f-$N;hhIyun;lJ~it;L=exQd%$4~I}5OqTM{)etp z)W6^l2Kxa;359r(AC&XV1UrZPy%i7jJB;@qn@7HppQsaV34M@9eBj5fO#D25#2*F% zDL{UqZ$pm6L8tt`0r*}b6O8=A{(amqAU3bD3vu?;9P5Js>)V2jp6F`7Blkrr{BZ-+$2_w37UoMBdaz?3Sl*zPn6t^x?`}w5E*FJn^XdbUB#?bt zKl2`kn2bpKME76wHw#Fn(Ib&9|3$y=A)mVCe<o8x z3Xk%te$T3S)p$@ERHMotp)W^H)koP;qpFu0{r0GTy}Bp?HL3#2(Jx0RRS(6kMpYj* T`sFA-l}E*|MwM4JD!G3EVVCTx diff --git a/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin b/cxurxwta5vlfbwctjkkticzdokzzr73dnqi2s4asnb4ckdieiii5.cubin deleted file mode 100644 index 4f37daa42cedb56958717d999109c3a635cd248f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 9528 zcmeGiTWlOx^~}uL*>!BMotCtQJ|=AzToJ}Q>(?eJt=sg4NYv045Ut8EyF2S0*qxos zj-7R!w3~nmLa3F150Ln%`b8w*1N8?9p?y_(ZgibRw!oUsdp5@nFtGv)~N|xo> zp}Xi<1v@NOEUzBatg18bhPbrCz;->?o43qtBxh?98WlTmN*{=fz-c(aA`0WS%~$IM zyNa&5A+xnu_Zp$?g;NwiqvRA8=B;vI*Bl8{T@TlRQx%&LY>Hq3XY^e##BYSlYQx1=q1|vNO z?FwRc#jpvn{v_TC^yhcPcKGr^#A`6xga0z(l^A}zp!h1IPxsK*1-+z`5|1%@X~i1b zw;8@~4Ix%9GJJmz{_hMg?}BUFUs1{afi;9!Eg*i!$Xa|_!h~Ni`s;e=+cr$-7$NL`8Z2y@)zL0d%~>%6y3^9?dX*6Rm){})-L8M*U{3TF+{pTON(*Mhk0i@7%J`##Hs;pjqdqrDPN0iH& zs-6X9MQb9kdSUgg)weZG%_`bcs*c#Fak+}?*Fm{>;hncN1+kYetX#N+L@h54W$Fe$!xaTPr$7go^~T~W^} z=m$l}SagDhTle(5F^kVbb};XRdZT9T&Dvntr4n}7hShM7IrYvq@_fGH8>2GFskDjMFkADI;n?>9d?q^Tz|97&e8$V&rzOTGI$;+Yq zy<8vZ%}?T=$?^Fq)Pp{{4YyErbi?t&U`gL>uNw!$xAA#NHx_25bi)q=y(b=)O^GX-I8wR@~}+cJeDiI zW$3Alf9LF7c=i~ofG(vn!JWCuH5A%~sM4Le9O7X2K?`4syrNaW7a%XYk6K2x#E*ed z5VR(qx+k_;_)qR%NAS$n1)C_Cyuh0dnP00f8VlTQ>`$3uXuBn&iJmqpRYSxB8&)R3 z-P!d_Li|>J&8_PzYTc^fNfaFS5W8Xq%Ygi|y^?g##HN`EaL>%G>mlxJ)sQJNq#i^X zu^ndu7FIJ$v35e=sD?&^CBkRBCSG&X*?b=N3uQ;_1txZPmW&+UHq8B)ra!di=%rYVjKRpY2ziYzJRSlmeLlu#$@ zpEyf&3UJDigHaMt;vPNQMBH4)H+4KZ;+2F-ZJ8A6CotTMU;nh8)hTf6sOG+)iCwB$ z2udciy_CcF2-RpA+JbH}-NnGML&vXI-QtpgZ#Tsd@A&v-;9zdwK4;M_hE~;XgzP+X zF4O4p0<*B57YEG_0VJTK%pnVvnImGoiOQGVi>!9LG&hk zL9QNOFlL+gWis+*Cguy)U#Bm0@~|XW$@ls~CvX$a5L=HoGRIx@^`B*CD)MG;%o}z_ zJ5s|>Vaeof_2h<5;%59owjO`*6m32`(~&=BF76OE0iK^SZ~eT;$m;UsY;N|z^gRcV zqH!b$4co^LMNOj|9y25{Z(NjP>K78rV!_TOHWIq6h`R^12^9N4K1&0cO75ajUZ_?j zXO1)RZbI#jI*pK;W9^Q|nT5GrvyH%$+w6#N8rI{ePw~!z$2Znl;tgdF>oYU@38D?0 zFuEO@`_^vnj#r|);|Y9NZaAzxq{PPryHE(6MSP0r9w&N8**Dc*!fwV73hp(aW*5nZ z)N%?Pfy6`-hH$=~)}wq)x)VEM>)8af2zSFEZQ-}%EWo8*5!?sk04t*roB+LJUxOPv zVD?%V<5wWrvCov?zIGVj8CtIW7Jv=6{I!J)FsR8uSiK{>yQ*Sajp1Ov3kmb>MR1UoY<4K#tV2+445ya++wdJ+(x~ zl6{-Fdnbkw_fIubFh{gBUQNUIo2e9i?MeRa*)*IVcu_(TMZU!DukV z3~uLf|9l!g-UOAIM~Ojru!-p`0QiMV8rQG3;afs}VHo*&zT4x^Z6o_h-&w2&rVrtf zAWiaH?X&G|@LC!&&Hi)(5&bIad+lL3d3vy))xwxD2rmp^JfS0=YB0(X6eNsarkP;; z3F1GOR(^F1zh#{g_Fz0SO?8Omt89M)U#kVvnnBS(+|ABt87s&r{g109l z__j2tSZ^4(MbFiKA2d$^z z-W3hb-H+n^9LvA{BewtfHJI&l80{zhtbj&P-|k`1 zgFY62Mt|m3gaP9X&bn7>1JARimpkuId+nI%wZuYU|16XyGnJdf0| z-b}-TOYZ^on|Aw~8oZQ-8>4+>`VhjFb>NHl-D5rQ*yH%)0$Uz_tW zt{+9~e{82KJ(9JK}tRE4G#o?WNv0K10N9-OHBR;~{{#*=4%TVyYDDcEjw}Ni|AxsR~ z)T4j=2F|GSDXJ04FY`#hml1AG(8=i`!7uYn`sscn`6T{_8}NT^9eygQZvNkIz~2=7 zSz!?AXU}s2miph_fd8lK=qEqB_1}#A_^;;-yd?N3=J6rHM_TDA;O9=%@YoXx+piF-2RXD?Y9uKsI&r|D>ByFm?0a!RrU^jSI!K2)z|PnIEY~&N5$ej_YI3OR{hzgP3KW`J^4Ft?Z9FUEINKf{9Y$s7;wM|i_P-uCm0HI|sC-%a6cR9N@ z$8n;VfTC1Yp|pLV>I-cj3hGOx@=!!oMdBh<^?}Fs&2kY$A+1aT1gUA=@Be4!?Afzx zNK@3R5+ixe{I~fp^UpuC=lJ-+LthVvLiwwOY+>&%xOyY_EzI(El)mLLK>j@`ICgTPq6JkFY3S)3~!cFGI3 zFeT8aH!&t%0l3_UXfBm{^irqX=}4h1@MXkwP{6~ryKbrLcKv#lQS3{fnOK}^xzU(& z%I$d$e%onvTixY0%qCi^+3A*>-OOi2StiL9_y14Hq_gP$1$)749X{N#yE1Q8K59C$ ztW?6HX!!nn4&GCe{H08(+^Cm3!{hZuZptn6DYP*@sMgWX5znMyply1Gk@Y(BZVYlMeF5PtW-EE1u0MPG!`QZ_oK 
zc>jWDShUfHLVr(gLmvW@2<_=EwD)vdPIbQCtSweCD$SKUtyc5VdU=oYm=IOxv2LMT zd35eEyV|KN7VGtVcm8laZ+CJFPOCcCXieqpDXZ|9b>v8K&T;IDJ7>0+Md;OcfAq#x zzrl*d;ks1(T323avEZe6Z; z7UE@+A2(Q_d9MP@)iluUP@C^u`G zKbK*C5Us*KQ8{dD6}hZO*llF{WpbL7hmMxx&6Ov-5}e-WrxaL%ZOf5ixM#J^Mpr3o zXIWVe@|cln_Ty%UeQjr18__a~=97~eF*{k}^EQVyQUQX${s83=ei#WlEbJptOr0zu z0R#OJl2%hhi=cGo9WO)_QZ3;NUEYf+OfN!9FGP9isGQRBhbYf(O3_?Dg`FgN9{PQA zaZm|QOXtIr?+)Q-`sZQK36G7^C%~RF!9t9EGqIganCVx|o)aOLgKlp*^^!@;;A=6H zoq`d1xi#Iha0|M$6n<|(mj*-27IbMhv}{3_=0nSHy4)0bXxW0!tC)wDE$9dgRsdQQ zT`ZJqW}~H2+9_~?P)akMnX$zL4j_+(RS;t~lgU%!iG{_6`9!^DYnB>U0iFzNz_O}B zvHBX-DU`PC)V?929H9b~1VojLde9c3t)QzZ8P(3x>SD{vU7|^(Q?+O!Si|Gc;+YpF}PzQVwbEm!f>oq<~*pS}RSXWsqeC zkw$D`c*ezCRmU!O?RKkCuP&STeO{$EAmxU6D7Sa7y;QGuOO0};%O{AG%9@bsRV-8y zHIuGkC{Y6#t*ys{@Ui|`SZLs+k@pAlP_|DkSy3%l=j{^i+g)?1zy*5+4sepmWQCVb z!8nGqi5$xrcNrPuVM5lZwECS!^sm5aNDVlRCFYrW4(Yjj7%XUj2kWSv&RvPqoZFN_ z4wsQ>&*`GiX(S_@?%%aCQ0OUN9V&eO4a>*^s8n_R1j=!T2w5tkehOPcl-x9>v2XXY zc|^%i2h_05L@7*D68mK2mIkFrCAW{n^^SzBso|BxW6;`EC1Gu~ESpoyVnYoSAz3R6 z@ID~Q=4escaD$?peg(uQAj;NwqS%mwqVNSlMF0b$yj@ZBq{VdC7?hPO1&*LB+vCY1 z?+0b&Vj?U-SvJU%1$9tXE;r&5lx2fFS(L~@S(y+BOi-2$@??48d$+ryJZNBUF@p^A z-4l!KE8TWf0af?MBKtwqDJwu>D|3*@LJMtBEdmpKKZW{|vdBIc*~lW@?Rq!WBD+Ku z_V0=!=bRFRCe%&!GENn)_(PLUaSF9*Ww3x9^IuQVE4QX+>q#txg^6fi|Bc>R(ve(MpmCF>SrO_M{_MTmxhS8O>V|JFCRU?kceTG^b zbXpX>?(zqKB_qnc)gM<&h8!aOakW^Z6tvVatX9xsJ17}~zHtPVBpt%Na$|7_EoPwI zjnhXhlbRHCER;1Pe*^W?XqPhaE?h;2G9ccPAy>49GQdB{fC@R30g;moSp1<32%coH z3Ynn{2&81dRv5~Fh)Ra(88TAf7|t8WS9J_iAow($(gzEjxiiKwI$OY77{)v8=gnq3 z=Hf8snLg&M$7~H_#$@VH=c7O8aZe59&h~NVJ?`nD+}?$O=f*`s`8YF_JKr~V(c^}R zL&BFW^s!r#-Lbpg&*SXg-cNVCS)*Ue$LOn;%@?t_`aV;xR2+K=uLy$e*Z?bk!U zLTqgJV18lggF|AZkDn)jeB?g7bG~U)L~kAuKftT%HT*@#60D0~=7KRYZiwi%5pfO1 zV_}S~D9y%=-W}86pX&J!^}K{9T;oRXYW;7uZ`lI>wJq=`RzjOR0V%ybDaM!8+uz#& zHxObHHv?Y+%T3JtbnUvUU?50UlqDNE&@FT~CUixY1}7!vj0v7U(Bj zd^zR!i9o)Y5GM(Ix7@2!;*_!g{4vCvoAP(kO=3dI70mb6y<*ZGyV`>xA9KlDG~)k5 zN<29}GR8C9VkX3kN%3Snn)KlHuy}1*eOoyNncHE%J34v^q8tylAMfW0=6KTjL7FE3 zKl$sXN5LWE0e{l&4*JmI#gpPs8O8Aj|G1mpB=Iz@2g}13z&iezFP+ga^griBW5j>p zH1gw*snGVL$sGi5u05Ng^|g+s)5@F?${)m==}C$Y=C_U}wYeh%PrzQ*FX_<@|t!?uB<8I3D-x%T-q4f=aBK4nG32%y%Kv3FN zlE|-AN=2Og$NG?tFRO2Iv~ABM0VDdW7!$mU==1>$yadk^3~N^vyo>mWfE8ta zf{|~lewd;QkBE+Xl6K<>14G^~uip!JTVhtj@9=(zOaHLZIp5+5ipQwhFKPFh#3Mfb zw3|$<_F#(7HYtz5V0{5!BVi;e=9_f4#mS;D&$ox@Nl9Pg{fG5RywIBu>zz!T?t>GF zH4S4v%iR%w1TZ(im~T|h*JXVt$Hg{^aC97ne*Gz*K3+e_kH$|+X{wi057hdi9`O9B zxPfqUd!6)M7pL%ZgMX85n)93eL4H{7_3^bHJ!-k*@mbn%)R;fP`7QlVjh(FHb07Ml zuB8ga@@HZK@!CmrhSy`_p6>$Swi*#C1$6wI?!T(7&+*@DNyOYZz?Af<| ztGH*mf4;kyF9|w7H#eV(h|?)?gGUG7{{Yt3eewJv0{@K&P^qB$8f>!)kHk(hnrGAIk@?G2hD-tRHVm`v>$tujCD%JPoyey^d!TZkC9?_FE;d zQDi^q*K&IO_d!3+Ig3Ak)AE{+&`tet)Bh7h_EY|^YPxzJ$CKv#Y5>1Fp%A20HHsgd_5St}0FtNq{mqm2BQYjh z{cRlx!hOY$#8}H`e4Be%{_xoke*J&7foa4IN}+!0_z{|1yAnArE&7LXK%tJee){#1 Yt=g}ESPxY3_RA4!D>)q>{nT>*0?5{&i~s-t diff --git a/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin b/cypnonjrppp5umygzc6ki342jaz36lphtfa4cmfci42ahak2v7dj.cubin deleted file mode 100644 index cf975523f82abbc0139fa3d9c7423217617869cb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13832 zcmeHOYiwKBeLv)-MCxIQlAlWAn)amDfdh&!kyPx)s_l3PU?ahTVOz6|p+xH8Q6zIo zIajeg%UKX~K-;X_fHv5dBA?m;!yX+F^uw}rZL^@juy)&rZl3}pABw~s6x&;ZHLdph z{m;3Vmk%YDmkk(}3;5o1{_ofMpL2ETs}G+$8w>=}VWqF2dR1+hF&~>5JxL_=^Ygnt zujW)<_2Ir-1(b?6mK#mGS)9kneXg(SxjYMvN}xpEEnx^X$O>cxly(+f+eBicDZtA27$16XOyM# z{Mu4s(Jro*Ra{RhD^-lCt7Bel15_^-TKGbv*l1{`ZQw=3w41?$)tgSC=`{U%jZxxD zo0&+?La|a=vWxX)34YzKHEYfFI?N`nR&6wk)n?+OqO6PLuJ8Ysipi~_$5+d%@!H~I 
zqufM2Ay}p!T|BC+P(V`L`QzswK3~xEg+!rPSt&Mp$Lo{Y)Jy0?YV!`d$j4N=Ddfmf zqg|Q$CLo7>(2yDb7|+<($9x#y5N)&bDgtIHm%D|n3cRlcFrz;fcz+lCO@X`JyoKHy zr2N4az>JcBk4Ia6Z5Wcrgnqn>ehb&6fpW|Xc3 z2DrLH=b%jUhN6bH0zZPPj#(h^vynmE1AwT&baS;n-K^OQ%PZB=+5$#}rDCI2t6o?s zPS=Y{E!2w_nw4gKsb)LQZ&%ivYt^+Sr?xb+R9n4Jzp%XMRF~@M#`64;xy8!U zsd#-|1^)3r|M2_$Z|qZQ0M_U$6tKnuH?6^7G+;dwOa!a}j@E-eRDm0}ZoIpB%X$`o zcW>VKcNtjy!B_yJSb%eV2Yo*|Nl0h{_h*7%?+aK5DXYfY7;bOhc<09L+mMYpwcNB8 zDeNB3{iB2jdrff&nkCX4z-{y2tN?XgB{YB==NK*o-vsSJJp`?2ULe;&kOzNgA-a#i z>7Zit+v5#@FCNO;9k6zh^E*oOM+3L4m#=dCuLAkYtBj6i6SAAP25*euO>2U~-{NhA zxBa|b;El$^OWf|GkxRVYt6>;Ta`Yr`;>~wO&#R(S7+2`?-wX7zD1Q|En>tt1WP6zIh^hiCk$l&B zRlvgpydt=N62LVXe~Mrxb#qgcY)W0+(8(?lE?~=(w+X^;iGY8Rxc-TZe4tTPx33E7ZIa!(DP?W_CdRiVz&C@xA4CCh^~5ih?FJ5Wv3xv-GhfkODmP<| z)xvC|sN%&^3EN?#&{%o49Lw^E8ONcpY*%Au_l(7{F)meO$!sh>pN}n7YV*ZPp@FUT zi6@U!_)24CzEY0G%hjg69{ccp;HZk%i`YwI@wIFw7Oyw$SX@i=)O>`5g76m-4}4sb zl0Hf4)SYQh-33qD$MfIwJ)`dDN%HAxbEOnZrt<0%r%MPY4&SkXsrrdj?xEubHV>Z$ zrtA~5i53Xz0V*S%Iidmx)fM@A+<(GC(`e5Y6cp0jNr0AMyagW0z6~XShhW%8CE-q zU;yEdg$^p=&;tWo@lG+U!UDK2u4#!Yu_?sC7oxOwRR<5DXoWQ`eiDyx`@rHM!EFIe zOF#)uLbH#YZrcKHmo82d-k!3w_$=^twc0|#rrITm%W2^0T1#_ATsDsTliZM^ttUN; zv(V%w*+puz5rSSwZ94+d3#lD|@LovqA`RiaklGOl?}5|{gVE`QL>Lid@b^MW9vKZH z)#_R$zP#d=SVlNm>G_%b>})!dm9hm*r54w*urpc!C{c!uwRsP&Yh7cJbP?@illW3W zTGoOuGZNg4#;3_xEcx|$rJ?Dd6SSF(Va1M7{$d zQ6VCk=)h=iiJ|&-!)Nr;fI81`iODl7rLt>&Nv}{b(-`CSapthZZqCOngd+=Wth$yn zpgSEO%Aw(Ul5;Tt$J-3@xy9}>7Sap~5cu@h}yb=+{xsz9t@A0At zx_O0U28MU?nh@zn8rW`LiO38r@8H$T$kU6`>gGjqr)=i3lUJPac-d0eVco5r5!O2g zS_|^yLotFbiq-5xQKpN6L_doDyQo|{itf26G(A5Tt#?r9dzL-EJ@z z*Ik4+3AQeST~Ubd%#ID7X&IdH8;lOpZZH<_-53nEE`!~50E4qTHaOcdIOjK*Cr*F; z@<4tU29vGJVDvP%Byu}8IOiFhO85*;p#!!X>=xe6YB&Y9E`!}QSOHq@1R@QHSb$PQ9MeM4bNbX01)SNy=TpWw(hf}lZ+$pxS#q{Y@xwE;mG_B4v ze!E>P&Baq^K9|p*NuQvj zUf058<*cZAAkL3rUJq}2MqX^?}!#%-@g{71TFQs9A_u)z_ zhK^2Nalzy5a4;v)H*dl`oR~i^%iHnig^RPXWfH2=%bxvYWPM&u0M+}z( zVeBED<_`iBsV#v-xuSY7*27fU|bfRLe1EomYf8EVd`L!(C0KGjc&T1$;8Ab zrtI9T9A9R(K^{wHH6ELAFXm(El;HUu)Dxe{Ydj*;6Q9j!JnZg?&nGmVA?jiLu~}`O zyruTSCzHlNESLGB9dE2x7h(gnw)H2DyjKfcyRBM~24xeQ>L}^F)Pl}lp04?X>0$su zdn)phYR#C3H?5b~yv`^%W_Wb)6;@sx$#2;NduPa@o@+*sO!cNAIYu;_ZYH@)!3|tzxqC(RV`Re#7 zs0)qhF*S$AedAn6esdDqKP*Ot)B&Y#A6D+%QhV@2lre8E->H1_fQq!yixBWzkKvo= zy;~}DUqr8lVM+5gbLqp#8R@n0yCmhYB(`n;RXxLFv+EagO(^7AZLSFxnMCcc; zylD7m$VW7&=i^RfNb=|52g3As%f;Jardn-%0D8!S{Qf!0AHN(D`F8&Rzk&+>dx8&Z zysh!h;E+_QCqEpfzx_r&B$+8-49N8!)N5d9Opwr6k-)A~3P z4gKSY4_1T6e{Et{M85h@#D~?(K{aL%QoY*x@r#oGtTQJ0BaHcmJwtOC*2lH|m=Eju z#SrmXXK3&~;N`;nB`4BRF9RL*GBCh;iFoxgFwkBvk+yo7w1?a36Z%Kyz+WG48vVP& zM@2bqZ;ghs&ff5e)?6W8W36~iNBf)+jom_0b-av!b&)sajPGvIkE&}c!>xGz53GkN zXU|^0zL&5bIB%^t{8A34$|dWquaEHydGYJ(YmZ;3Eq>R2V-%ed7xR6@!g{-R$X{b8ze1x%(e-`xWUa zzy8#JSd7_(q@>yU~# z!F|`hc;d4vup#R|5(!!-zx0j221|hJ?JIVukNQr&hJH(XoW1*Fn0x)(2}}RnFa4A2 zfA8Kmp>XRp^ee~g=<+t&Lpj5TT#&efbydj^+|sMe$v73 zE?j+BZ<9{HdHulQ?{Q~CPzZXXYzlRG}?9ueQpK70v*UcRE z9rO-zNCM^)&MGbu@VIFq(Q)f7|>8p<^N7uLXhc~8CFS}g( zVnS2w*%=Pm|6rsh?d&-U|~dLTUWrjB)@4{hJ>F zKh854ian?|^pn>9iu!+HT}i!YdDH5=dwx+`-Y>6zpm?6!+WK8fy*8>2cz6i?e=vua zpIo0r;D0Rw>^1L|O+wlJ? 
zoyDbJHuRYHPX7BfiaS)>_0$nPoitE4N8a9K;J>Dy^*J+lAI}^9KQaLF%Q@L4FwfVu z{ayMG{E{Y!8M;pWZXSOEJZ+jYbnYW>_}#gy-{o`je;oQhjNhdrKTE2U-`R%$F{9tl z&ywim|Eq2ImwWKj&QAUmU=V>r7da9{iizUT>Xh)3(>277*m?){=cRcwH_Uya9r-wjUT~x n%f~u!<4f88%Z<1D^y`zBuuWV%87AI-J^^jR=f=l| z&Q{_(^=>$D?A5XVqj|0{TMcJ3ay!0K+N}U`#0`@2!;6xx?8mtN_uT_(@n7lq9jm*!8T$!t1C&kPNXm|z!*Q@YN`K|bi&vb~ z-zht8yX9iN|Htm0M|w0L*X?EFwlipU;^zAqkTV&yAmu-zH-A*hSW&LHKrZ=S&x zWnj0;w+jM)A`jCy1g>_&ee=%+etaL0((izunA^{PsHcWq;XgOwKSDbzQI@$O?>D(G z&0HT2&rxx7eR!@YL?Xr)C!(Z)8msUs^psmr75E7tokCnS?QgTkan+C!X-i2bTuQo8 zz1a%9tvV#9;l|x=@U@n^)VoSl?_I6gcDULP8f$&ecemH-Z}hiUlBjEYp_{a-og|3b zD}Hl(d%NFk58}<0uUp}Ov|s-D`|mvQ7gSU# zc0VvQ==XN@+d6V{MYde6gV_SeL?c8q=Jh$vn8xKJqlE}Y%=&p;_6!f#-4E{HeSc3l z+~0fe{vO6NHT`=$2!fH-^P{-c4a{|G4JVNsHT=YkJI;l&OP1?-c+O)dZf*PKYBlw5 z#jRS~H!VL%qJjCaXX6}Mp^Ib0w6<2OrWGcUX*Js2n%g!l>=uo{tUNS^BO*BE#m5(` zl8Lo!KWYYchV^l0vrNzPsrE*Yv^=w7uhR~z4U`DywmG#BURwF=MG4iA;?zQP$*%50 z7$1;#X&n%qeaXRFLr`~WexntXU*MH$duipZnsi-RG{5q&cvU^>Z9{3;B!uR zwm6wiw`XmobEvE>nhW*BZFyE7iO0>hC1+Pzhb3Zrr3_1!a{(5Vi5VG!T^0lw?Th3F zo(g1F7-#EBc^#3u(b!@Oi?CN<%2<|#1@@H+q7jb2&}t`EEV2&Mh_GUz%4o>rphFrM zLB|vPvf)`px}Imna$Q7l537UaTsvqIVLdq5fVxBk4CYCpw zO9d0p4^y3lFvYwV?kSaweRwh)8+jlcuA)el)p0iVem2>P3W|r=lNA+lJUFO|iZaAW zt_)(X%W3pQ^T?ub84$oW4-Zhy1~_~o93k?P@#9i?VgF(F1CO7cYy3^&*Awj0Q}Wfy zt<@sGhb<%Du1&a#PmD>&Jn?bqlL>U(I_EI8GlhxZ2wW=C6#5e^&!@7$Ph{k^0u9;z z%nY0GzH*Z2!!u)DC!CDC^J873;sLuL!QXfp+)s_D@aZDGjN>lEuP`?wV)N<8KjV~1 zh2uryH<>vd@@j#jiN^05Ec`?Ef0z9O^kOow@!8JvQSG!8^7!Q=aK%&ojU8N&3waoq+xTuA|$mvweD6IOx!S{asq< z>(lJ7vwr7Gg_%B0)3=|du(oT^yCqs>UA+*24?CsyJ%i(2f2*L28J+&jOgcaLE-B}; z&@YPkMUMa5l6L)vGw?^T*uQDet+%Mi{K9E0^qtSsl~Yhe8yfW2XX(~}iZk#}?Irz+ zsxPsuSMz81 l-{tW-s`_NdVp-Ko_1W=6jgP2HK2DY==6nQC)kpP;_fJyYA>IH0 diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 90571b0751e..93bb438b8ee 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -57,6 +57,25 @@ if [[ -n "$MODEL_ARG" ]]; then echo "Model argument: $MODEL_ARG" fi +# Cleanup function to remove temporary files and directories +cleanup_temp_files() { + echo "Cleaning up temporary files and directories..." + + # Remove temporary files with specific extensions + rm -f *.cubin + rm -f *.pte + rm -f *.so + rm -f *kernel_metadata.json + rm -f *kernel.cpp + rm -f *wrapper_metadata.json + rm -f *wrapper.cpp + + echo "Cleanup completed." +} + +# Run cleanup at the start +cleanup_temp_files + # Function definitions for each step install_executorch() { echo "Installing executorch..." diff --git a/export_aoti.py b/export_aoti.py index d798654ffe0..229d6e567e3 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -44,7 +44,7 @@ def __init__(self): self.linear = nn.Linear(3, 5) def forward(self, x: torch.Tensor): - return self.linear(x).cpu() + return self.linear(x) class SingleConv2d(nn.Module): @@ -63,7 +63,7 @@ def __init__(self): super(Add, self).__init__() def forward(self, x: torch.Tensor, y: torch.Tensor): - return (x + y).cpu() + return x + y # Model registry mapping model names to their configurations @@ -132,7 +132,7 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"): # 2. 
diff --git a/export_aoti.py b/export_aoti.py
index d798654ffe0..229d6e567e3 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -44,7 +44,7 @@ def __init__(self):
         self.linear = nn.Linear(3, 5)
 
     def forward(self, x: torch.Tensor):
-        return self.linear(x).cpu()
+        return self.linear(x)
 
 
 class SingleConv2d(nn.Module):
@@ -63,7 +63,7 @@ def __init__(self):
         super(Add, self).__init__()
 
     def forward(self, x: torch.Tensor, y: torch.Tensor):
-        return (x + y).cpu()
+        return x + y
 
 
 # Model registry mapping model names to their configurations
@@ -132,7 +132,7 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"):
     # 2. to_edge: Make optimizations for Edge devices
     print("Step 2: Converting to Edge program...")
     edge_program = to_edge(aten_dialect)
-    print(edge_program.exported_program().graph)
+    edge_program.exported_program().graph.print_tabular()
 
     print("Step 3: Converting to backend...")
     edge_program = edge_program.to_backend(AotiPartitioner([]))
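Pieced together from the hunks above, the export pipeline in export_aoti.py has roughly this shape. The to_executorch/save steps and the AotiPartitioner import path are assumptions for illustration; only the steps shown in the diff context are confirmed:

import torch
from executorch.exir import to_edge
# Assumed import path for the partitioner this patch series introduces:
from executorch.backends.aoti import AotiPartitioner

def export_model(model, example_inputs, output_filename="aoti_model.pte"):
    # 1. torch.export: capture the ATen-dialect graph
    aten_dialect = torch.export.export(model, example_inputs)

    # 2. to_edge: make optimizations for Edge devices
    edge_program = to_edge(aten_dialect)
    edge_program.exported_program().graph.print_tabular()

    # 3. Delegate supported subgraphs to the AOTI backend
    edge_program = edge_program.to_backend(AotiPartitioner([]))

    # 4. Serialize to a .pte file for the ExecuTorch runtime (assumed step)
    executorch_program = edge_program.to_executorch()
    with open(output_filename, "wb") as f:
        f.write(executorch_program.buffer)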
From 9b9c28bc737bb5a6e10563c7a74fc7f1ed5132b0 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 20 Aug 2025 10:11:50 -0700
Subject: [PATCH 15/50] remove temp dir in test script

---
 ...l4fepetv42wg64xygsadkkb43zczod6.kernel.cpp |   6 +
 ...wg64xygsadkkb43zczod6.kernel_metadata.json |   1 +
 ...b7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin | Bin 0 -> 8968 bytes
 ...3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin | Bin 0 -> 13832 bytes
 ...rqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin | Bin 0 -> 11656 bytes
 ...qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp | 965 ++++++++++++++++++
 ...jkb2gr6xgxxo6t35umkq.wrapper_metadata.json |   1 +
 export_and_run_aoti.sh                        |  40 +-
 8 files changed, 999 insertions(+), 14 deletions(-)
 create mode 100644 c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp
 create mode 100644 c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
 create mode 100644 cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
 create mode 100644 ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin
 create mode 100644 cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin
 create mode 100644 cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp
 create mode 100644 cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json

diff --git a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp
new file mode 100644
index 00000000000..02ec4e5c2af
--- /dev/null
+++ b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp
@@ -0,0 +1,6 @@
+// Triton kernels are embedded as comments in /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o
+// Link cmd
+// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq/c5rhpvrttznyqa5pe725yxk3av45bswzgxcmk7tdg4j7yptcotin.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
new file mode 100644
index 00000000000..bd5d2c60334
--- /dev/null
+++ b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
@@ -0,0 +1 @@
+{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin b/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
new file mode 100644
index 0000000000000000000000000000000000000000..000ca4c1209b77cdaec3c8757e532677b79ccc0f
GIT binary patch
literal 8968
[... base85-encoded binary payload omitted ...]
z`Ai0x6vsxC{25&w!y1kCr#tFLf-iz{cTOA%HH|){yW)jCVry1RhoVdV(zn3L8!w8f zaF0gz)9(?V>4(C7I_d98v%g93M{PB?<|F=0Y6OQboct=_OL6Henm2?q5@Rb8dR?2(87mIVOHSAc?(MKM6B=Y}~r)Ow?$1b2gF6L5{Kb(A#CcJ>p{F!ma z2PCeK{b#acB;VbBF-P?YmpTg=q1PEr{?>N`zIN5uX@;AzBl|Rlu)d%mN z<)!_s%rDNvNLM_o?A0b{Kb(J;3P;K_+>!iLD70SvFkjQjen#yF`hx3kit7dWPiqwa z2>Y8A&-r4UD};=^^&qWB_t7mv>#Y~XoxN)v`S@E-Jk!aMImxe2h-arI-ug5yxvejV z6VEdj-KHik9fW)~D-0!%^AEiFd~kfo7wVPs!SVbdC!XyHu4ifQpg7Tq+slb>cHS3s z7k0bf)5L|GxV6WJ-2dP@ivF|lGX(w{2;h|VpFc5&%W9rekJE@n^6^IRB)#WXjPZS? zclKlS4?rg&!d52LW7bd(NrvtnzPm8e9ZYQs{A5f-&#OO0)M*iGN!f$D=)a=${}|Jy zX;T0AF8WpVCl87*-V#gk(k}YzYNH|^)Jf!hx{MDSt=TG81f(G@hxR@S;eJuxU_y-WA7n6z+AV6L z&H=?w^}nge(LnE3Wb$^wM) zi|1Dg%XV?Ctm1l7t<*54uC95p15n#2wDE;TvC+^>JHU&GX)l8Zt2doO(`owk8l%LQ z4l|LQrDC$OI+SZgLeEy}t`?)v_JshHd)O`Qhzyk?{CJNP z4RCdZ&On*w4Mh!Y6@CO&9kW2-qp@M!1AwT&Y;&zX+g!JoRx7pA`6Y}BE5*k8dhJ}L zID2kIsiku(>GW!*oKDqO)3*KP3psmvv1xBCK3_by?p$2@@?3Q}y?&uuDnDP}sHE++ z`o&tjzNrE~*!ueS2jASM)DW!ES14dj1a4Tv!Dzs`5KIKDA&xeKKU9J1H?M!Nebc%K zzy~+3|GNyV!C)+aQ7pi@zJq>{oFpVPf%}EvHwFUMLCRX^Z34IVuD^f%)-A}!oLX*J z%M^AG=l)T`gT1ae1kDO*4&b)^Z&rZ1t`Zu+jdKi_f^UQNpdNx&G%u0sAjpG1v=H4V z;B-(i`knCxz!wi??G9MG$oU zycv%6@b-id{XiH>GCl+F^=<1RVo&q-eLU z0=)kkZ%27s#OX?lSTY-nFXm$_)%C?W^D7@0BELO|0c)8ZJH)Ef?4;)qTdJ%g`EPg(liN)(pI~LaxJvE;qp&TwHi0hB_VyrCVmbGKg&E2h*I1+ zp^CMVOP@*zJ&@_-3AMOfc2%U39^!Z`16k{~$CDD2Xi6nc_%Uf>3~RmvlMxK?$t*1{ zH|z0g(QZ3shtwJ&ok+jKsX1uWF@nqf7^Sje3-F9lW7YP2W?1bc zf&qj-7CNYeLk|pW$2-Na3Jc)AxTYnp#HJ7jUx?D$RUJHpq8-+>_(?p%?E{O41a|~9 zEdeDs3C%uox@`-*Q@S`!cxTGe;~GM)%CcJg`Lp?K#4MJoL}_dy4E!oNf*&RHi<74 zq-8DmG9$swXndNC#gbp2R~niQTCo*+S|z?2&`_!$A+aSQ(}j?@5<%yD9CK13MC7{= z5)~qni7t%xmKdtCq z=;alXIT+r}YeJ+SX<&PKB_eaMyo*;aBTp|%tCttaow1qAZeDT5<7G=>hjq7hMp*A0 zXf4Q(55)+&C|0u%MVT%N68$Lp@1k;@D7xpO(DeLVwBALbi2Nu9%%EiL_{?A#xF{5X zpNp%f&JFW+;~dB)E9jAf6&3NV}lRkS{)13xm zaot6DlVIyH*cFBN&fT%Wb8Uk&euL3LIt|9+y&HqU)?=`{4q$Nhjt$PX4bJ%u=84l^ zzdVrNg~4R&F&I6~Es5M68=UhDP9=N>r_cdA4R#Cf&T2RXwjP7sWFf-&J2p7qHaP7! z82e4nOAOyoX6y{^tZ+HsmY+ArkUCA%NcP%Bxmu4e>vy?1`nsHcEcr<8@yw$fpL#5n znoG~0mR?Uy?payC{nh2oMijBDb|bkT2~unYmB^OEZ#cRPDd=(ySWFx>16 zhI?Hbla;fg=7Bgrh8w@(;lA(V;ihRY-1dDK$&HrAUEhb%c8hJ?^nDn*You{S`QG}n z5@aSE!u)U%!(20(lx2?eS&4vV>pCrGB?MXFB&yE=NpKWl+-3FB>Lt}n1>Vd=Y^cap4j<$b2f1?5_=xD=MoU+d#lb#?8(W~F8+w& zav)5;KcVT^n|S5Kvz+86hxGemoD;(-&J>KxqEo0DJJXVrATUf_EE4*hW~9+g_cNK8 z_{@x*o0sFu+z!ZN$*jg>6Yj@+ES(ZO--G(%GkJ|iWcuQ>8I6bCeewB(#xq2Hj6XK7 z?UT3Ee)we47>MPvShVAf&Dv6Ih}O3M#F6)Efor#23wfypVpCluotIkB*~`;4pD?_r`;JlKYuKhKsI+^6yZr7NC{gqW%x4u%Q5z* z`4|#t4%$YeK16#5Qy#AS$cbHn07^(Eb`p_ww=OxK_whl?u**&F501$P+J6i!*Gl~R z;}!m_qAYI{L*@qp#l=Otyn#Quc3)R|Naa@+y@cJ2pVhqAfVJWhZ5X;VbVY?`Bl6Yp z3s4uD&|_*2P5Q>Uko@K(w0~5L3aJB1-8!t?xuy2thbR-?T)tEJ<^dJypcf(Fx1Pc` z&wE=cbYDcphLoz~Unn{%y5wUb2(77@3Jpip2*%MM#@P_1$;EG%9Q@oo|I<7_#uF)$ zi{A?UUsVt80Dp7`_zRZ;EfGMaN5<6DW=K38`&Oq0~F$ zYTDi^JO*Z zL_>!-|4`dE?clq(M?W68#~B&%zPG}jUv$FJOgf!5{)9#y6{S7(=e8~N?l|NPj!1-l z?#hdXKSDmDK|P;zA|sMN4?h&9zbzMUhnZ^C`WW<(2l;~wls|bnB=YUSA$|oF{PzSO z)_7avo#7FwQcr$3On(QBe8`Jv`iq8sbcFJhp`URgVcH)Q`X}Mf*a-a-ytZd}0n_>< z5)J*6h!0kS$A4{VS46)0PsK;ot3fqk4^zEr{rF|cf7Y3h{1L`{!=8}^4C|BHe#}Sp zd?Q4B))^VT4|usSf5nM()yq&hTtg|>w5+3f%Eoy!!PAvs$8<(1_l_vkQcv!fzJ4aI^uWj*T>N*aWUUVEUdSCNBs2` zbwQP=Y{#TC&u=)`A3lNsW6tHz7W4@d!n5m%x9dzd;R_lIOaDX zjQ1O}zz@s%llp}}*M=qsNw4iAe`LSMPyGDEuvf3N-`?Zb7o2f|Qyx9~&$avawDYx! 
z`PA;9Cx0=3{YMY##ofBA;R^+CJpFbz72(~*U_&KpNY$ZQg5vfUmj=u-{V+euYY1bVGr(QyjxMwzeV{`i~6KL0YB~F zcNeZcthZ@rQ1T`Ek*`U|vbev9KG37z4t|W(%N+RZWlQ9NBtmO9}VhL0_QJ> z4%_2F@7GK3j1L^P$AYFq$o@wDQ8r-m|0E#t#$#qK@+O1cubhzo!}dhb`_&fWL;kUZ zpMFnJ`ZYkXXYjDSJGh%;Uf&xR`4bBm8+&E{ehG8#-^*iy-@^qf_GtRu&vnix>t+u7 z4tfVUBmwg&XB5A;a`)Fm$p647^D7tdFK$LC8Na8(zb}rdNqa!*&E4;&oDf&CQmikX zkAWzr2G9Lg`cXZK{Q~}A&eNBwM|FRlb_NC(Fb})?sWTed_SMIXqw8Dx!<)0HmtC%X za(?(5Y>Vmz2w*%=Pt(pWsh?d&-b+&|L1L|P$^#wC`A1@mIKQI9D%Q@L4FwZx% z{XO~*{DLNk8M_}#gy-{o`je**eHiQlCoKTE2c-`RoxDWl)d z&ywio|LYz2SNrhO&Tjq`U=V>r7Q>DMPMVTZVQGEBVvd;;2m&yA1!bou@ZKN71C literal 0 HcmV?d00001 diff --git a/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin b/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin new file mode 100644 index 0000000000000000000000000000000000000000..bbc7d301593f72433c5b7626638d0e6f0f4a0813 GIT binary patch literal 11656 zcmeHNUu;`f89(-Qnm59nx<+SDBYiKg0b-XzH{#N z^|iB=G9e*$q?~)c|9{SRzI%Od9=q$vr^BI8s#nMn7QYtjYE75g0SWx#R(ZXD=zcLL zW<(F>Q4tbCuP@gdb|XKJm3wWjNSLp=`f{b<2)(kVmnw@@p*M<7LvPrnMzxZwRZF?W z)q1gzTc}pns^!&23D~5^eJyQZje z&aV`OE`f5Xf;HI+!1*>rWi8jD=j!=-T?%c3&m*Uu0v@i`aB>Z&;kT=t;#k_;#O5sI z%jG3IUt8wn*X(Me+Gy6`Hqok;dLv(HBtIz2vPiCb{$DC4w~FpvDX!?%#l?EDA?sGv zqf1AX<#N~*7ryuCT}N}0KbOqq%cXq1d%ixZO?iagr#7#li+o6>YhsTy5$o8gr-8ZI zhmEN9pKwiF9Pv?nOSH}4$OESdSGq;5FutdSP^<4Ven$uXMaH|`yhR=llm3AgLahwo zzd zY-Ct*X+VWV4NxfbkLUpA5HKP%-B_tjH>&o+a;Z{SUBD{0l&@E-m4{0CX$SlZPR+Dy z4?nWHP|Q>+=5lGZR4ixf*+=tBYpKOXvz{tEWLD?v)_it3W3A2G4Xdu#nj&=d>)-zE zjz43^`VqPuk&reLdPD0E4~Mj7I2pRCt%Y9{p*LQ;^4Gt;@!A`&N=hhvF%$wN5z+>T zb1i%cuv)f|pfK(c@Lbix#NH&ytE^h<4gXOP25i0IO%V!T>Os&y>@;srYKD|rdPGyw0xMR8)#ahX+KRNM6T_n)iS0l z8cXjac$Fp{)2rGySn+!Tz$2pizRe+MUnGWUn(n13Nz)f-GHCidO(v$-U)M^sK1LH~n$r7v_{lJ~h{{G^ zE82BD#Uy6*EFO09_ENEtsITN^lX;=%3kBQ<>bZL9(PF|%6FZ)UigqP|TOU2fBy`*n zmMRIuO6c=5iKTLNK3~q&agTZ6!Gol{R4>h!iwV70Y1qxg`!~S-La*h~L=*a|l}_lj zhMmx*M$gO#XvndAE_vX?(q#BFrOa!yZeE8>$|v)m#hwds-FZ;q<`Gl>lzk|hZlPcvsKOyL3!IodlEV{kWg$0TTq;$PcTmZc3v!?I zdBtmF4kR9U|Gjm023651Ze6Rm0devOU6ou}sjlg(G6W-gAThPj$d?Lw6K>a+%eo5A zm;uEmdKv`3$SerRlI|P?zwK%=lgtky_~#dk9^UZq2NJ1+qH4PqOp_=k=radJzEbe; zDObYCz_d>w?Fzs#$oMG8Bvi4&00l`RZl=S$zyn-OFE^BXHG|zVq1iAIPk<;rgz|1g zcGHA@BKk9Awvvh*`IBl&KR~wv1WKxHV1_PVE>fIT6L>2LiB`lZ_FIwLa*^`Vij1^V zq{oupPT|awqyfi#RyePOH?gZeJZF{gDfsKdQ{*<`({R^^=eQZ)ZkNNBrPJg+VS(TDRVrL#H6^gEGfqMz7fUhj6`W$;vpl*i-X=D#}5#`*b z!4VKuGU|fb;lttfWJ-Xn1)O7fZd?-CbV=$+xg-j{X2=;0hEAJHqVQc3QsXC)3oZ#E z_(@b!E(tFBNmNiSX||n2AuE!J!{V2sVsc4=xC}ZME{*z*Y&^&`a*LRf4slX>$8MT1f}SwbtT6 z`q=&)JWSwzC7(j{BSxE9@?s&suw2aH>As<_rMdB+Kmf9!5ui=GW2Y0RaEGJ7%*5@9 zWa2?W7O08)gGTnRBWOA`!Jx6ltf>`{tGk84h6Z@Bk8aTB^#skeO(kS=_nGm6&iI0+ zb1GKy1-)E#&s&;hs0kP|vW*f`fQMCbpXP*uBf!81FNJi=j8oD3&OGZ)LrI$Ix<$xv zcP2uX4wsL>TS$c2c2yZ}0z@#l+I=F@GEvev{q6o`WJ7{E&%Z9+qvfV@Ml~MjM8O!G zsDgp>M-8;CXUW!_Miv5%hx6i!lEx1F+_ zhzyJeW!WFkE{;j3EN7FpA}Grad9t8(%5t4%U_B_y4tcWZsGYLB1GJYxS$4>ir2?it zs2)KxY?2;+GD*~*?*^7(h4i>YRpyi69VhB6JumqwyrCpv(Bm2%Xx9pGz8_$1R{!MY@Lij z!4C`WbYLaFKtA-J=tm%t={|nzoPfQ^h+LEDNZhy1M);d!WcMJE@D_ZJ5O3ZrTv!uh zLTrwBa9H4ls1FX&i)0@^D*>z>!#nT2TOx8}pSXxu<(v463=gs`ewhqLZ6YBe1ASsA z*27_}t%&+~AN-5kg88@J1^iM>3_7Bh^ws7!HG<<#Sfh+Y-y{3v&xjy;Z$nlNg!-(Mc zVUOy;`)q{$oN!`WRFh%Kx6H?24KV9tcsNG&BlS;%4A;gOT0 z?Eco~g&6Tep5alsU#O4Kaj!lko%1aeJ?_Lcs^3AX5BPiLM0iV_1A@x55T*Kv^Zp)3 zePDlYb;jdgy6?XGRDFqGJ&I_~D1PMgiMrUu@srTy3j~k#Ar)LS8+VC)US~yqUyF&e zP4$hBitch0@F+g~VNLLUqQkp61uqkf>OQXE{lpJz>wXyZaM2H={w^sP{UL7mMd^gg z{l5qK>K%U;aGdgi{YSigbAW4c1jh9|-beXU^^g7NkJ9-XqVw@mYT8)SViUf}%2 zoq-{8ozD-~%V0Fo;*UUo{~X|29Ono5nsWNaDPLjUk7?-N8QtUZ|AzKI9^LQ4um|~p 
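The wrapper added next defines the C ABI that the linked aoti.so exports. A hypothetical ctypes sketch of driving it from Python, with the symbol name and signature taken from the generated code below (the library name comes from the link cmd earlier in this patch; error code 0 is assumed to be AOTI_RUNTIME_SUCCESS):

import ctypes

lib = ctypes.CDLL("./aoti.so")  # library name from the link cmd above
lib.AOTInductorModelContainerCreate.restype = ctypes.c_int32
lib.AOTInductorModelContainerCreate.argtypes = [
    ctypes.POINTER(ctypes.c_void_p),  # AOTInductorModelContainerHandle*
    ctypes.c_size_t,                  # num_models
    ctypes.c_bool,                    # is_cpu
    ctypes.c_char_p,                  # cubin_dir (may be NULL)
]

container = ctypes.c_void_p()
err = lib.AOTInductorModelContainerCreate(ctypes.byref(container), 1, False, None)
assert err == 0, "container creation failed"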
zIl#3z8VlT3jBdQ_aBuXVL3zNWSBWnFWH}S#$6NG> z?z1AncRQ zIiD;oU3r&}+oVD*^JK z>VMtGL(9kTM=YNtz&F;X6yPQtAnjkf6(ODmj^s(de|YkKB-Z4se{29jxKH?zSiAB` z-{J07Km2R~e)s>118T$`WzfC4`6D!ic0F=b2VQ=I23-T{>VjmCe0Adrig~cgw_lD> N+sV24aj&l2zW{T-tULe! literal 0 HcmV?d00001 diff --git a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp new file mode 100644 index 00000000000..e91fc32554a --- /dev/null +++ b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp @@ -0,0 +1,965 @@ + +#include +// Definition of AOTI runtime interface functions + +#include +#include + +#include +#include + +#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ + try { \ + __VA_ARGS__ \ + } catch (const std::exception& e) { \ + std::cerr << "Error: " << e.what() << '\n'; \ + return AOTI_RUNTIME_FAILURE; \ + } catch (...) { \ + std::cerr << "Unknown exception occurred.\n"; \ + return AOTI_RUNTIME_FAILURE; \ + } \ + return AOTI_RUNTIME_SUCCESS; + +#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ + do { \ + AOTI_RUNTIME_CHECK( \ + actual_size == expected_size, \ + "expected " + std::string(name) + " vector size to be " + \ + std::to_string(expected_size) + ", but got " + \ + std::to_string(actual_size)); \ + } while (0) + +// AOTInductor uses at::addmm_out, which doesn't supports +// arguments that requires gradient. For this reason, we +// enforce no_grad context for run APIs. +// +// A RAII, thread local (!) guard that enables or disables grad mode upon +// construction, and sets it back to the original value upon destruction. +struct AOTINoGradGuard { + AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(false); + } + AOTINoGradGuard(const AOTINoGradGuard&) = delete; + AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; + ~AOTINoGradGuard() { + aoti_torch_grad_mode_set_enabled(prev_mode); + } + AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; + AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; + bool prev_mode{aoti_torch_grad_mode_is_enabled()}; +}; + +extern "C" { + +AOTIRuntimeError AOTInductorModelContainerCreate( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + bool is_cpu, + const char* cubin_dir) { + return AOTInductorModelContainerCreateWithDevice( + container_handle, + num_models, + is_cpu ? 
"cpu" : "cuda", + cubin_dir); +} + +AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( + AOTInductorModelContainerHandle* container_handle, + size_t num_models, + const char* device_str, + const char* cubin_dir) { + if (num_models == 0) { + std::cerr << "Error: num_models must be positive, but got 0\n"; + return AOTI_RUNTIME_FAILURE; + } + CONVERT_EXCEPTION_TO_ERROR_CODE({ + std::optional cubin_dir_opt; + if (cubin_dir != nullptr) { + cubin_dir_opt.emplace(cubin_dir); + } + auto* container = new torch::aot_inductor::AOTInductorModelContainer( + num_models, std::string(device_str), cubin_dir_opt); + *container_handle = + reinterpret_cast(container); + }) +} + +AOTIRuntimeError AOTInductorModelContainerDelete( + AOTInductorModelContainerHandle container_handle) { + CONVERT_EXCEPTION_TO_ERROR_CODE({ + auto* container = + reinterpret_cast( + container_handle); + delete container; + }); +} + +AOTIRuntimeError AOTInductorModelContainerRun( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( + AOTInductorModelContainerHandle container_handle, + AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + size_t num_inputs, + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + size_t num_outputs, + AOTInductorStreamHandle stream_handle, + AOTIProxyExecutorHandle proxy_executor_handle) { + auto* container = + reinterpret_cast( + container_handle); + AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); + AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); + + auto stream = + reinterpret_cast(stream_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE({ + AOTINoGradGuard guard; + container->run_single_threaded( + input_handles, output_handles, stream, proxy_executor_handle); + }) +} + +AOTIRuntimeError AOTInductorModelContainerGetNumConstants( + AOTInductorModelContainerHandle container_handle, + size_t* num_constants) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *num_constants = container->num_constants(); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantName( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** name) { + auto* container = + reinterpret_cast( + container_handle); + CONVERT_EXCEPTION_TO_ERROR_CODE( + { *name = container->constant_name(idx); }) +} + +AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( + AOTInductorModelContainerHandle container_handle, + size_t idx, + const char** 
+
+AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    bool* from_folded) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetConstantType(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    int32_t* type) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetConstantDtype(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    int32_t* dtype) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *dtype = container->constant_dtype(idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize(
+    AOTInductorModelContainerHandle container_handle,
+    size_t idx,
+    size_t* data_size) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *data_size = container->constant_data_size(idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap(
+    AOTInductorModelContainerHandle container_handle,
+    AOTInductorConstantMapHandle constant_map_handle,
+    bool use_inactive) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  auto constants_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { const auto ret = container->extract_constants_map(use_inactive);
+        for (const auto& pair: ret) {
+          constants_map->emplace(pair.first, pair.second);
+        }
+      })
+}
+
+AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer(
+    AOTInductorModelContainerHandle container_handle,
+    AOTInductorConstantMapHandle constant_map_handle,
+    bool use_inactive,
+    bool validate_full_update) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    container->update_constant_buffer(
+        *input_map, use_inactive, validate_full_update, /* user_managed = */ true);
+  })
+}
+
+AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer(
+    AOTInductorModelContainerHandle container_handle,
+    AOTInductorConstantMapHandle constant_map_handle,
+    bool use_inactive,
+    bool validate_full_update) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    container->update_constant_buffer(
+        *input_map, use_inactive, validate_full_update);
+  })
+}
+
+AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
+    AOTInductorModelContainerHandle container_handle,
+    AOTInductorConstantMapHandle constant_map_handle) {
+  return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
+                                                       constant_map_handle,
+                                                       /*use_inactive*/ true,
+                                                       /*validate_full_update*/ true);
+}
+
+AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
+    AOTInductorModelContainerHandle container_handle) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    container->free_inactive_constant_buffer();
+  })
+}
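+
+// The constant-buffer entry points above support a double-buffered weight
+// update. A typical hot-reload sequence (illustration only; SwapConstantBuffer
+// is declared just below):
+//
+//   AOTInductorModelContainerUpdateInactiveConstantBuffer(h, new_weights);
+//   AOTInductorModelContainerSwapConstantBuffer(h);           // flip buffers
+//   AOTInductorModelContainerFreeInactiveConstantBuffer(h);   // drop old set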
+
+AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
+    AOTInductorModelContainerHandle container_handle,
+    bool use_inactive,
+    AOTInductorStreamHandle stream_handle,
+    AOTIProxyExecutorHandle proxy_executor_handle) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  auto stream =
+      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    AOTINoGradGuard guard;
+    container->run_const_fold(use_inactive, stream, proxy_executor_handle);
+  })
+}
+
+AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
+    AOTInductorModelContainerHandle container_handle) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    container->swap_constant_buffer();
+  })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* ret_num_inputs) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *ret_num_inputs = container->num_inputs(); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetInputName(
+    AOTInductorModelContainerHandle container_handle,
+    size_t input_idx,
+    const char** ret_input_names) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *ret_input_names = container->input_name(input_idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
+    AOTInductorModelContainerHandle container_handle,
+    size_t* ret_num_outputs) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *ret_num_outputs = container->num_outputs(); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetOutputName(
+    AOTInductorModelContainerHandle container_handle,
+    size_t output_idx,
+    const char** ret_output_names) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE(
+      { *ret_output_names = container->output_name(output_idx); })
+}
+
+AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
+    AOTInductorModelContainerHandle container_handle,
+    const char** in_spec,
+    const char** out_spec) {
+  auto* container =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
+          container_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    *in_spec = container->get_in_spec();
+    *out_spec = container->get_out_spec();
+  })
+}
+
+AOTIRuntimeError AOTInductorModelCreate(
+    AOTInductorModelHandle* model_handle,
+    AOTInductorConstantMapHandle constant_map_handle){
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
+    auto constant_array = std::make_shared<std::vector<torch::aot_inductor::ConstantHandle>>();
+    auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
+
+    auto model = new torch::aot_inductor::AOTInductorModel(
+        constant_map,
+        constant_array,
+        "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only used for CPU models
+        ""
+    );
+
+    if (input_map) {
+      for (auto const& kv : *input_map) {
+        constant_map->emplace(kv.first, kv.second);
+      }
+    } else {
+      model->load_constants();
+    }
+
+    *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
+  })}
+
+AOTIRuntimeError AOTInductorModelRun(
+    AOTInductorModelHandle model_handle,
+    AtenTensorHandle* input_handles,
+    AtenTensorHandle* output_handles) {
+  auto model =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    AOTINoGradGuard guard;
+    model->run_impl(
+        input_handles,
+        output_handles,
+        (torch::aot_inductor::DeviceStreamType) nullptr,
+        nullptr);
+  })
+}
+
+AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
+        model_handle);
+    delete model;
+  })}
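+
+// Container-less path (sketch): AOTInductorModelCreate is CPU-only, as its
+// hardcoded "cpu" device string above indicates. Passing a null constant map
+// makes the model load its own constants:
+//
+//   AOTInductorModelHandle m;
+//   AOTInductorModelCreate(&m, /*constant_map_handle=*/nullptr);
+//   AOTInductorModelRun(m, inputs, outputs);
+//   AOTInductorModelDelete(m);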
+
+AOTIRuntimeError AOTInductorModelGetNumOutputs(
+    AOTInductorModelHandle model_handle,
+    size_t* ret_num_outputs) {
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
+    *ret_num_outputs = model->num_outputs();
+  })
+}
+
+AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
+    AOTInductorModelHandle model_handle,
+    AOTInductorConstantMapHandle constant_map_handle) {
+  auto model =
+      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
+  CONVERT_EXCEPTION_TO_ERROR_CODE({
+    auto constant_map = std::make_shared<torch::aot_inductor::ConstantMap>();
+    auto input_map =
+        reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
+            constant_map_handle);
+
+    for (auto const& kv : *input_map) {
+      constant_map->emplace(kv.first, kv.second);
+    }
+    model->update_constants_map(std::move(constant_map));
+  })
+}
+
+} // extern "C"
+
+
+#define CUDA_DRIVER_CHECK(EXPR)                                  \
+do {                                                             \
+    CUresult code = EXPR;                                        \
+    const char *msg;                                             \
+    CUresult code_get_error = cuGetErrorString(code, &msg);      \
+    if (code_get_error != CUDA_SUCCESS) {                        \
+        throw std::runtime_error(                                \
+            std::string("CUDA driver error: ") +                 \
+            std::string("invalid error code!"));                 \
+    }                                                            \
+    if (code != CUDA_SUCCESS) {                                  \
+        throw std::runtime_error(                                \
+            std::string("CUDA driver error: ") +                 \
+            std::string(msg));                                   \
+    }                                                            \
+} while (0);
+
+static inline CUfunction loadKernel(
+    std::string filePath,
+    const std::string &funcName,
+    uint32_t sharedMemBytes,
+    const std::optional<std::string> &cubinDir = std::nullopt) {
+  if (cubinDir) {
+    std::filesystem::path p1{*cubinDir};
+    std::filesystem::path p2{filePath};
+    filePath = (p1 / p2.filename()).string();
+  }
+
+  CUmodule mod;
+  CUfunction func;
+  CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
+  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
+  if (sharedMemBytes > 0) {
+    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
+        func,
+        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+        sharedMemBytes
+    ))
+  }
+  return func;
+}
+
+static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
+  CUmodule mod;
+  CUfunction func;
+  CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start));
+  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
+  if (sharedMemBytes > 0) {
+    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
+        func,
+        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+        sharedMemBytes
+    ))
+  }
+  return func;
+}
+
+static inline void launchKernel(
+    CUfunction func,
+    uint32_t gridX,
+    uint32_t gridY,
+    uint32_t gridZ,
+    uint32_t numWarps,
+    uint32_t sharedMemBytes,
+    void* args[],
+    cudaStream_t stream) {
+  CUDA_DRIVER_CHECK(cuLaunchKernel(
+      func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
+  ));
+}
+CACHE_TORCH_DTYPE(float32);
+CACHE_TORCH_DEVICE(cuda);
+CACHE_TORCH_LAYOUT(strided);
+namespace torch::aot_inductor {
+namespace {
+class AOTInductorModelKernels : public AOTInductorModelKernelsBase {
+  public:
+    CUfunction triton_poi_fused_convolution_0{nullptr};
+    CUfunction triton_poi_fused_convolution_1{nullptr};
+    CUfunction triton_poi_fused_convolution_2{nullptr};
+};
+}  // namespace
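+
+// loadKernel/launchKernel above wrap the raw CUDA driver API: a .cubin file
+// (or embedded image) becomes a CUmodule, the kernel is resolved by name, its
+// dynamic shared-memory limit is raised when requested, and the launch uses a
+// 1-D block of 32*numWarps threads. For instance, numWarps=4 yields a
+// 128-thread block (illustration only):
+//
+//   CUfunction f = loadKernel("a.cubin", "my_kernel", /*sharedMemBytes=*/4352);
+//   void* args[] = {&dev_in, &dev_out};
+//   launchKernel(f, /*gridX=*/1, /*gridY=*/1, /*gridZ=*/1, /*numWarps=*/4,
+//                4352, args, stream);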
+
+
+AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
+                                   std::shared_ptr<std::vector<ConstantHandle>> constants_array,
+                                   const std::string& device_str,
+                                   std::optional<std::string> cubin_dir)
+    : AOTInductorModelBase(1,
+                           1,
+                           1,
+                           device_str,
+                           std::move(cubin_dir),
+                           true) {
+  inputs_info_[0].name = "arg2_1";
+  constants_info_[0].name = "conv_weight";
+  constants_info_[0].dtype = static_cast<int32_t>(cached_torch_dtype_float32);
+  constants_info_[0].offset = 0;
+  constants_info_[0].data_size = 540;
+  constants_info_[0].from_folded = false;
+  constants_info_[0].type = static_cast<int32_t>(torch::aot_inductor::ConstantType::Parameter);
+  constants_info_[0].shape = {5, 3, 3, 3};
+  constants_info_[0].stride = {27, 9, 3, 1};
+  constants_info_[0].layout = static_cast<int32_t>(cached_torch_layout_strided);
+  constants_info_[0].original_fqn = "conv.weight";
+  update_constants_map(std::move(constants_map));
+  update_constants_array(std::move(constants_array));
+  in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])";
+  out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])";
+  outputs_info_[0].name = "output0";
+  this->kernels_ = std::make_unique<AOTInductorModelKernels>();
+}
+
+std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
+    DeviceStreamType stream,
+    AOTIProxyExecutorHandle proxy_executor,
+    bool initialization
+) {
+
+  if (!initialization) {
+    std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: "
+              << "aot_inductor.use_runtime_constant_folding=False\n";
+  }
+  return {};
+}
+} // namespace torch::aot_inductor
+using namespace torch::aot_inductor;
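+
+// Sanity check on constants_info_[0] above: the conv weight has shape
+// {5, 3, 3, 3} = 135 fp32 elements, i.e. 135 * 4 = 540 bytes, which matches
+// data_size; {27, 9, 3, 1} are exactly the contiguous strides of that shape.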
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_0(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_0', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 12
+        xnumel = 64
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x2 = xindex
+        y3 = yindex
+        y0 = (yindex % 3)
+        y1 = yindex // 3
+        tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last')
+        tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (64 - 1)) / (64));
+    uint32_t grid_1 = ((ynumel + (16 - 1)) / (16));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_0 == nullptr) {
+        kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_);
+    }
+    CUdeviceptr var_0 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_1 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_2 = ynumel;
+    int var_3 = xnumel;
+    CUdeviceptr global_scratch_4 = 0;
+    void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4};
+    launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_);
+}
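+
+// The grid computation above is a ceiling division per axis: with XBLOCK=64
+// and xnumel=64, grid_0 = (64 + 63) / 64 = 1; with YBLOCK=16 and ynumel=12,
+// grid_1 = (12 + 15) / 16 = 1, so this kernel launches a single 2-D tile of
+// 4 warps (128 threads).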
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_1(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_1', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 15
+        xnumel = 9
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x2 = xindex
+        y3 = yindex
+        y0 = (yindex % 3)
+        y1 = yindex // 3
+        tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last')
+        tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (16 - 1)) / (16));
+    uint32_t grid_1 = ((ynumel + (16 - 1)) / (16));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_1 == nullptr) {
+        kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_);
+    }
+    CUdeviceptr var_5 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_6 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_7 = ynumel;
+    int var_8 = xnumel;
+    CUdeviceptr global_scratch_9 = 0;
+    void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9};
+    launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_);
+}
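+
+// Like kernel 0 for the activations, the kernel above appears to repack the
+// 5x3x3x3 weight into channels-last order for the convolution: it reads
+// in_ptr0[x2 + 9*y3] and writes out_ptr0[y0 + 3*x2 + 27*y1], moving the
+// channel index y0 innermost (matching the {27, 1, 9, 3} strides of buf1
+// in run_impl below).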
+
+template <typename in_ptr0_type_, typename out_ptr0_type_, typename kernels_type_>
+static inline void call_triton_poi_fused_convolution_2(
+    const in_ptr0_type_& in_ptr0,
+    const out_ptr0_type_& out_ptr0,
+    int64_t ynumel,
+    int64_t xnumel,
+    int32_t device_idx_,
+    cudaStream_t stream_,
+    kernels_type_& kernels_,
+    const std::optional<std::string>& cubin_dir_ = std::nullopt
+){
+    /*
+    async_compile.triton('triton_poi_fused_convolution_2', '''
+    import triton
+    import triton.language as tl
+
+    from torch._inductor.runtime import triton_helpers, triton_heuristics
+    from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+    from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
+    triton_helpers.set_driver_to_gpu()
+
+    @triton_heuristics.pointwise(
+        size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE,
+        filename=__file__,
+        triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]},
+        inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}},
+        min_elem_per_thread=0
+    )
+    @triton.jit
+    def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
+        ynumel = 20
+        xnumel = 64
+        yoffset = tl.program_id(1) * YBLOCK
+        yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
+        ymask = yindex < ynumel
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
+        xmask = xindex < xnumel
+        x2 = xindex
+        y0 = (yindex % 5)
+        y1 = yindex // 5
+        y3 = yindex
+        tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last')
+        tmp1 = y0
+        tmp2 = tl.full([1, 1], 2, tl.int64)
+        tmp3 = tmp1 < tmp2
+        tmp4 = tl.full([1, 1], 1, tl.int64)
+        tmp5 = tmp1 < tmp4
+        tmp6 = 0.1508762389421463
+        tmp7 = -0.15852206945419312
+        tmp8 = tl.where(tmp5, tmp6, tmp7)
+        tmp9 = tl.full([1, 1], 3, tl.int64)
+        tmp10 = tmp1 < tmp9
+        tmp11 = tl.full([1, 1], 4, tl.int64)
+        tmp12 = tmp1 < tmp11
+        tmp13 = -0.047068577259778976
+        tmp14 = 0.010523972101509571
+        tmp15 = tl.where(tmp12, tmp13, tmp14)
+        tmp16 = 0.07869197428226471
+        tmp17 = tl.where(tmp10, tmp16, tmp15)
+        tmp18 = tl.where(tmp3, tmp8, tmp17)
+        tmp19 = tmp0 + tmp18
+        tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask)
+    ''', device_str='cuda')
+    */
+    uint32_t grid_0 = ((xnumel + (32 - 1)) / (32));
+    uint32_t grid_1 = ((ynumel + (32 - 1)) / (32));
+    uint32_t grid_2 = 1;
+    if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return;
+    if (kernels_.triton_poi_fused_convolution_2 == nullptr) {
+        kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_);
+    }
+    CUdeviceptr var_10 = reinterpret_cast<CUdeviceptr>(in_ptr0.data_ptr());
+    CUdeviceptr var_11 = reinterpret_cast<CUdeviceptr>(out_ptr0.data_ptr());
+    int var_12 = ynumel;
+    int var_13 = xnumel;
+    CUdeviceptr global_scratch_14 = 0;
+    void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14};
+    launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_);
+}
+
+namespace torch::aot_inductor {
+
+void AOTInductorModel::_const_run_impl(
+    std::vector<AtenTensorHandle>& output_handles,
+    DeviceStreamType stream,
+    AOTIProxyExecutorHandle proxy_executor
+) {}
+
+AOTI_NOINLINE static void check_input_0(
+    AtenTensorHandle* input_handles
+) {
+    ConstantHandle arg2_1 = ConstantHandle(input_handles[0]);
+    int32_t arg2_1_dtype;
+    AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype));
+
+    int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32();
+    if (arg2_1_expected_dtype != arg2_1_dtype) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dtype, "
+           << "expected: " << arg2_1_expected_dtype << "(at::kFloat), "
+           << "but got: " << arg2_1_dtype << "\n";
+        throw std::runtime_error(ss.str());
+    }
+    auto arg2_1_size = arg2_1.sizes();
+
+    if (4 != arg2_1_size[0]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dim value at 0, "
+           << "expected: 4, " << "but got: " << arg2_1_size[0]
+           << "\n";
+        throw std::runtime_error(ss.str());
+    }
+
+    if (3 != arg2_1_size[1]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dim value at 1, "
+           << "expected: 3, " << "but got: " << arg2_1_size[1]
+           << "\n";
+        throw std::runtime_error(ss.str());
+    }
+
+    if (8 != arg2_1_size[2]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dim value at 2, "
+           << "expected: 8, " << "but got: " << arg2_1_size[2]
+           << "\n";
+        throw std::runtime_error(ss.str());
+    }
+
+    if (8 != arg2_1_size[3]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched dim value at 3, "
+           << "expected: 8, " << "but got: " << arg2_1_size[3]
+           << "\n";
+        throw std::runtime_error(ss.str());
+    }
+    auto arg2_1_stride = arg2_1.strides();
+
+    if (192 != arg2_1_stride[0]) {
+        std::stringstream ss;
+        ss << "input_handles[0]: unmatched stride value at 
0, " + << "expected: 192, " << "but got: " << arg2_1_stride[0] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (64 != arg2_1_stride[1]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 1, " + << "expected: 64, " << "but got: " << arg2_1_stride[1] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (8 != arg2_1_stride[2]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 2, " + << "expected: 8, " << "but got: " << arg2_1_stride[2] + << "\n"; + throw std::runtime_error(ss.str()); + } + + if (1 != arg2_1_stride[3]) { + std::stringstream ss; + ss << "input_handles[0]: unmatched stride value at 3, " + << "expected: 1, " << "but got: " << arg2_1_stride[3] + << "\n"; + throw std::runtime_error(ss.str()); + } + int32_t arg2_1_device_type; + AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); + + int32_t arg2_1_expected_device_type = 1; + if (arg2_1_expected_device_type != arg2_1_device_type) { + std::stringstream ss; + ss << "input_handles[0]: unmatched device type, " + << "expected: " << arg2_1_expected_device_type << "1(cuda), " + << "but got: " << arg2_1_device_type << "\n"; + throw std::runtime_error(ss.str()); + } +} + +static bool _check_aoti_runtime_check_inputs_env() { + const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); + const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; + return result; +} + +AOTI_NOINLINE static void __check_inputs_outputs( + AtenTensorHandle* input_handles, + AtenTensorHandle* output_handles) { + if (!_check_aoti_runtime_check_inputs_env()){ + return; + } + check_input_0(input_handles); +} + +void AOTInductorModel::run_impl( + AtenTensorHandle* + input_handles, // array of input AtenTensorHandle; handles + // are stolen; the array itself is borrowed + AtenTensorHandle* + output_handles, // array for writing output AtenTensorHandle; handles + // will be stolen by the caller; the array itself is + // borrowed + DeviceStreamType stream, + AOTIProxyExecutorHandle proxy_executor +) { + __check_inputs_outputs(input_handles, output_handles); + + auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); + auto arg2_1 = std::move(inputs[0]); + [[maybe_unused]] auto& conv_weight = constants_->at(0); + + if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { + AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit.");
+    AtenTensorHandle arg2_1_aligned;
+    aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned);
+    arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned));
+  }
+  inputs.clear();
+  [[maybe_unused]] auto& kernels = static_cast<AOTInductorModelKernels&>(*this->kernels_.get());
+
+  AOTICudaStreamGuard stream_guard(stream, this->device_idx_);
+  static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L};
+  static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L};
+  AtenTensorHandle buf0_handle;
+  AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle));
+  RAIIAtenTensorHandle buf0(buf0_handle);
+  // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+  call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_);
+  arg2_1.reset();
+  static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L};
+  static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L};
+  AtenTensorHandle buf1_handle;
+  AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle));
+  RAIIAtenTensorHandle buf1(buf1_handle);
+  // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+  call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_);
+  // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+  AtenTensorHandle buf2_handle;
+  AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, std::array<int64_t, 2>{1L, 1L}.cbegin(), 2, 0, std::array<int64_t, 2>{0L, 0L}.cbegin(), 2, 1L, &buf2_handle));
+  RAIIAtenTensorHandle buf2(buf2_handle);
+  buf0.reset();
+  buf1.reset();
+  static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L};
+  static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L};
+  AtenTensorHandle buf3_handle;
+  AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle));
+  RAIIAtenTensorHandle buf3(buf3_handle);
+  // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution]
+  call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_);
+  buf2.reset();
+  output_handles[0] = buf3.release();
+} // AOTInductorModel::run_impl
+} // namespace torch::aot_inductor
+
+
+
+
+// Compile cmd
+// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o +// Link cmd +// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq/c5rhpvrttznyqa5pe725yxk3av45bswzgxcmk7tdg4j7yptcotin.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json new file mode 100644 index 00000000000..bd5d2c60334 --- /dev/null +++ b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json @@ -0,0 +1 @@ +{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 93bb438b8ee..7aa4950c790 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -61,6 +61,17 @@ fi cleanup_temp_files() { echo "Cleaning up temporary files and directories..." 
+ # Remove temporary directories + for file in *wrapper.cpp; do + if [[ -f "$file" ]]; then + basename="${file%wrapper.cpp}" + if [[ -d "$basename" ]]; then + echo "Removing directory: $basename" + rm -rf "$basename" + fi + fi + done + # Remove temporary files with specific extensions rm -f *.cubin rm -f *.pte @@ -117,27 +128,28 @@ run_inference() { case "$MODE" in "reinstall_all") echo "Mode: reinstall_all - Full reinstall and run" - install_executorch # Line 1 - export_aoti_model # Line 2 - clean_install_executorch # Line 3 - build_runtime # Lines 6-16 - run_inference # Lines 17-18 + install_executorch + export_aoti_model + clean_install_executorch + build_runtime + run_inference ;; "reinstall_aot") - echo "Mode: reinstall_aot - Reinstall AOT components only" - install_executorch # Line 1 - export_aoti_model # Line 2 - run_inference # Lines 17-18 + echo "Mode: reinstall_aot - Reinstall AOT components and run e2e" + install_executorch + export_aoti_model + run_inference ;; "reinstall_runtime") - echo "Mode: reinstall_runtime - Rebuild runtime and run" - build_runtime # Lines 6-16 - run_inference # Lines 17-18 + echo "Mode: reinstall_runtime - Rebuild runtime and run e2e" + export_aoti_model + build_runtime + run_inference ;; "inference") echo "Mode: inference - Export model and run inference only" - export_aoti_model # Line 2 - run_inference # Lines 17-18 + export_aoti_model + run_inference ;; *) echo "Error: Unknown mode '$MODE'" From a7ae3b7350f187b9dde7bb2e0239da42dc06bbc7 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 20 Aug 2025 10:13:57 -0700 Subject: [PATCH 16/50] ignore aoti temp files --- .gitignore | 7 + ...l4fepetv42wg64xygsadkkb43zczod6.kernel.cpp | 6 - ...wg64xygsadkkb43zczod6.kernel_metadata.json | 1 - ...b7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin | Bin 8968 -> 0 bytes ...3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin | Bin 13832 -> 0 bytes ...rqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin | Bin 11656 -> 0 bytes ...qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp | 965 ------------------ ...jkb2gr6xgxxo6t35umkq.wrapper_metadata.json | 1 - 8 files changed, 7 insertions(+), 973 deletions(-) delete mode 100644 c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp delete mode 100644 c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json delete mode 100644 cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin delete mode 100644 ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin delete mode 100644 cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin delete mode 100644 cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp delete mode 100644 cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json diff --git a/.gitignore b/.gitignore index b166f8c9512..78268c70d8c 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,13 @@ tokenizer.json !test_bpe_tokenizer.bin !test_tiktoken_tokenizer.model +# AOTI temporary files +*.cubin +*kernel_metadata.json +*kernel.cpp +*wrapper_metadata.json +*wrapper.cpp + # Editor temporaries *.idea *.sw[a-z] diff --git a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp deleted file mode 100644 index 02ec4e5c2af..00000000000 --- a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp +++ /dev/null @@ -1,6 +0,0 @@ -// Triton kernels are embedded as comments in /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp - -// Compile cmd -// g++ 
/home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o
-// Link cmd
-// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq/c5rhpvrttznyqa5pe725yxk3av45bswzgxcmk7tdg4j7yptcotin.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs
diff --git a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json b/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
deleted file mode 100644
index bd5d2c60334..00000000000
--- a/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel_metadata.json
+++ /dev/null
@@ -1 +0,0 @@
-{"AOTI_DEVICE_KEY": "cuda"}
\ No newline at end of file
diff --git a/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin b/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin
deleted file mode 100644
index 000ca4c1209b77cdaec3c8757e532677b79ccc0f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 8968

diff --git a/ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin b/ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin
deleted file mode 100644
GIT binary patch
literal 0
HcmV?d00001

literal 13832
z82e4nOAOyoX6y{^tZ+HsmY+ArkUCA%NcP%Bxmu4e>vy?1`nsHcEcr<8@yw$fpL#5n znoG~0mR?Uy?payC{nh2oMijBDb|bkT2~unYmB^OEZ#cRPDd=(ySWFx>16 zhI?Hbla;fg=7Bgrh8w@(;lA(V;ihRY-1dDK$&HrAUEhb%c8hJ?^nDn*You{S`QG}n z5@aSE!u)U%!(20(lx2?eS&4vV>pCrGB?MXFB&yE=NpKWl+-3FB>Lt}n1>Vd=Y^cap4j<$b2f1?5_=xD=MoU+d#lb#?8(W~F8+w& zav)5;KcVT^n|S5Kvz+86hxGemoD;(-&J>KxqEo0DJJXVrATUf_EE4*hW~9+g_cNK8 z_{@x*o0sFu+z!ZN$*jg>6Yj@+ES(ZO--G(%GkJ|iWcuQ>8I6bCeewB(#xq2Hj6XK7 z?UT3Ee)we47>MPvShVAf&Dv6Ih}O3M#F6)Efor#23wfypVpCluotIkB*~`;4pD?_r`;JlKYuKhKsI+^6yZr7NC{gqW%x4u%Q5z* z`4|#t4%$YeK16#5Qy#AS$cbHn07^(Eb`p_ww=OxK_whl?u**&F501$P+J6i!*Gl~R z;}!m_qAYI{L*@qp#l=Otyn#Quc3)R|Naa@+y@cJ2pVhqAfVJWhZ5X;VbVY?`Bl6Yp z3s4uD&|_*2P5Q>Uko@K(w0~5L3aJB1-8!t?xuy2thbR-?T)tEJ<^dJypcf(Fx1Pc` z&wE=cbYDcphLoz~Unn{%y5wUb2(77@3Jpip2*%MM#@P_1$;EG%9Q@oo|I<7_#uF)$ zi{A?UUsVt80Dp7`_zRZ;EfGMaN5<6DW=K38`&Oq0~F$ zYTDi^JO*Z zL_>!-|4`dE?clq(M?W68#~B&%zPG}jUv$FJOgf!5{)9#y6{S7(=e8~N?l|NPj!1-l z?#hdXKSDmDK|P;zA|sMN4?h&9zbzMUhnZ^C`WW<(2l;~wls|bnB=YUSA$|oF{PzSO z)_7avo#7FwQcr$3On(QBe8`Jv`iq8sbcFJhp`URgVcH)Q`X}Mf*a-a-ytZd}0n_>< z5)J*6h!0kS$A4{VS46)0PsK;ot3fqk4^zEr{rF|cf7Y3h{1L`{!=8}^4C|BHe#}Sp zd?Q4B))^VT4|usSf5nM()yq&hTtg|>w5+3f%Eoy!!PAvs$8<(1_l_vkQcv!fzJ4aI^uWj*T>N*aWUUVEUdSCNBs2` zbwQP=Y{#TC&u=)`A3lNsW6tHz7W4@d!n5m%x9dzd;R_lIOaDX zjQ1O}zz@s%llp}}*M=qsNw4iAe`LSMPyGDEuvf3N-`?Zb7o2f|Qyx9~&$avawDYx! z`PA;9Cx0=3{YMY##ofBA;R^+CJpFbz72(~*U_&KpNY$ZQg5vfUmj=u-{V+euYY1bVGr(QyjxMwzeV{`i~6KL0YB~F zcNeZcthZ@rQ1T`Ek*`U|vbev9KG37z4t|W(%N+RZWlQ9NBtmO9}VhL0_QJ> z4%_2F@7GK3j1L^P$AYFq$o@wDQ8r-m|0E#t#$#qK@+O1cubhzo!}dhb`_&fWL;kUZ zpMFnJ`ZYkXXYjDSJGh%;Uf&xR`4bBm8+&E{ehG8#-^*iy-@^qf_GtRu&vnix>t+u7 z4tfVUBmwg&XB5A;a`)Fm$p647^D7tdFK$LC8Na8(zb}rdNqa!*&E4;&oDf&CQmikX zkAWzr2G9Lg`cXZK{Q~}A&eNBwM|FRlb_NC(Fb})?sWTed_SMIXqw8Dx!<)0HmtC%X za(?(5Y>Vmz2w*%=Pt(pWsh?d&-b+&|L1L|P$^#wC`A1@mIKQI9D%Q@L4FwZx% z{XO~*{DLNk8M_}#gy-{o`je**eHiQlCoKTE2c-`RoxDWl)d z&ywio|LYz2SNrhO&Tjq`U=V>r7Q>DMPMVTZVQGEBVvd;;2m&yA1!bou@ZKN71C diff --git a/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin b/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin deleted file mode 100644 index bbc7d301593f72433c5b7626638d0e6f0f4a0813..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11656 zcmeHNUu;`f89(-Qnm59nx<+SDBYiKg0b-XzH{#N z^|iB=G9e*$q?~)c|9{SRzI%Od9=q$vr^BI8s#nMn7QYtjYE75g0SWx#R(ZXD=zcLL zW<(F>Q4tbCuP@gdb|XKJm3wWjNSLp=`f{b<2)(kVmnw@@p*M<7LvPrnMzxZwRZF?W z)q1gzTc}pns^!&23D~5^eJyQZje z&aV`OE`f5Xf;HI+!1*>rWi8jD=j!=-T?%c3&m*Uu0v@i`aB>Z&;kT=t;#k_;#O5sI z%jG3IUt8wn*X(Me+Gy6`Hqok;dLv(HBtIz2vPiCb{$DC4w~FpvDX!?%#l?EDA?sGv zqf1AX<#N~*7ryuCT}N}0KbOqq%cXq1d%ixZO?iagr#7#li+o6>YhsTy5$o8gr-8ZI zhmEN9pKwiF9Pv?nOSH}4$OESdSGq;5FutdSP^<4Ven$uXMaH|`yhR=llm3AgLahwo zzd zY-Ct*X+VWV4NxfbkLUpA5HKP%-B_tjH>&o+a;Z{SUBD{0l&@E-m4{0CX$SlZPR+Dy z4?nWHP|Q>+=5lGZR4ixf*+=tBYpKOXvz{tEWLD?v)_it3W3A2G4Xdu#nj&=d>)-zE zjz43^`VqPuk&reLdPD0E4~Mj7I2pRCt%Y9{p*LQ;^4Gt;@!A`&N=hhvF%$wN5z+>T zb1i%cuv)f|pfK(c@Lbix#NH&ytE^h<4gXOP25i0IO%V!T>Os&y>@;srYKD|rdPGyw0xMR8)#ahX+KRNM6T_n)iS0l z8cXjac$Fp{)2rGySn+!Tz$2pizRe+MUnGWUn(n13Nz)f-GHCidO(v$-U)M^sK1LH~n$r7v_{lJ~h{{G^ zE82BD#Uy6*EFO09_ENEtsITN^lX;=%3kBQ<>bZL9(PF|%6FZ)UigqP|TOU2fBy`*n zmMRIuO6c=5iKTLNK3~q&agTZ6!Gol{R4>h!iwV70Y1qxg`!~S-La*h~L=*a|l}_lj zhMmx*M$gO#XvndAE_vX?(q#BFrOa!yZeE8>$|v)m#hwds-FZ;q<`Gl>lzk|hZlPcvsKOyL3!IodlEV{kWg$0TTq;$PcTmZc3v!?I zdBtmF4kR9U|Gjm023651Ze6Rm0devOU6ou}sjlg(G6W-gAThPj$d?Lw6K>a+%eo5A zm;uEmdKv`3$SerRlI|P?zwK%=lgtky_~#dk9^UZq2NJ1+qH4PqOp_=k=radJzEbe; zDObYCz_d>w?Fzs#$oMG8Bvi4&00l`RZl=S$zyn-OFE^BXHG|zVq1iAIPk<;rgz|1g 
zcGHA@BKk9Awvvh*`IBl&KR~wv1WKxHV1_PVE>fIT6L>2LiB`lZ_FIwLa*^`Vij1^V zq{oupPT|awqyfi#RyePOH?gZeJZF{gDfsKdQ{*<`({R^^=eQZ)ZkNNBrPJg+VS(TDRVrL#H6^gEGfqMz7fUhj6`W$;vpl*i-X=D#}5#`*b z!4VKuGU|fb;lttfWJ-Xn1)O7fZd?-CbV=$+xg-j{X2=;0hEAJHqVQc3QsXC)3oZ#E z_(@b!E(tFBNmNiSX||n2AuE!J!{V2sVsc4=xC}ZME{*z*Y&^&`a*LRf4slX>$8MT1f}SwbtT6 z`q=&)JWSwzC7(j{BSxE9@?s&suw2aH>As<_rMdB+Kmf9!5ui=GW2Y0RaEGJ7%*5@9 zWa2?W7O08)gGTnRBWOA`!Jx6ltf>`{tGk84h6Z@Bk8aTB^#skeO(kS=_nGm6&iI0+ zb1GKy1-)E#&s&;hs0kP|vW*f`fQMCbpXP*uBf!81FNJi=j8oD3&OGZ)LrI$Ix<$xv zcP2uX4wsL>TS$c2c2yZ}0z@#l+I=F@GEvev{q6o`WJ7{E&%Z9+qvfV@Ml~MjM8O!G zsDgp>M-8;CXUW!_Miv5%hx6i!lEx1F+_ zhzyJeW!WFkE{;j3EN7FpA}Grad9t8(%5t4%U_B_y4tcWZsGYLB1GJYxS$4>ir2?it zs2)KxY?2;+GD*~*?*^7(h4i>YRpyi69VhB6JumqwyrCpv(Bm2%Xx9pGz8_$1R{!MY@Lij z!4C`WbYLaFKtA-J=tm%t={|nzoPfQ^h+LEDNZhy1M);d!WcMJE@D_ZJ5O3ZrTv!uh zLTrwBa9H4ls1FX&i)0@^D*>z>!#nT2TOx8}pSXxu<(v463=gs`ewhqLZ6YBe1ASsA z*27_}t%&+~AN-5kg88@J1^iM>3_7Bh^ws7!HG<<#Sfh+Y-y{3v&xjy;Z$nlNg!-(Mc zVUOy;`)q{$oN!`WRFh%Kx6H?24KV9tcsNG&BlS;%4A;gOT0 z?Eco~g&6Tep5alsU#O4Kaj!lko%1aeJ?_Lcs^3AX5BPiLM0iV_1A@x55T*Kv^Zp)3 zePDlYb;jdgy6?XGRDFqGJ&I_~D1PMgiMrUu@srTy3j~k#Ar)LS8+VC)US~yqUyF&e zP4$hBitch0@F+g~VNLLUqQkp61uqkf>OQXE{lpJz>wXyZaM2H={w^sP{UL7mMd^gg z{l5qK>K%U;aGdgi{YSigbAW4c1jh9|-beXU^^g7NkJ9-XqVw@mYT8)SViUf}%2 zoq-{8ozD-~%V0Fo;*UUo{~X|29Ono5nsWNaDPLjUk7?-N8QtUZ|AzKI9^LQ4um|~p zIl#3z8VlT3jBdQ_aBuXVL3zNWSBWnFWH}S#$6NG> z?z1AncRQ zIiD;oU3r&}+oVD*^JK z>VMtGL(9kTM=YNtz&F;X6yPQtAnjkf6(ODmj^s(de|YkKB-Z4se{29jxKH?zSiAB` z-{J07Km2R~e)s>118T$`WzfC4`6D!ic0F=b2VQ=I23-T{>VjmCe0Adrig~cgw_lD> N+sV24aj&l2zW{T-tULe! diff --git a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp deleted file mode 100644 index e91fc32554a..00000000000 --- a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp +++ /dev/null @@ -1,965 +0,0 @@ - -#include -// Definition of AOTI runtime interface functions - -#include -#include - -#include -#include - -#define CONVERT_EXCEPTION_TO_ERROR_CODE(...) \ - try { \ - __VA_ARGS__ \ - } catch (const std::exception& e) { \ - std::cerr << "Error: " << e.what() << '\n'; \ - return AOTI_RUNTIME_FAILURE; \ - } catch (...) { \ - std::cerr << "Unknown exception occurred.\n"; \ - return AOTI_RUNTIME_FAILURE; \ - } \ - return AOTI_RUNTIME_SUCCESS; - -#define AOTI_VECTOR_SIZE_CHECK(actual_size, expected_size, name) \ - do { \ - AOTI_RUNTIME_CHECK( \ - actual_size == expected_size, \ - "expected " + std::string(name) + " vector size to be " + \ - std::to_string(expected_size) + ", but got " + \ - std::to_string(actual_size)); \ - } while (0) - -// AOTInductor uses at::addmm_out, which doesn't supports -// arguments that requires gradient. For this reason, we -// enforce no_grad context for run APIs. -// -// A RAII, thread local (!) guard that enables or disables grad mode upon -// construction, and sets it back to the original value upon destruction. 
-struct AOTINoGradGuard { - AOTINoGradGuard() { - aoti_torch_grad_mode_set_enabled(false); - } - AOTINoGradGuard(const AOTINoGradGuard&) = delete; - AOTINoGradGuard(AOTINoGradGuard&&) noexcept = delete; - ~AOTINoGradGuard() { - aoti_torch_grad_mode_set_enabled(prev_mode); - } - AOTINoGradGuard& operator=(const AOTINoGradGuard&) = delete; - AOTINoGradGuard& operator=(AOTINoGradGuard&&) noexcept = delete; - bool prev_mode{aoti_torch_grad_mode_is_enabled()}; -}; - -extern "C" { - -AOTIRuntimeError AOTInductorModelContainerCreate( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - bool is_cpu, - const char* cubin_dir) { - return AOTInductorModelContainerCreateWithDevice( - container_handle, - num_models, - is_cpu ? "cpu" : "cuda", - cubin_dir); -} - -AOTIRuntimeError AOTInductorModelContainerCreateWithDevice( - AOTInductorModelContainerHandle* container_handle, - size_t num_models, - const char* device_str, - const char* cubin_dir) { - if (num_models == 0) { - std::cerr << "Error: num_models must be positive, but got 0\n"; - return AOTI_RUNTIME_FAILURE; - } - CONVERT_EXCEPTION_TO_ERROR_CODE({ - std::optional cubin_dir_opt; - if (cubin_dir != nullptr) { - cubin_dir_opt.emplace(cubin_dir); - } - auto* container = new torch::aot_inductor::AOTInductorModelContainer( - num_models, std::string(device_str), cubin_dir_opt); - *container_handle = - reinterpret_cast(container); - }) -} - -AOTIRuntimeError AOTInductorModelContainerDelete( - AOTInductorModelContainerHandle container_handle) { - CONVERT_EXCEPTION_TO_ERROR_CODE({ - auto* container = - reinterpret_cast( - container_handle); - delete container; - }); -} - -AOTIRuntimeError AOTInductorModelContainerRun( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run( - input_handles, output_handles, stream, proxy_executor_handle); - }) -} - -AOTIRuntimeError AOTInductorModelContainerRunSingleThreaded( - AOTInductorModelContainerHandle container_handle, - AtenTensorHandle* input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - size_t num_inputs, - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - size_t num_outputs, - AOTInductorStreamHandle stream_handle, - AOTIProxyExecutorHandle proxy_executor_handle) { - auto* container = - reinterpret_cast( - container_handle); - AOTI_VECTOR_SIZE_CHECK(num_inputs, container->num_inputs(), "inputs"); - AOTI_VECTOR_SIZE_CHECK(num_outputs, container->num_outputs(), "outputs"); - - auto stream = - reinterpret_cast(stream_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - AOTINoGradGuard guard; - container->run_single_threaded( - input_handles, output_handles, stream, proxy_executor_handle); - 
}) -} - -AOTIRuntimeError AOTInductorModelContainerGetNumConstants( - AOTInductorModelContainerHandle container_handle, - size_t* num_constants) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *num_constants = container->num_constants(); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantName( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** name) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *name = container->constant_name(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantOriginalFQN( - AOTInductorModelContainerHandle container_handle, - size_t idx, - const char** original_fqn) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *original_fqn = container->constant_original_fqn(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantFromFolded( - AOTInductorModelContainerHandle container_handle, - size_t idx, - bool* from_folded) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *from_folded = container->constant_from_folded(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantType( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* type) { - auto* container = - reinterpret_cast(container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ *type = container->constant_type(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDtype( - AOTInductorModelContainerHandle container_handle, - size_t idx, - int32_t* dtype) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *dtype = container->constant_dtype(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerGetConstantDataSize( - AOTInductorModelContainerHandle container_handle, - size_t idx, - size_t* data_size) { - auto* container = - reinterpret_cast( - container_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { *data_size = container->constant_data_size(idx); }) -} - -AOTIRuntimeError AOTInductorModelContainerExtractConstantsMap( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive) { - auto* container = - reinterpret_cast( - container_handle); - auto constants_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE( - { const auto ret = container->extract_constants_map(use_inactive); - for (const auto& pair: ret) { - constants_map->emplace(pair.first, pair.second); - } - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateUserManagedConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - container->update_constant_buffer( - *input_map, use_inactive, validate_full_update, /* user_managed = */ true); - }) -} - -AOTIRuntimeError AOTInductorModelContainerUpdateConstantBuffer( - AOTInductorModelContainerHandle container_handle, - AOTInductorConstantMapHandle constant_map_handle, - bool use_inactive, - bool validate_full_update) { - auto* container = - reinterpret_cast( - container_handle); - auto input_map = reinterpret_cast*>(constant_map_handle); - CONVERT_EXCEPTION_TO_ERROR_CODE({ - 
-    container->update_constant_buffer(
-        *input_map, use_inactive, validate_full_update);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerUpdateInactiveConstantBuffer(
-    AOTInductorModelContainerHandle container_handle,
-    AOTInductorConstantMapHandle constant_map_handle) {
-  return AOTInductorModelContainerUpdateConstantBuffer(container_handle,
-          constant_map_handle,
-          /*use_inactive*/ true,
-          /*validate_full_update*/ true);
-}
-
-AOTIRuntimeError AOTInductorModelContainerFreeInactiveConstantBuffer(
-    AOTInductorModelContainerHandle container_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->free_inactive_constant_buffer();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerRunConstantFolding(
-    AOTInductorModelContainerHandle container_handle,
-    bool use_inactive,
-    AOTInductorStreamHandle stream_handle,
-    AOTIProxyExecutorHandle proxy_executor_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  auto stream =
-      reinterpret_cast<torch::aot_inductor::DeviceStreamType>(stream_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    AOTINoGradGuard guard;
-    container->run_const_fold(use_inactive, stream, proxy_executor_handle);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerSwapConstantBuffer(
-    AOTInductorModelContainerHandle container_handle) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    container->swap_constant_buffer();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetNumInputs(
-    AOTInductorModelContainerHandle container_handle,
-    size_t* ret_num_inputs) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_num_inputs = container->num_inputs(); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetInputName(
-    AOTInductorModelContainerHandle container_handle,
-    size_t input_idx,
-    const char** ret_input_names) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_input_names = container->input_name(input_idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetNumOutputs(
-    AOTInductorModelContainerHandle container_handle,
-    size_t* ret_num_outputs) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_num_outputs = container->num_outputs(); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetOutputName(
-    AOTInductorModelContainerHandle container_handle,
-    size_t output_idx,
-    const char** ret_output_names) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE(
-      { *ret_output_names = container->output_name(output_idx); })
-}
-
-AOTIRuntimeError AOTInductorModelContainerGetCallSpec(
-    AOTInductorModelContainerHandle container_handle,
-    const char** in_spec,
-    const char** out_spec) {
-  auto* container =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModelContainer*>(
-          container_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    *in_spec = container->get_in_spec();
-    *out_spec = container->get_out_spec();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelCreate(
-    AOTInductorModelHandle* model_handle,
-    AOTInductorConstantMapHandle constant_map_handle){
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto constant_map = std::make_shared<ConstantMap>();
-    auto constant_array = std::make_shared<std::vector<ConstantHandle>>();
-    auto input_map = reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(constant_map_handle);
-
-    auto model = new torch::aot_inductor::AOTInductorModel(
-        constant_map,
-        constant_array,
-        "cpu", // device_str is hardcoded, as AOTInductorModelCreate is only use for CPU models
-        ""
-    );
-
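// If the caller handed in a constant map, AOTInductorModelCreate aliases
// those tensors; with a null map the model falls back to load_constants(),
// i.e. the weights packaged with the model itself (see the branch below).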
-  if (input_map) {
-    for (auto const& kv : *input_map) {
-      constant_map->emplace(kv.first, kv.second);
-    }
-  } else {
-    model->load_constants();
-  }
-
-  *model_handle = reinterpret_cast<AOTInductorModelHandle>(model);
-  })}
-
-AOTIRuntimeError AOTInductorModelRun(
-    AOTInductorModelHandle model_handle,
-    AtenTensorHandle* input_handles,
-    AtenTensorHandle* output_handles) {
-  auto model =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    AOTINoGradGuard guard;
-    model->run_impl(
-        input_handles,
-        output_handles,
-        (torch::aot_inductor::DeviceStreamType) nullptr,
-        nullptr);
-  })
-}
-
-AOTIRuntimeError AOTInductorModelDelete(AOTInductorModelHandle model_handle){
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(
-        model_handle);
-    delete model;
-  })}
-
-AOTIRuntimeError AOTInductorModelGetNumOutputs(
-    AOTInductorModelHandle model_handle,
-    size_t* ret_num_outputs) {
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-      auto model = reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-      *ret_num_outputs = model->num_outputs();
-  })
-}
-
-AOTIRuntimeError AOTInductorModelUpdateConstantsMap(
-    AOTInductorModelHandle model_handle,
-    AOTInductorConstantMapHandle constant_map_handle) {
-  auto model =
-      reinterpret_cast<torch::aot_inductor::AOTInductorModel*>(model_handle);
-  CONVERT_EXCEPTION_TO_ERROR_CODE({
-    auto constant_map = std::make_shared<ConstantMap>();
-    auto input_map =
-        reinterpret_cast<std::unordered_map<std::string, AtenTensorHandle>*>(
-            constant_map_handle);
-
-    for (auto const& kv : *input_map) {
-      constant_map->emplace(kv.first, kv.second);
-    }
-    model->update_constants_map(std::move(constant_map));
-  })
-}
-
-} // extern "C"
-
-
-#define CUDA_DRIVER_CHECK(EXPR) \
-do { \
-  CUresult code = EXPR; \
-  const char *msg; \
-  CUresult code_get_error = cuGetErrorString(code, &msg); \
-  if (code_get_error != CUDA_SUCCESS) { \
-    throw std::runtime_error( \
-        std::string("CUDA driver error: ") + \
-        std::string("invalid error code!")); \
-  } \
-  if (code != CUDA_SUCCESS) { \
-    throw std::runtime_error( \
-        std::string("CUDA driver error: ") + \
-        std::string(msg)); \
-  } \
-} while (0);
-
-static inline CUfunction loadKernel(
-    std::string filePath,
-    const std::string &funcName,
-    uint32_t sharedMemBytes,
-    const std::optional<std::string> &cubinDir = std::nullopt) {
-  if (cubinDir) {
-    std::filesystem::path p1{*cubinDir};
-    std::filesystem::path p2{filePath};
-    filePath = (p1 / p2.filename()).string();
-  }
-
-  CUmodule mod;
-  CUfunction func;
-  CUDA_DRIVER_CHECK(cuModuleLoad(&mod, filePath.c_str()));
-  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
-  if (sharedMemBytes > 0) {
-    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        sharedMemBytes
-    ))
-  }
-  return func;
-}
-
-static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
-  CUmodule mod;
-  CUfunction func;
-  CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start));
-  CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
-  if (sharedMemBytes > 0) {
-    CUDA_DRIVER_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        sharedMemBytes
-    ))
-  }
-  return func;
-}
-
-static inline void launchKernel(
-    CUfunction func,
-    uint32_t gridX,
-    uint32_t gridY,
-    uint32_t gridZ,
-    uint32_t numWarps,
-    uint32_t sharedMemBytes,
-    void* args[],
-    cudaStream_t stream) {
-  CUDA_DRIVER_CHECK(cuLaunchKernel(
-      func, gridX, gridY, gridZ, 32*numWarps, 1, 1, sharedMemBytes, stream, args, nullptr
-  ));
-}
-CACHE_TORCH_DTYPE(float32);
-CACHE_TORCH_DEVICE(cuda);
-CACHE_TORCH_LAYOUT(strided);
-namespace torch::aot_inductor {
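The generated launchers below follow a fixed pattern: load each compiled Triton cubin once through loadKernel, cache the CUfunction, compute a ceil-divided grid from the problem size and tile size, and hand 32*numWarps threads per block to launchKernel. A minimal sketch of that grid arithmetic, assuming the tile sizes quoted in the metadata of triton_poi_fused_convolution_0 below (YBLOCK=16, XBLOCK=64, 4 warps); ceil_div is a hypothetical helper:

#include <cstdint>
#include <cstdio>

static inline uint32_t ceil_div(uint32_t n, uint32_t tile) {
  return (n + tile - 1) / tile;  // same rounding as ((xnumel + (64 - 1)) / (64))
}

int main() {
  // The first conv kernel below runs over ynumel = 12, xnumel = 64:
  std::printf("grid = (%u, %u, 1), threads/block = %u\n",
              ceil_div(64, 64), ceil_div(12, 16), 4u * 32u);  // (1, 1, 1), 128
  return 0;
}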
-namespace { -class AOTInductorModelKernels : public AOTInductorModelKernelsBase { - public: - CUfunction triton_poi_fused_convolution_0{nullptr}; - CUfunction triton_poi_fused_convolution_1{nullptr}; - CUfunction triton_poi_fused_convolution_2{nullptr}; -}; -} // namespace - - - -AOTInductorModel::AOTInductorModel(std::shared_ptr constants_map, - std::shared_ptr> constants_array, - const std::string& device_str, - std::optional cubin_dir) - : AOTInductorModelBase(1, - 1, - 1, - device_str, - std::move(cubin_dir), - true) { - inputs_info_[0].name = "arg2_1"; - constants_info_[0].name = "conv_weight"; - constants_info_[0].dtype = static_cast(cached_torch_dtype_float32); - constants_info_[0].offset = 0; - constants_info_[0].data_size = 540; - constants_info_[0].from_folded = false; - constants_info_[0].type = static_cast(torch::aot_inductor::ConstantType::Parameter); - constants_info_[0].shape = {5, 3, 3, 3}; - constants_info_[0].stride = {27, 9, 3, 1}; - constants_info_[0].layout = static_cast(cached_torch_layout_strided); - constants_info_[0].original_fqn = "conv.weight"; - update_constants_map(std::move(constants_map)); - update_constants_array(std::move(constants_array)); - in_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}, {"type": "builtins.dict", "context": "[]", "children_spec": []}]}])"; - out_spec_ = R"([1, {"type": "builtins.tuple", "context": "null", "children_spec": [{"type": null, "context": null, "children_spec": []}]}])"; - outputs_info_[0].name = "output0"; - this->kernels_ = std::make_unique(); -} - -std::unordered_map AOTInductorModel::const_run_impl( - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor, - bool initialization -) { - - if (!initialization) { - std::cerr << "[WARNING] Calling constant_folding in model, but compiled with config: " - << "aot_inductor.use_runtime_constant_folding=False\n"; - } - return {}; -} -} // namespace torch::aot_inductor -using namespace torch::aot_inductor; - -template -static inline void call_triton_poi_fused_convolution_0( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_0', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 16, 'x': 64}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_0', 'mutated_arg_names': [], 'optimize_mem': True, 
'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 6144, 'x': 3072}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_0(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 12 - xnumel = 64 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = (yindex % 3) - y1 = yindex // 3 - tmp0 = tl.load(in_ptr0 + (x2 + 64*y3), xmask & ymask, eviction_policy='evict_last') - tl.store(out_ptr0 + (y0 + 3*x2 + 192*y1), tmp0, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (64 - 1)) / (64)); - uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_0 == nullptr) { - kernels_.triton_poi_fused_convolution_0 = loadKernel("/home/gasoonjia/executorch/cuj3mxjkcttcfshkrqr3bbv27ng2dlykmtde7rpiylednxszoer5.cubin", "triton_poi_fused_convolution_0", 4352, cubin_dir_); - } - CUdeviceptr var_0 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_1 = reinterpret_cast(out_ptr0.data_ptr()); - int var_2 = ynumel; - int var_3 = xnumel; - CUdeviceptr global_scratch_4 = 0; - void* kernel_args_[] = {&var_0, &var_1, &var_2, &var_3, &global_scratch_4}; - launchKernel(kernels_.triton_poi_fused_convolution_0, grid_0, grid_1, grid_2, 4, 4352, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused_convolution_1( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_1', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 16, 'x': 16}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_1', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 
'4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 1080, 'x': 540}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_1(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 15 - xnumel = 9 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y3 = yindex - y0 = (yindex % 3) - y1 = yindex // 3 - tmp0 = tl.load(in_ptr0 + (x2 + 9*y3), xmask & ymask, eviction_policy='evict_last') - tl.store(out_ptr0 + (y0 + 3*x2 + 27*y1), tmp0, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (16 - 1)) / (16)); - uint32_t grid_1 = ((ynumel + (16 - 1)) / (16)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_1 == nullptr) { - kernels_.triton_poi_fused_convolution_1 = loadKernel("/home/gasoonjia/executorch/cg7g6znwyjx7worxb7hbjz5rypindv6rgyiqidang4zm47hs6h7u.cubin", "triton_poi_fused_convolution_1", 1088, cubin_dir_); - } - CUdeviceptr var_5 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_6 = reinterpret_cast(out_ptr0.data_ptr()); - int var_7 = ynumel; - int var_8 = xnumel; - CUdeviceptr global_scratch_9 = 0; - void* kernel_args_[] = {&var_5, &var_6, &var_7, &var_8, &global_scratch_9}; - launchKernel(kernels_.triton_poi_fused_convolution_1, grid_0, grid_1, grid_2, 4, 1088, kernel_args_, stream_); -} - -template -static inline void call_triton_poi_fused_convolution_2( - const in_ptr0_type_& in_ptr0, - const out_ptr0_type_& out_ptr0, - int64_t ynumel, - int64_t xnumel, - int32_t device_idx_, - cudaStream_t stream_, - kernels_type_& kernels_, - const std::optional& cubin_dir_ = std::nullopt -){ - /* - async_compile.triton('triton_poi_fused_convolution_2', ''' - import triton - import triton.language as tl - - from torch._inductor.runtime import triton_helpers, triton_heuristics - from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math - from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties - triton_helpers.set_driver_to_gpu() - - @triton_heuristics.pointwise( - size_hints={'y': 32, 'x': 64}, tile_hint=TileHint.SQUARE, - filename=__file__, - triton_meta={'signature': {'in_ptr0': '*fp32', 'out_ptr0': '*fp32', 'ynumel': 'i32', 'xnumel': 'i32', 'YBLOCK': 'constexpr', 'XBLOCK': 'constexpr'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [{(0,): [['tt.divisibility', 16]], (1,): [['tt.divisibility', 16]], (3,): [['tt.divisibility', 16]]}]}, - inductor_meta={'grid_type': 'Grid2D', 'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_convolution_2', 'mutated_arg_names': [], 'optimize_mem': True, 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': '4F87BAC7C78026030CE21ABCD241F4211145E4ACCDC53C53E0CC97717CB6F329', 
'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': True, 'tiling_scores': {'y': 5120, 'x': 10240}}, - min_elem_per_thread=0 - ) - @triton.jit - def triton_poi_fused_convolution_2(in_ptr0, out_ptr0, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr): - ynumel = 20 - xnumel = 64 - yoffset = tl.program_id(1) * YBLOCK - yindex = yoffset + tl.arange(0, YBLOCK)[:, None] - ymask = yindex < ynumel - xoffset = tl.program_id(0) * XBLOCK - xindex = xoffset + tl.arange(0, XBLOCK)[None, :] - xmask = xindex < xnumel - x2 = xindex - y0 = (yindex % 5) - y1 = yindex // 5 - y3 = yindex - tmp0 = tl.load(in_ptr0 + (y0 + 5*x2 + 320*y1), xmask & ymask, eviction_policy='evict_last') - tmp1 = y0 - tmp2 = tl.full([1, 1], 2, tl.int64) - tmp3 = tmp1 < tmp2 - tmp4 = tl.full([1, 1], 1, tl.int64) - tmp5 = tmp1 < tmp4 - tmp6 = 0.1508762389421463 - tmp7 = -0.15852206945419312 - tmp8 = tl.where(tmp5, tmp6, tmp7) - tmp9 = tl.full([1, 1], 3, tl.int64) - tmp10 = tmp1 < tmp9 - tmp11 = tl.full([1, 1], 4, tl.int64) - tmp12 = tmp1 < tmp11 - tmp13 = -0.047068577259778976 - tmp14 = 0.010523972101509571 - tmp15 = tl.where(tmp12, tmp13, tmp14) - tmp16 = 0.07869197428226471 - tmp17 = tl.where(tmp10, tmp16, tmp15) - tmp18 = tl.where(tmp3, tmp8, tmp17) - tmp19 = tmp0 + tmp18 - tl.store(out_ptr0 + (x2 + 64*y3), tmp19, xmask & ymask) - ''', device_str='cuda') - */ - uint32_t grid_0 = ((xnumel + (32 - 1)) / (32)); - uint32_t grid_1 = ((ynumel + (32 - 1)) / (32)); - uint32_t grid_2 = 1; - if (grid_0 == 0 || grid_1 == 0 || grid_2 == 0) return; - if (kernels_.triton_poi_fused_convolution_2 == nullptr) { - kernels_.triton_poi_fused_convolution_2 = loadKernel("/home/gasoonjia/executorch/ckh2jw4qzbo6bg3d3ft7jfqzeusq2y2hz662iuqm5tpxbodupud4.cubin", "triton_poi_fused_convolution_2", 4608, cubin_dir_); - } - CUdeviceptr var_10 = reinterpret_cast(in_ptr0.data_ptr()); - CUdeviceptr var_11 = reinterpret_cast(out_ptr0.data_ptr()); - int var_12 = ynumel; - int var_13 = xnumel; - CUdeviceptr global_scratch_14 = 0; - void* kernel_args_[] = {&var_10, &var_11, &var_12, &var_13, &global_scratch_14}; - launchKernel(kernels_.triton_poi_fused_convolution_2, grid_0, grid_1, grid_2, 4, 4608, kernel_args_, stream_); -} - -namespace torch::aot_inductor { - -void AOTInductorModel::_const_run_impl( - std::vector& output_handles, - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor -) {} - -AOTI_NOINLINE static void check_input_0( - AtenTensorHandle* input_handles -) { - ConstantHandle arg2_1 = ConstantHandle(input_handles[0]); - int32_t arg2_1_dtype; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_dtype(arg2_1, &arg2_1_dtype)); - - int32_t arg2_1_expected_dtype = aoti_torch_dtype_float32(); - if (arg2_1_expected_dtype != arg2_1_dtype) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dtype, " - << "expected: " << arg2_1_expected_dtype << "(at::kFloat), " - << "but got: " << arg2_1_dtype << "\n"; - throw std::runtime_error(ss.str()); - } - auto arg2_1_size = arg2_1.sizes(); - - if (4 != arg2_1_size[0]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 0, " - << "expected: 4, " << "but got: " << arg2_1_size[0] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (3 != arg2_1_size[1]) { - 
std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 1, " - << "expected: 3, " << "but got: " << arg2_1_size[1] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_size[2]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 2, " - << "expected: 8, " << "but got: " << arg2_1_size[2] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_size[3]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched dim value at 3, " - << "expected: 8, " << "but got: " << arg2_1_size[3] - << "\n"; - throw std::runtime_error(ss.str()); - } - auto arg2_1_stride = arg2_1.strides(); - - if (192 != arg2_1_stride[0]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 0, " - << "expected: 192, " << "but got: " << arg2_1_stride[0] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (64 != arg2_1_stride[1]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 1, " - << "expected: 64, " << "but got: " << arg2_1_stride[1] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (8 != arg2_1_stride[2]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 2, " - << "expected: 8, " << "but got: " << arg2_1_stride[2] - << "\n"; - throw std::runtime_error(ss.str()); - } - - if (1 != arg2_1_stride[3]) { - std::stringstream ss; - ss << "input_handles[0]: unmatched stride value at 3, " - << "expected: 1, " << "but got: " << arg2_1_stride[3] - << "\n"; - throw std::runtime_error(ss.str()); - } - int32_t arg2_1_device_type; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_get_device_type(arg2_1, &arg2_1_device_type)); - - int32_t arg2_1_expected_device_type = 1; - if (arg2_1_expected_device_type != arg2_1_device_type) { - std::stringstream ss; - ss << "input_handles[0]: unmatched device type, " - << "expected: " << arg2_1_expected_device_type << "1(cuda), " - << "but got: " << arg2_1_device_type << "\n"; - throw std::runtime_error(ss.str()); - } -} - -static bool _check_aoti_runtime_check_inputs_env() { - const static char* env_var_value = getenv("AOTI_RUNTIME_CHECK_INPUTS"); - const static bool result = env_var_value != nullptr && env_var_value[0] != '0'; - return result; -} - -AOTI_NOINLINE static void __check_inputs_outputs( - AtenTensorHandle* input_handles, - AtenTensorHandle* output_handles) { - if (!_check_aoti_runtime_check_inputs_env()){ - return; - } - check_input_0(input_handles); -} - -void AOTInductorModel::run_impl( - AtenTensorHandle* - input_handles, // array of input AtenTensorHandle; handles - // are stolen; the array itself is borrowed - AtenTensorHandle* - output_handles, // array for writing output AtenTensorHandle; handles - // will be stolen by the caller; the array itself is - // borrowed - DeviceStreamType stream, - AOTIProxyExecutorHandle proxy_executor -) { - __check_inputs_outputs(input_handles, output_handles); - - auto inputs = steal_from_raw_handles_to_raii_handles(input_handles, 1); - auto arg2_1 = std::move(inputs[0]); - [[maybe_unused]] auto& conv_weight = constants_->at(0); - - if ((long(arg2_1.data_ptr()) & (16 -1)) != 0) { - AOTI_TORCH_WARN("Input 0 was compiled as 16-bytes aligned, but it is not aligned at run time. 
Copying to an aligned tensor to guarantee correctness, but expect a performance hit."); - AtenTensorHandle arg2_1_aligned; - aoti_torch_clone_preserve_strides(arg2_1, &arg2_1_aligned); - arg2_1 = std::move(RAIIAtenTensorHandle(arg2_1_aligned)); - } - inputs.clear(); - [[maybe_unused]] auto& kernels = static_cast(*this->kernels_.get()); - - AOTICudaStreamGuard stream_guard(stream, this->device_idx_); - static constexpr int64_t int_array_0[] = {4L, 3L, 8L, 8L}; - static constexpr int64_t int_array_1[] = {192L, 1L, 24L, 3L}; - AtenTensorHandle buf0_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_0, int_array_1, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf0_handle)); - RAIIAtenTensorHandle buf0(buf0_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_0(arg2_1, buf0, 12L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); - arg2_1.reset(); - static constexpr int64_t int_array_2[] = {5L, 3L, 3L, 3L}; - static constexpr int64_t int_array_3[] = {27L, 1L, 9L, 3L}; - AtenTensorHandle buf1_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_2, int_array_3, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf1_handle)); - RAIIAtenTensorHandle buf1(buf1_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_1(conv_weight, buf1, 15L, 9L, this->device_idx_, stream, kernels, this->cubin_dir_); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - AtenTensorHandle buf2_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_cuda_convolution(buf0, buf1, nullptr, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, std::array{1L, 1L}.cbegin(), 2, 0, std::array{0L, 0L}.cbegin(), 2, 1L, &buf2_handle)); - RAIIAtenTensorHandle buf2(buf2_handle); - buf0.reset(); - buf1.reset(); - static constexpr int64_t int_array_4[] = {4L, 5L, 8L, 8L}; - static constexpr int64_t int_array_5[] = {320L, 64L, 8L, 1L}; - AtenTensorHandle buf3_handle; - AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(4, int_array_4, int_array_5, cached_torch_dtype_float32, cached_torch_device_type_cuda, this->device_idx_, &buf3_handle)); - RAIIAtenTensorHandle buf3(buf3_handle); - // Topologically Sorted Source Nodes: [aten_convolution_default], Original ATen: [aten.convolution] - call_triton_poi_fused_convolution_2(buf2, buf3, 20L, 64L, this->device_idx_, stream, kernels, this->cubin_dir_); - buf2.reset(); - output_handles[0] = buf3.release(); -} // AOTInductorModel::run_impl -} // namespace torch::aot_inductor - - - - -// Compile cmd -// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.cpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -fPIC -O1 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -include /tmp/torchinductor_gasoonjia/precompiled_headers/c4cub4usfsuwqkbp3pfgzit6fkb6qpm3anlkt22y6d2ks3tdluhg.h -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 
-I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -c -o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o -// Link cmd -// g++ /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper.o /home/gasoonjia/executorch/c26meop4u3hf2hh76dw6zl4fepetv42wg64xygsadkkb43zczod6.kernel.o /home/gasoonjia/executorch/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq/c5rhpvrttznyqa5pe725yxk3av45bswzgxcmk7tdg4j7yptcotin.o -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -D CPU_CAPABILITY_AVX512 -D USE_CUDA -shared -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -pedantic -fopenmp -I/home/gasoonjia/.conda/envs/aoti/include/python3.10 -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include -I/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/include/torch/csrc/api/include -I/usr/local/cuda-12.6/include -mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma -o /home/gasoonjia/executorch/aoti.so -ltorch -ltorch_cpu -lgomp -lc10 -lc10_cuda -lcuda -ltorch_cuda -L/home/gasoonjia/.conda/envs/aoti/lib -L/home/gasoonjia/.conda/envs/aoti/lib/python3.10/site-packages/torch/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib -L/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs diff --git a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json b/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json deleted file mode 100644 index bd5d2c60334..00000000000 --- a/cwbxrbt67v2w5ftivnaod7qulkhz5pxljkb2gr6xgxxo6t35umkq.wrapper_metadata.json +++ /dev/null @@ -1 +0,0 @@ -{"AOTI_DEVICE_KEY": "cuda"} \ No newline at end of file From 3ec8a38160213e37f0652296a938f3ecdad02773 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 20 Aug 2025 12:16:59 -0700 Subject: [PATCH 17/50] add get_device_type and get_device_index shim layer --- backends/aoti/aoti_backend.py | 2 ++ backends/aoti/runtime/shims/memory.cpp | 12 +++++++---- .../aoti/runtime/shims/tensor_attribute.cpp | 20 +++++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index 5aa547d789c..a60806815ef 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -45,6 +45,8 @@ def preprocess( options: dict[str, typing.Any] = { "aot_inductor.package_constants_in_so": True, "aot_inductor.output_path": output_path, + "aot_inductor.debug_compile": True, + "aot_inductor.repro_level": 3 } so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index ab5d35efd9f..83030647691 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -179,14 +179,18 @@ AOTITorchError aoti_torch_empty_strided( // Store the tensor tensors.insert(tensor); + *ret_new_tensor = tensor.get(); + is_tensor_own_memory[tensor.get()] = true; - std::cout << 
"sizes.data(): " << sizes.data() + std::cout << "Finished. Created tensor " << tensor.get() << " with sizes " + << std::endl + << "sizes.data(): " << sizes.data() << ", tensor->sizes().data(): " << tensor->sizes().data() << std::endl; std::cout << "Size[0] of tensor " << tensor.get() << " is " - << tensor->sizes()[0] << std::endl; - *ret_new_tensor = tensor.get(); - is_tensor_own_memory[tensor.get()] = true; + << tensor->sizes()[0] << std::endl + << std::endl; + return Error::Ok; } diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index b5333f50ea9..a75af9ae128 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -103,6 +103,26 @@ AOTITorchError aoti_torch_get_storage_size( throw std::runtime_error("Cannot get storage size on ETensor"); } +AOTITorchError aoti_torch_get_device_type( + AOTITensorHandle tensor, + int32_t* ret_device_type) { + // Let's assume all tensors AOTI using are on CUDA device + *ret_device_type = aoti_torch_device_type_cuda(); // CUDA device type + std::cout << "getting device_type from tensor " << tensor << " = " + << *ret_device_type << std::endl; + return Error::Ok; +} + +AOTITorchError aoti_torch_get_device_index( + AOTITensorHandle tensor, + int32_t* ret_device_index) { + // Let's assume all tensors AOTI using are on CUDA:0 + *ret_device_index = 0; + std::cout << "getting device_index from tensor " << tensor << " = " + << *ret_device_index << std::endl; + return Error::Ok; +} + int32_t aoti_torch_device_type_cpu() { // Let's say cpu is 0 for ET as well return 0; From 28d1294ed88ca85884a5b46c694c2f693a0e4817 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 10:57:34 -0700 Subject: [PATCH 18/50] e2e runnable on conv2d --- backends/aoti/aoti_backend.py | 7 +- backends/aoti/runtime/shims/cuda_ops.cpp | 282 ++++++++++++++++++ backends/aoti/runtime/shims/cuda_ops.h | 54 ++++ backends/aoti/runtime/shims/memory.cpp | 3 +- .../aoti/runtime/shims/tensor_attribute.cpp | 2 +- export_aoti.py | 5 + 6 files changed, 350 insertions(+), 3 deletions(-) create mode 100644 backends/aoti/runtime/shims/cuda_ops.cpp create mode 100644 backends/aoti/runtime/shims/cuda_ops.h diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index a60806815ef..a0793bdb80f 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -46,8 +46,13 @@ def preprocess( "aot_inductor.package_constants_in_so": True, "aot_inductor.output_path": output_path, "aot_inductor.debug_compile": True, - "aot_inductor.repro_level": 3 + "aot_inductor.repro_level": 3, + "aot_inductor.debug_intermediate_value_printer": "3", + "max_autotune": True, + "max_autotune_gemm_backends": "TRITON", + "max_autotune_conv_backends": "TRITON", } + so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] assert so_path == output_path, f"Expected {output_path} but got {so_path}" diff --git a/backends/aoti/runtime/shims/cuda_ops.cpp b/backends/aoti/runtime/shims/cuda_ops.cpp new file mode 100644 index 00000000000..d89f1d36e7e --- /dev/null +++ b/backends/aoti/runtime/shims/cuda_ops.cpp @@ -0,0 +1,282 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "cuda_ops.h" +#include "memory.h" +#include "tensor_attribute.h" +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +// Global cuDNN handle +static cudnnHandle_t cudnn_handle = nullptr; + +// Initialize cuDNN handle +static void init_cudnn() { + if (cudnn_handle == nullptr) { + cudnnCreate(&cudnn_handle); + } +} + +extern "C" { + +AOTITorchError aoti_torch_cuda_addmm_out( + AtenTensorHandle out, + AtenTensorHandle self, + AtenTensorHandle mat1, + AtenTensorHandle mat2, + double beta, + double alpha) { + + std::cout << "aoti_torch_cuda_addmm_out called with beta=" << beta << ", alpha=" << alpha << std::endl; + + // Get tensor dimensions + auto mat1_sizes = mat1->sizes(); + auto mat2_sizes = mat2->sizes(); + auto self_sizes = self->sizes(); + auto out_sizes = out->sizes(); + + // mat1: [M, K], mat2: [K, N], result: [M, N] + int64_t M = mat1_sizes[0]; + int64_t K = mat1_sizes[1]; + int64_t N = mat2_sizes[1]; + + std::cout << "ADDMM: mat1[" << M << "," << K << "] @ mat2[" << K << "," << N << "] -> out[" << M << "," << N << "]" << std::endl; + + // Use cuBLAS for matrix multiplication + cublasHandle_t cublas_handle; + cublasStatus_t cublas_status = cublasCreate(&cublas_handle); + if (cublas_status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "Failed to create cuBLAS handle" << std::endl; + return Error::Internal; + } + + // Set cuBLAS to use tensor op math for better performance + cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); + + const float f_alpha = static_cast(alpha); + const float f_beta = static_cast(beta); + + // Perform: out = beta * self + alpha * (mat1 @ mat2) + // First: out = beta * self (copy self to out and scale) + if (beta != 0.0) { + Error copy_err = aoti_torch_copy_(out, self, 0); + if (copy_err != Error::Ok) { + cublasDestroy(cublas_handle); + return copy_err; + } + + // Scale by beta if not 1.0 + if (beta != 1.0) { + cublas_status = cublasSscal(cublas_handle, M * N, &f_beta, + static_cast(out->mutable_data_ptr()), 1); + if (cublas_status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "cuBLAS scale failed" << std::endl; + cublasDestroy(cublas_handle); + return Error::Internal; + } + } + } else { + // Zero out the output tensor + cudaMemset(out->mutable_data_ptr(), 0, M * N * sizeof(float)); + } + + // Then: out += alpha * (mat1 @ mat2) + // cuBLAS uses column-major, so we compute: C = alpha * A^T * B^T + beta * C + // Which gives us: out = alpha * mat1 @ mat2 + beta * out + const float gemm_beta = 1.0f; // Since we already handled the beta scaling above + + cublas_status = cublasSgemm( + cublas_handle, + CUBLAS_OP_N, CUBLAS_OP_N, // No transpose for column-major interpretation + N, M, K, // Dimensions swapped for column-major + &f_alpha, // alpha + static_cast(mat2->data_ptr()), N, // B matrix (mat2) + static_cast(mat1->data_ptr()), K, // A matrix (mat1) + &gemm_beta, // beta (1.0 since we pre-scaled) + static_cast(out->mutable_data_ptr()), N // C matrix (out) + ); + + if (cublas_status != CUBLAS_STATUS_SUCCESS) { + std::cerr << "cuBLAS GEMM failed: " << cublas_status << std::endl; + cublasDestroy(cublas_handle); + return Error::Internal; + } + + cublasDestroy(cublas_handle); + + std::cout << "aoti_torch_cuda_addmm_out completed successfully" << std::endl; + return Error::Ok; +} + +AOTITorchError aoti_torch_cuda_convolution( + AtenTensorHandle input, + AtenTensorHandle weight, + AtenTensorHandle* bias, + const int64_t* stride, + 
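    // Note: the parameter list below mirrors the aten::convolution schema:
    // stride/padding/dilation arrive as (pointer, length) pairs, transposed
    // selects transposed convolution, output_padding applies only to the
    // transposed case, and groups is the group count.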
int64_t stride_len_, + const int64_t* padding, + int64_t padding_len_, + const int64_t* dilation, + int64_t dilation_len_, + int32_t transposed, + const int64_t* output_padding, + int64_t output_padding_len_, + int64_t groups, + AtenTensorHandle* ret0) { + + std::cout << "aoti_torch_cuda_convolution called" << std::endl; + + init_cudnn(); + + // Get input dimensions + auto input_sizes = input->sizes(); + auto weight_sizes = weight->sizes(); + + int batch_size = input_sizes[0]; + int input_channels = input_sizes[1]; + int input_height = input_sizes[2]; + int input_width = input_sizes[3]; + + int output_channels = weight_sizes[0]; + int kernel_height = weight_sizes[2]; + int kernel_width = weight_sizes[3]; + + // Calculate output dimensions + int output_height = (input_height + 2 * padding[0] - dilation[0] * (kernel_height - 1) - 1) / stride[0] + 1; + int output_width = (input_width + 2 * padding[1] - dilation[1] * (kernel_width - 1) - 1) / stride[1] + 1; + + std::cout << "Conv2d: input[" << batch_size << "," << input_channels << "," << input_height << "," << input_width << "]" + << " -> output[" << batch_size << "," << output_channels << "," << output_height << "," << output_width << "]" << std::endl; + + // Create output tensor + std::vector output_sizes = {batch_size, output_channels, output_height, output_width}; + + AOTITensorHandle output_handle; + Error create_err = aoti_torch_empty_strided( + output_sizes.size(), + output_sizes.data(), + nullptr, // use default strides + 6, // float32 dtype + 1, // cuda device + 0, // device index + &output_handle); + + if (create_err != Error::Ok) { + std::cerr << "Failed to create output tensor for convolution" << std::endl; + return create_err; + } + + // Setup cuDNN descriptors + cudnnTensorDescriptor_t input_desc, output_desc, bias_desc; + cudnnFilterDescriptor_t weight_desc; + cudnnConvolutionDescriptor_t conv_desc; + + cudnnCreateTensorDescriptor(&input_desc); + cudnnCreateTensorDescriptor(&output_desc); + cudnnCreateTensorDescriptor(&bias_desc); + cudnnCreateFilterDescriptor(&weight_desc); + cudnnCreateConvolutionDescriptor(&conv_desc); + + // Set tensor descriptors + cudnnSetTensorNdDescriptor(input_desc, CUDNN_DATA_FLOAT, 4, + (int*)input_sizes.data(), + (int*)input->strides().data()); + + cudnnSetTensorNdDescriptor(output_desc, CUDNN_DATA_FLOAT, 4, + (int*)output_sizes.data(), + (int*)output_handle->strides().data()); + + cudnnSetFilterNdDescriptor(weight_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, + (int*)weight_sizes.data()); + + // Set convolution descriptor + cudnnSetConvolutionNdDescriptor(conv_desc, 2, + (int*)padding, (int*)stride, (int*)dilation, + CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); + + if (groups > 1) { + cudnnSetConvolutionGroupCount(conv_desc, groups); + } + + // Find best convolution algorithm + cudnnConvolutionFwdAlgo_t algo; + cudnnGetConvolutionForwardAlgorithm(cudnn_handle, input_desc, weight_desc, conv_desc, output_desc, + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); + + // Get workspace size + size_t workspace_size; + cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle, input_desc, weight_desc, conv_desc, output_desc, algo, &workspace_size); + + // Allocate workspace + void* workspace = nullptr; + if (workspace_size > 0) { + cudaMalloc(&workspace, workspace_size); + } + + // Perform convolution + const float alpha = 1.0f, beta = 0.0f; + cudnnStatus_t conv_status = cudnnConvolutionForward( + cudnn_handle, + &alpha, + input_desc, input->data_ptr(), + weight_desc, weight->data_ptr(), + conv_desc, algo, + 
workspace, workspace_size, + &beta, + output_desc, output_handle->mutable_data_ptr()); + + if (conv_status != CUDNN_STATUS_SUCCESS) { + std::cerr << "cuDNN convolution failed: " << cudnnGetErrorString(conv_status) << std::endl; + if (workspace) cudaFree(workspace); + cudnnDestroyTensorDescriptor(input_desc); + cudnnDestroyTensorDescriptor(output_desc); + cudnnDestroyTensorDescriptor(bias_desc); + cudnnDestroyFilterDescriptor(weight_desc); + cudnnDestroyConvolutionDescriptor(conv_desc); + aoti_torch_delete_tensor_object(output_handle); + return Error::Internal; + } + + // Add bias if present + if (bias && *bias) { + auto bias_sizes = (*bias)->sizes(); + cudnnSetTensorNdDescriptor(bias_desc, CUDNN_DATA_FLOAT, 4, + (int*)bias_sizes.data(), + (int*)(*bias)->strides().data()); + + cudnnAddTensor(cudnn_handle, &alpha, bias_desc, (*bias)->data_ptr(), + &alpha, output_desc, output_handle->mutable_data_ptr()); + } + + // Cleanup + if (workspace) cudaFree(workspace); + cudnnDestroyTensorDescriptor(input_desc); + cudnnDestroyTensorDescriptor(output_desc); + cudnnDestroyTensorDescriptor(bias_desc); + cudnnDestroyFilterDescriptor(weight_desc); + cudnnDestroyConvolutionDescriptor(conv_desc); + + *ret0 = output_handle; + + std::cout << "aoti_torch_cuda_convolution completed successfully" << std::endl; + return Error::Ok; +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch \ No newline at end of file diff --git a/backends/aoti/runtime/shims/cuda_ops.h b/backends/aoti/runtime/shims/cuda_ops.h new file mode 100644 index 00000000000..699c87322d2 --- /dev/null +++ b/backends/aoti/runtime/shims/cuda_ops.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include +#include +#include "tensor_attribute.h" + +namespace executorch { +namespace backends { +namespace aoti { + +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// CUDA addmm operation: out = beta * self + alpha * (mat1 @ mat2) +AOTITorchError aoti_torch_cuda_addmm_out( + AtenTensorHandle out, + AtenTensorHandle self, + AtenTensorHandle mat1, + AtenTensorHandle mat2, + double beta, + double alpha); + +// CUDA convolution operation +AOTITorchError aoti_torch_cuda_convolution( + AtenTensorHandle input, + AtenTensorHandle weight, + AtenTensorHandle* bias, + const int64_t* stride, + int64_t stride_len_, + const int64_t* padding, + int64_t padding_len_, + const int64_t* dilation, + int64_t dilation_len_, + int32_t transposed, + const int64_t* output_padding, + int64_t output_padding_len_, + int64_t groups, + AtenTensorHandle* ret0); + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch \ No newline at end of file diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 83030647691..2d2bf940833 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -174,10 +174,11 @@ AOTITorchError aoti_torch_empty_strided( for (int i = 0; i < ndim; i++) { sizes[i] = sizes_ptr[i]; } + // ETensor creation auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr); - // Store the tensor + // Store the tensor so it doesn't get destroyed tensors.insert(tensor); *ret_new_tensor = tensor.get(); is_tensor_own_memory[tensor.get()] = true; diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index a75af9ae128..eb3d0e22371 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -107,7 +107,7 @@ AOTITorchError aoti_torch_get_device_type( AOTITensorHandle tensor, int32_t* ret_device_type) { // Let's assume all tensors AOTI using are on CUDA device - *ret_device_type = aoti_torch_device_type_cuda(); // CUDA device type + *ret_device_type = aoti_torch_device_type_cuda(); std::cout << "getting device_type from tensor " << tensor << " = " << *ret_device_type << std::endl; return Error::Ok; diff --git a/export_aoti.py b/export_aoti.py index 229d6e567e3..3ca5287b3b9 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -122,6 +122,11 @@ def get_model_and_inputs( def export_model(model, example_inputs, output_filename="aoti_model.pte"): """Export model through the AOTI pipeline.""" + all_one_input = tuple( + torch.ones_like(example_input) for example_input in example_inputs + ) + + print("label", model(*all_one_input)) print(f"Starting export process...") From df1bec55defe32519bb22481fe1ed52fde2e722b Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 11:25:22 -0700 Subject: [PATCH 19/50] remove extra code --- backends/aoti/runtime/shims/cuda_ops.cpp | 282 ----------------------- backends/aoti/runtime/shims/cuda_ops.h | 54 ----- 2 files changed, 336 deletions(-) delete mode 100644 backends/aoti/runtime/shims/cuda_ops.cpp delete mode 100644 backends/aoti/runtime/shims/cuda_ops.h diff --git a/backends/aoti/runtime/shims/cuda_ops.cpp b/backends/aoti/runtime/shims/cuda_ops.cpp deleted file mode 100644 index d89f1d36e7e..00000000000 --- a/backends/aoti/runtime/shims/cuda_ops.cpp +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "cuda_ops.h" -#include "memory.h" -#include "tensor_attribute.h" -#include -#include -#include - -namespace executorch { -namespace backends { -namespace aoti { - -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - -// Global cuDNN handle -static cudnnHandle_t cudnn_handle = nullptr; - -// Initialize cuDNN handle -static void init_cudnn() { - if (cudnn_handle == nullptr) { - cudnnCreate(&cudnn_handle); - } -} - -extern "C" { - -AOTITorchError aoti_torch_cuda_addmm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat1, - AtenTensorHandle mat2, - double beta, - double alpha) { - - std::cout << "aoti_torch_cuda_addmm_out called with beta=" << beta << ", alpha=" << alpha << std::endl; - - // Get tensor dimensions - auto mat1_sizes = mat1->sizes(); - auto mat2_sizes = mat2->sizes(); - auto self_sizes = self->sizes(); - auto out_sizes = out->sizes(); - - // mat1: [M, K], mat2: [K, N], result: [M, N] - int64_t M = mat1_sizes[0]; - int64_t K = mat1_sizes[1]; - int64_t N = mat2_sizes[1]; - - std::cout << "ADDMM: mat1[" << M << "," << K << "] @ mat2[" << K << "," << N << "] -> out[" << M << "," << N << "]" << std::endl; - - // Use cuBLAS for matrix multiplication - cublasHandle_t cublas_handle; - cublasStatus_t cublas_status = cublasCreate(&cublas_handle); - if (cublas_status != CUBLAS_STATUS_SUCCESS) { - std::cerr << "Failed to create cuBLAS handle" << std::endl; - return Error::Internal; - } - - // Set cuBLAS to use tensor op math for better performance - cublasSetMathMode(cublas_handle, CUBLAS_DEFAULT_MATH); - - const float f_alpha = static_cast(alpha); - const float f_beta = static_cast(beta); - - // Perform: out = beta * self + alpha * (mat1 @ mat2) - // First: out = beta * self (copy self to out and scale) - if (beta != 0.0) { - Error copy_err = aoti_torch_copy_(out, self, 0); - if (copy_err != Error::Ok) { - cublasDestroy(cublas_handle); - return copy_err; - } - - // Scale by beta if not 1.0 - if (beta != 1.0) { - cublas_status = cublasSscal(cublas_handle, M * N, &f_beta, - static_cast(out->mutable_data_ptr()), 1); - if (cublas_status != CUBLAS_STATUS_SUCCESS) { - std::cerr << "cuBLAS scale failed" << std::endl; - cublasDestroy(cublas_handle); - return Error::Internal; - } - } - } else { - // Zero out the output tensor - cudaMemset(out->mutable_data_ptr(), 0, M * N * sizeof(float)); - } - - // Then: out += alpha * (mat1 @ mat2) - // cuBLAS uses column-major, so we compute: C = alpha * A^T * B^T + beta * C - // Which gives us: out = alpha * mat1 @ mat2 + beta * out - const float gemm_beta = 1.0f; // Since we already handled the beta scaling above - - cublas_status = cublasSgemm( - cublas_handle, - CUBLAS_OP_N, CUBLAS_OP_N, // No transpose for column-major interpretation - N, M, K, // Dimensions swapped for column-major - &f_alpha, // alpha - static_cast(mat2->data_ptr()), N, // B matrix (mat2) - static_cast(mat1->data_ptr()), K, // A matrix (mat1) - &gemm_beta, // beta (1.0 since we pre-scaled) - static_cast(out->mutable_data_ptr()), N // C matrix (out) - ); - - if (cublas_status != CUBLAS_STATUS_SUCCESS) { - std::cerr << "cuBLAS GEMM failed: " << cublas_status << std::endl; - cublasDestroy(cublas_handle); - return Error::Internal; - } - - cublasDestroy(cublas_handle); - - std::cout << "aoti_torch_cuda_addmm_out completed successfully" << std::endl; - return Error::Ok; -} - 
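The Sgemm call above relies on the usual row-major trick for column-major cuBLAS: for row-major buffers, out = mat1 @ mat2 is computed as the column-major product out^T = mat2^T @ mat1^T, which is why mat2 is passed in cuBLAS's A slot with leading dimension N. A minimal sketch under those assumptions (float32, device pointers allocated and filled elsewhere; rowmajor_sgemm is a hypothetical helper):

#include <cublas_v2.h>

// Computes row-major C (MxN) = A (MxK) * B (KxN).
static cublasStatus_t rowmajor_sgemm(
    cublasHandle_t handle, int M, int N, int K,
    const float* d_a, const float* d_b, float* d_c) {
  const float alpha = 1.0f, beta = 0.0f;
  return cublasSgemm(
      handle, CUBLAS_OP_N, CUBLAS_OP_N,
      N, M, K,       // dimensions of the transposed (column-major) product
      &alpha,
      d_b, N,        // cuBLAS "A" = B^T, leading dimension N
      d_a, K,        // cuBLAS "B" = A^T, leading dimension K
      &beta,
      d_c, N);       // cuBLAS "C" = C^T, leading dimension N
}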
-AOTITorchError aoti_torch_cuda_convolution( - AtenTensorHandle input, - AtenTensorHandle weight, - AtenTensorHandle* bias, - const int64_t* stride, - int64_t stride_len_, - const int64_t* padding, - int64_t padding_len_, - const int64_t* dilation, - int64_t dilation_len_, - int32_t transposed, - const int64_t* output_padding, - int64_t output_padding_len_, - int64_t groups, - AtenTensorHandle* ret0) { - - std::cout << "aoti_torch_cuda_convolution called" << std::endl; - - init_cudnn(); - - // Get input dimensions - auto input_sizes = input->sizes(); - auto weight_sizes = weight->sizes(); - - int batch_size = input_sizes[0]; - int input_channels = input_sizes[1]; - int input_height = input_sizes[2]; - int input_width = input_sizes[3]; - - int output_channels = weight_sizes[0]; - int kernel_height = weight_sizes[2]; - int kernel_width = weight_sizes[3]; - - // Calculate output dimensions - int output_height = (input_height + 2 * padding[0] - dilation[0] * (kernel_height - 1) - 1) / stride[0] + 1; - int output_width = (input_width + 2 * padding[1] - dilation[1] * (kernel_width - 1) - 1) / stride[1] + 1; - - std::cout << "Conv2d: input[" << batch_size << "," << input_channels << "," << input_height << "," << input_width << "]" - << " -> output[" << batch_size << "," << output_channels << "," << output_height << "," << output_width << "]" << std::endl; - - // Create output tensor - std::vector output_sizes = {batch_size, output_channels, output_height, output_width}; - - AOTITensorHandle output_handle; - Error create_err = aoti_torch_empty_strided( - output_sizes.size(), - output_sizes.data(), - nullptr, // use default strides - 6, // float32 dtype - 1, // cuda device - 0, // device index - &output_handle); - - if (create_err != Error::Ok) { - std::cerr << "Failed to create output tensor for convolution" << std::endl; - return create_err; - } - - // Setup cuDNN descriptors - cudnnTensorDescriptor_t input_desc, output_desc, bias_desc; - cudnnFilterDescriptor_t weight_desc; - cudnnConvolutionDescriptor_t conv_desc; - - cudnnCreateTensorDescriptor(&input_desc); - cudnnCreateTensorDescriptor(&output_desc); - cudnnCreateTensorDescriptor(&bias_desc); - cudnnCreateFilterDescriptor(&weight_desc); - cudnnCreateConvolutionDescriptor(&conv_desc); - - // Set tensor descriptors - cudnnSetTensorNdDescriptor(input_desc, CUDNN_DATA_FLOAT, 4, - (int*)input_sizes.data(), - (int*)input->strides().data()); - - cudnnSetTensorNdDescriptor(output_desc, CUDNN_DATA_FLOAT, 4, - (int*)output_sizes.data(), - (int*)output_handle->strides().data()); - - cudnnSetFilterNdDescriptor(weight_desc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, - (int*)weight_sizes.data()); - - // Set convolution descriptor - cudnnSetConvolutionNdDescriptor(conv_desc, 2, - (int*)padding, (int*)stride, (int*)dilation, - CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); - - if (groups > 1) { - cudnnSetConvolutionGroupCount(conv_desc, groups); - } - - // Find best convolution algorithm - cudnnConvolutionFwdAlgo_t algo; - cudnnGetConvolutionForwardAlgorithm(cudnn_handle, input_desc, weight_desc, conv_desc, output_desc, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST, 0, &algo); - - // Get workspace size - size_t workspace_size; - cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle, input_desc, weight_desc, conv_desc, output_desc, algo, &workspace_size); - - // Allocate workspace - void* workspace = nullptr; - if (workspace_size > 0) { - cudaMalloc(&workspace, workspace_size); - } - - // Perform convolution - const float alpha = 1.0f, beta = 0.0f; - cudnnStatus_t 
conv_status = cudnnConvolutionForward( - cudnn_handle, - &alpha, - input_desc, input->data_ptr(), - weight_desc, weight->data_ptr(), - conv_desc, algo, - workspace, workspace_size, - &beta, - output_desc, output_handle->mutable_data_ptr()); - - if (conv_status != CUDNN_STATUS_SUCCESS) { - std::cerr << "cuDNN convolution failed: " << cudnnGetErrorString(conv_status) << std::endl; - if (workspace) cudaFree(workspace); - cudnnDestroyTensorDescriptor(input_desc); - cudnnDestroyTensorDescriptor(output_desc); - cudnnDestroyTensorDescriptor(bias_desc); - cudnnDestroyFilterDescriptor(weight_desc); - cudnnDestroyConvolutionDescriptor(conv_desc); - aoti_torch_delete_tensor_object(output_handle); - return Error::Internal; - } - - // Add bias if present - if (bias && *bias) { - auto bias_sizes = (*bias)->sizes(); - cudnnSetTensorNdDescriptor(bias_desc, CUDNN_DATA_FLOAT, 4, - (int*)bias_sizes.data(), - (int*)(*bias)->strides().data()); - - cudnnAddTensor(cudnn_handle, &alpha, bias_desc, (*bias)->data_ptr(), - &alpha, output_desc, output_handle->mutable_data_ptr()); - } - - // Cleanup - if (workspace) cudaFree(workspace); - cudnnDestroyTensorDescriptor(input_desc); - cudnnDestroyTensorDescriptor(output_desc); - cudnnDestroyTensorDescriptor(bias_desc); - cudnnDestroyFilterDescriptor(weight_desc); - cudnnDestroyConvolutionDescriptor(conv_desc); - - *ret0 = output_handle; - - std::cout << "aoti_torch_cuda_convolution completed successfully" << std::endl; - return Error::Ok; -} - -} // extern "C" - -} // namespace aoti -} // namespace backends -} // namespace executorch \ No newline at end of file diff --git a/backends/aoti/runtime/shims/cuda_ops.h b/backends/aoti/runtime/shims/cuda_ops.h deleted file mode 100644 index 699c87322d2..00000000000 --- a/backends/aoti/runtime/shims/cuda_ops.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -#pragma once - -#include -#include -#include "tensor_attribute.h" - -namespace executorch { -namespace backends { -namespace aoti { - -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - -extern "C" { - -// CUDA addmm operation: out = beta * self + alpha * (mat1 @ mat2) -AOTITorchError aoti_torch_cuda_addmm_out( - AtenTensorHandle out, - AtenTensorHandle self, - AtenTensorHandle mat1, - AtenTensorHandle mat2, - double beta, - double alpha); - -// CUDA convolution operation -AOTITorchError aoti_torch_cuda_convolution( - AtenTensorHandle input, - AtenTensorHandle weight, - AtenTensorHandle* bias, - const int64_t* stride, - int64_t stride_len_, - const int64_t* padding, - int64_t padding_len_, - const int64_t* dilation, - int64_t dilation_len_, - int32_t transposed, - const int64_t* output_padding, - int64_t output_padding_len_, - int64_t groups, - AtenTensorHandle* ret0); - -} // extern "C" - -} // namespace aoti -} // namespace backends -} // namespace executorch \ No newline at end of file From db8a40070c37ffb04edef4907afc3a8a7afcf4e3 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 13:25:08 -0700 Subject: [PATCH 20/50] solved crash when destroying backend --- backends/aoti/runtime/aoti_backend.cpp | 52 ++++++++++++-------------- export_aoti.py | 4 +- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 4c065fbeeb6..425e078c549 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -166,33 +166,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { size_t n_inputs; AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs); - // for (int i = 0; i < n_inputs; i++) { - // const char* input_name; - // AOTInductorModelContainerGetInputName( - // handle->container_handle, i, &input_name); - // ET_LOG(Debug, "AOTIBackend %d-th input name %s", i, input_name); - // } - - // AOTInductorModelContainerGetNumConstants( - // handle->container_handle, &n_constants); - // size_t n_user_inputs = n_inputs - n_constants; - - // if (n_user_inputs != n_inputs) { - // ET_LOG( - // Error, - // "number of user input does not match number of inputs. - // n_user_inputs %zd, n_constant %zd, n_inputs %zd. 
Exit.", - // n_user_inputs, - // n_constants, - // n_inputs); - // return Error::InvalidArgument; - // } - - // ET_LOG( - // Debug, - // "AOTIBackend n_inputs %zd generated, where %zd is constant input, - // %zd is user input", n_inputs, n_constants, n_user_inputs); - size_t n_outputs; AOTInductorModelContainerGetNumOutputs( handle->container_handle, &n_outputs); @@ -381,8 +354,29 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { void destroy(DelegateHandle* handle_) const override { ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - dlclose(handle->so_handle); - AOTInductorModelContainerDelete(handle->container_handle); + + // Delete the container BEFORE closing the shared library + if (handle->container_handle != nullptr) { + AOTIRuntimeError delete_result = + AOTInductorModelContainerDelete(handle->container_handle); + if (delete_result != Error::Ok) { + ET_LOG( + Error, + "AOTInductorModelContainerDelete failed with error code %d", + delete_result); + } else { + ET_LOG( + Debug, + "AOTIBackend container_handle %p deleted", + handle->container_handle); + } + } + + // Now close the shared library + if (handle->so_handle != nullptr) { + dlclose(handle->so_handle); + } + free(handle); cleanup_memory(); cleanup_tensor_metadata(); diff --git a/export_aoti.py b/export_aoti.py index 3ca5287b3b9..ff78ce3be95 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -41,7 +41,7 @@ def forward(self, x: torch.Tensor): class Linear(torch.nn.Module): def __init__(self): super(Linear, self).__init__() - self.linear = nn.Linear(3, 5) + self.linear = nn.Linear(7, 101) def forward(self, x: torch.Tensor): return self.linear(x) @@ -76,7 +76,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): }, "linear": { "model_class": Linear, - "input_shapes": [(4, 3)], + "input_shapes": [(127, 7)], "device": "cuda", "description": "Simple linear layer model", }, From ece2776c20ea46087e4d9d4093d8602795b6e0e1 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 14:42:50 -0700 Subject: [PATCH 21/50] change to use to_edge_transform_and_lower --- backends/aoti/runtime/aoti_backend.cpp | 7 +------ export_aoti.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 425e078c549..453613d47f8 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -352,7 +352,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { } void destroy(DelegateHandle* handle_) const override { - ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; // Delete the container BEFORE closing the shared library @@ -364,11 +363,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { Error, "AOTInductorModelContainerDelete failed with error code %d", delete_result); - } else { - ET_LOG( - Debug, - "AOTIBackend container_handle %p deleted", - handle->container_handle); } } @@ -380,6 +374,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { free(handle); cleanup_memory(); cleanup_tensor_metadata(); + ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); } }; diff --git a/export_aoti.py b/export_aoti.py index ff78ce3be95..bfcf38408d7 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -21,7 +21,7 @@ import torch from executorch.backends.aoti.aoti_partitioner 
import AotiPartitioner -from executorch.exir import to_edge +from executorch.exir import to_edge_transform_and_lower from torch import nn from torch.export import export from torchvision import models @@ -135,13 +135,17 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"): aten_dialect = export(model, example_inputs) # 2. to_edge: Make optimizations for Edge devices - print("Step 2: Converting to Edge program...") - edge_program = to_edge(aten_dialect) - print(edge_program.exported_program().graph.print_tabular()) + # print("Step 2: Converting to Edge program...") + # edge_program = to_edge(aten_dialect) + # print(edge_program.exported_program().graph.print_tabular()) - print("Step 3: Converting to backend...") - edge_program = edge_program.to_backend(AotiPartitioner([])) - print("To backend done.") + # print("Step 3: Converting to backend...") + # edge_program = edge_program.to_backend(AotiPartitioner([])) + # print("To backend done.") + + edge_program = to_edge_transform_and_lower( + aten_dialect, partitioner=[AotiPartitioner([])] + ) # 3. to_executorch: Convert the graph to an ExecuTorch program print("Step 4: Converting to ExecuTorch program...") From 8e42a30d96b4cad123a5296f10de183f93a58c02 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 21 Aug 2025 23:09:03 -0700 Subject: [PATCH 22/50] use aoti decomposition on lowable graph --- backends/aoti/aoti_partitioner.py | 349 +++++++++++---------- backends/arm/third-party/serialization_lib | 1 + export_aoti.py | 3 + 3 files changed, 186 insertions(+), 167 deletions(-) create mode 160000 backends/arm/third-party/serialization_lib diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index f72b97f0253..8a17a5364ae 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -7,7 +7,7 @@ # pyre-unsafe import operator -from typing import cast, final, List +from typing import Callable, cast, Dict, final, List, Optional, Set, Tuple import torch from executorch.backends.aoti.aoti_backend import AotiBackend # usort: skip @@ -24,167 +24,168 @@ from torch.fx.passes.operator_support import OperatorSupportBase -supported_fallback_operators = [] - -inductor_fallback_ops: dict[str, dict[str, list[str]]] = { - "aten._adaptive_avg_pool2d_backward.default": {}, - "aten._adaptive_avg_pool2d.default": {}, - "aten._adaptive_avg_pool3d_backward.default": {}, - "aten._adaptive_avg_pool3d.default": {}, - "aten._addmm_activation.default": {}, - "aten._cdist_backward.default": {}, - "aten._cdist_forward.default": {}, - "aten._cudnn_rnn.default": {}, - "aten._dyn_quant_matmul_4bit.default": {}, - "aten._dyn_quant_pack_4bit_weight.default": {}, - "aten._efficient_attention_backward.default": {}, - "aten._efficient_attention_forward.default": {}, - "aten._efficientzerotensor.default": {}, - "aten._embedding_bag_dense_backward.default": {}, - "aten._embedding_bag_forward_only.default": {}, - "aten._embedding_bag_per_sample_weights_backward.default": {}, - "aten._embedding_bag.default": {}, - "aten._fft_c2c.default": {}, - "aten._fft_r2c.default": {}, - "aten._flash_attention_backward.default": {}, - "aten._flash_attention_forward.default": {}, - "aten._fused_moving_avg_obs_fq_helper_functional.default": {}, - "aten._fused_moving_avg_obs_fq_helper.default": {}, - "aten._fused_rms_norm.default": {}, - "aten._histogramdd_from_bin_cts.default": {}, - "aten._int_mm.out": {}, - "aten._pdist_backward.default": {}, - "aten._pdist_forward.default": {}, - 
"aten._scaled_dot_product_attention_math_for_mps.default": {}, - "aten._scaled_dot_product_cudnn_attention_backward.default": {}, - "aten._scaled_dot_product_cudnn_attention.default": {}, - "aten._scaled_dot_product_efficient_attention_backward.default": {}, - "aten._scaled_dot_product_efficient_attention.default": {}, - "aten._scaled_dot_product_flash_attention_backward.default": {}, - "aten._scaled_dot_product_flash_attention_for_cpu_backward.default": {}, - "aten._scaled_dot_product_flash_attention_for_cpu.default": {}, - "aten._scaled_dot_product_flash_attention.default": {}, - "aten._scaled_dot_product_fused_attention_overrideable_backward.default": {}, - "aten._scaled_dot_product_fused_attention_overrideable.default": {}, - "aten._scaled_mm.default": {}, - "aten._scaled_mm.out": {}, - "aten._segment_reduce_backward.default": {}, - "aten._thnn_fused_lstm_cell.default": {}, - "aten._to_sparse.default": {}, - "aten._trilinear.default": {}, - "aten._weight_int4pack_mm.default": {}, - "aten._weight_int8pack_mm.default": {}, - "aten.abs.default": {}, - "aten.adaptive_max_pool2d_backward.default": {}, - "aten.adaptive_max_pool2d.default": {}, - "aten.adaptive_max_pool3d_backward.default": {}, - "aten.adaptive_max_pool3d.default": {}, - "aten.add.Scalar": {}, - "aten.add.Tensor": {}, - "aten.addbmm.default": {}, - "aten.addmm.out": {}, - "aten.addmv.default": {}, - "aten.angle.default": {}, - "aten.avg_pool2d_backward.default": {}, - "aten.avg_pool2d.default": {}, - "aten.avg_pool3d_backward.default": {}, - "aten.avg_pool3d.default": {}, - "aten.baddbmm.out": {}, - "aten.bernoulli_.float": {}, - "aten.bernoulli_.Tensor": {}, - "aten.bmm.out": {}, - "aten.bucketize.Tensor": {}, - "aten.cat.default": {}, - "aten.cholesky_inverse.default": {}, - "aten.cholesky_solve.default": {}, - "aten.convolution_backward.default": {}, - "aten.convolution.default": {}, - "aten.cummax.default": {}, - "aten.cummin.default": {}, - "aten.cumprod.default": {}, - "aten.cumsum.default": {}, - "aten.exponential.default": {}, - "aten.fill_.Scalar": {}, - "aten.fractional_max_pool2d_backward.default": {}, - "aten.fractional_max_pool2d.default": {}, - "aten.fractional_max_pool3d_backward.default": {}, - "aten.fractional_max_pool3d.default": {}, - "aten.gcd.default": {}, - "aten.geqrf.default": {}, - "aten.grid_sampler_2d_backward.default": {}, - "aten.hann_window.default": {}, - "aten.histc.default": {}, - "aten.histogram.bin_ct": {}, - "aten.index_put.default": {}, - "aten.index_reduce.default": {}, - "aten.index.Tensor": {}, - "aten.kthvalue.default": {}, - "aten.logcumsumexp.default": {}, - "aten.lu_unpack.default": {}, - "aten.masked_scatter_backward.default": {}, - "aten.masked_scatter.default": {}, - "aten.masked_select.default": {}, - "aten.max_pool2d_with_indices_backward.default": {}, - "aten.max_pool2d_with_indices.default": {}, - "aten.max_pool3d_with_indices_backward.default": {}, - "aten.max_pool3d_with_indices.default": {}, - "aten.max_unpool2d.default": {}, - "aten.max_unpool3d.default": {}, - "aten.median.default": {}, - "aten.mm.out": {}, - "aten.mode.default": {}, - "aten.mul.Scalar": {}, - "aten.mul.Tensor": {}, - "aten.nanmedian.default": {}, - "aten.narrow.default": {}, - "aten.native_dropout.default": {}, - "aten.nonzero.default": {}, - "aten.normal_functional.default": {}, - "aten.ormqr.default": {}, - "aten.pad.default": {}, - "aten.permute.default": {}, - "aten.polar.default": {}, - "aten.pow.Scalar": {}, - "aten.pow.Tensor_Scalar": {}, - "aten.pow.Tensor_Tensor": {}, - "aten.rand.default": {}, 
- "aten.rand.generator": {}, - "aten.randint.default": {}, - "aten.randint.generator": {}, - "aten.randint.low_out": {}, - "aten.randint.low": {}, - "aten.randn.default": {}, - "aten.randn.generator": {}, - "aten.randperm.default": {}, - "aten.repeat_interleave.Tensor": {}, - "aten.replication_pad1d_backward.default": {}, - "aten.replication_pad2d_backward.default": {}, - "aten.reshape.default": {}, - "aten.resize_.default": {}, - "aten.resize_as_.default": {}, - "aten.scatter_reduce.two_out": {}, - "aten.scatter.src_out": {}, - "aten.scatter.value_out": {}, - "aten.searchsorted.Scalar": {}, - "aten.searchsorted.Tensor": {}, - "aten.segment_reduce.default": {}, - "aten.set_.source_Tensor": {}, - "aten.slice.Tensor": {}, - "aten.soft_margin_loss_backward.default": {}, - "aten.sort.default": {}, - "aten.sort.stable": {}, - "aten.squeeze.dim": {}, - "aten.to_sparse.default": {}, - "aten.topk.default": {}, - "aten.triangular_solve.default": {}, - "aten.uniform.default": {}, - "aten.upsample_bicubic2d_backward.default": {}, - "aten.upsample_linear1d_backward.default": {}, - "aten.upsample_trilinear3d_backward.default": {}, - "aten.view_as_complex.default": {}, - "aten.view_as_real.default": {}, - "aten.view.dtype": {}, - "aten._weight_int4pack_mm_with_scales_and_zeros.default": {}, +# exist fallback operators in et namespace; should map to inductor_fallback_ops +supported_fallback_operators: Dict[str, Dict[str, List[str]]] = {} + +inductor_fallback_ops: Set[str] = { + "aten._adaptive_avg_pool2d_backward.default", + "aten._adaptive_avg_pool2d.default", + "aten._adaptive_avg_pool3d_backward.default", + "aten._adaptive_avg_pool3d.default", + "aten._addmm_activation.default", + "aten._cdist_backward.default", + "aten._cdist_forward.default", + "aten._cudnn_rnn.default", + "aten._dyn_quant_matmul_4bit.default", + "aten._dyn_quant_pack_4bit_weight.default", + "aten._efficient_attention_backward.default", + "aten._efficient_attention_forward.default", + "aten._efficientzerotensor.default", + "aten._embedding_bag_dense_backward.default", + "aten._embedding_bag_forward_only.default", + "aten._embedding_bag_per_sample_weights_backward.default", + "aten._embedding_bag.default", + "aten._fft_c2c.default", + "aten._fft_r2c.default", + "aten._flash_attention_backward.default", + "aten._flash_attention_forward.default", + "aten._fused_moving_avg_obs_fq_helper_functional.default", + "aten._fused_moving_avg_obs_fq_helper.default", + "aten._fused_rms_norm.default", + "aten._histogramdd_from_bin_cts.default", + "aten._int_mm.out", + "aten._pdist_backward.default", + "aten._pdist_forward.default", + "aten._scaled_dot_product_attention_math_for_mps.default", + "aten._scaled_dot_product_cudnn_attention_backward.default", + "aten._scaled_dot_product_cudnn_attention.default", + "aten._scaled_dot_product_efficient_attention_backward.default", + "aten._scaled_dot_product_efficient_attention.default", + "aten._scaled_dot_product_flash_attention_backward.default", + "aten._scaled_dot_product_flash_attention_for_cpu_backward.default", + "aten._scaled_dot_product_flash_attention_for_cpu.default", + "aten._scaled_dot_product_flash_attention.default", + "aten._scaled_dot_product_fused_attention_overrideable_backward.default", + "aten._scaled_dot_product_fused_attention_overrideable.default", + "aten._scaled_mm.default", + "aten._scaled_mm.out", + "aten._segment_reduce_backward.default", + "aten._thnn_fused_lstm_cell.default", + "aten._to_sparse.default", + "aten._trilinear.default", + "aten._weight_int4pack_mm.default", + 
"aten._weight_int8pack_mm.default", + "aten.abs.default", + "aten.adaptive_max_pool2d_backward.default", + "aten.adaptive_max_pool2d.default", + "aten.adaptive_max_pool3d_backward.default", + "aten.adaptive_max_pool3d.default", + "aten.add.Scalar", + "aten.add.Tensor", + "aten.addbmm.default", + "aten.addmm.out", + "aten.addmv.default", + "aten.angle.default", + "aten.avg_pool2d_backward.default", + "aten.avg_pool2d.default", + "aten.avg_pool3d_backward.default", + "aten.avg_pool3d.default", + "aten.baddbmm.out", + "aten.bernoulli_.float", + "aten.bernoulli_.Tensor", + "aten.bmm.out", + "aten.bucketize.Tensor", + "aten.cat.default", + "aten.cholesky_inverse.default", + "aten.cholesky_solve.default", + "aten.convolution_backward.default", + "aten.convolution.default", + "aten.cummax.default", + "aten.cummin.default", + "aten.cumprod.default", + "aten.cumsum.default", + "aten.exponential.default", + "aten.fill_.Scalar", + "aten.fractional_max_pool2d_backward.default", + "aten.fractional_max_pool2d.default", + "aten.fractional_max_pool3d_backward.default", + "aten.fractional_max_pool3d.default", + "aten.gcd.default", + "aten.geqrf.default", + "aten.grid_sampler_2d_backward.default", + "aten.hann_window.default", + "aten.histc.default", + "aten.histogram.bin_ct", + "aten.index_put.default", + "aten.index_reduce.default", + "aten.index.Tensor", + "aten.kthvalue.default", + "aten.logcumsumexp.default", + "aten.lu_unpack.default", + "aten.masked_scatter_backward.default", + "aten.masked_scatter.default", + "aten.masked_select.default", + "aten.max_pool2d_with_indices_backward.default", + "aten.max_pool2d_with_indices.default", + "aten.max_pool3d_with_indices_backward.default", + "aten.max_pool3d_with_indices.default", + "aten.max_unpool2d.default", + "aten.max_unpool3d.default", + "aten.median.default", + "aten.mm.out", + "aten.mode.default", + "aten.mul.Scalar", + "aten.mul.Tensor", + "aten.nanmedian.default", + "aten.narrow.default", + "aten.native_dropout.default", + "aten.nonzero.default", + "aten.normal_functional.default", + "aten.ormqr.default", + "aten.pad.default", + "aten.permute.default", + "aten.polar.default", + "aten.pow.Scalar", + "aten.pow.Tensor_Scalar", + "aten.pow.Tensor_Tensor", + "aten.rand.default", + "aten.rand.generator", + "aten.randint.default", + "aten.randint.generator", + "aten.randint.low_out", + "aten.randint.low", + "aten.randn.default", + "aten.randn.generator", + "aten.randperm.default", + "aten.repeat_interleave.Tensor", + "aten.replication_pad1d_backward.default", + "aten.replication_pad2d_backward.default", + "aten.reshape.default", + "aten.resize_.default", + "aten.resize_as_.default", + "aten.scatter_reduce.two_out", + "aten.scatter.src_out", + "aten.scatter.value_out", + "aten.searchsorted.Scalar", + "aten.searchsorted.Tensor", + "aten.segment_reduce.default", + "aten.set_.source_Tensor", + "aten.slice.Tensor", + "aten.soft_margin_loss_backward.default", + "aten.sort.default", + "aten.sort.stable", + "aten.squeeze.dim", + "aten.to_sparse.default", + "aten.topk.default", + "aten.triangular_solve.default", + "aten.uniform.default", + "aten.upsample_bicubic2d_backward.default", + "aten.upsample_linear1d_backward.default", + "aten.upsample_trilinear3d_backward.default", + "aten.view_as_complex.default", + "aten.view_as_real.default", + "aten.view.dtype", + "aten._weight_int4pack_mm_with_scales_and_zeros.default", } @@ -193,13 +194,9 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: supported = node.op == "call_function" and ( node.target 
== operator.getitem
             or node.target._op not in inductor_fallback_ops
+            or node.target._op in supported_fallback_operators
         )
 
-        # if node.op == "call_function" and node.target != operator.getitem:
-        #     print(node.target._op)
-        #     print(supported)
-        #     print('------------------')
-
         return supported
 
     def is_node_supported_custom(self, node: torch.fx.Node) -> bool:
@@ -248,3 +245,21 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        """
+        Return a list of operations that should not be decomposed, so that the AOT compiler can handle them.
+        """
+        do_not_decompose = set()
+        op_support = AOTISupportedOperators()
+
+        for node in ep.graph.nodes:
+            if (
+                node.op == "call_function"
+                and isinstance(node.target, torch._ops.OpOverload)
+                and op_support.is_node_supported(None, node)
+            ):
+                do_not_decompose.add(node.target)
+        return list(do_not_decompose), None
diff --git a/backends/arm/third-party/serialization_lib b/backends/arm/third-party/serialization_lib
new file mode 160000
index 00000000000..187af0d41fe
--- /dev/null
+++ b/backends/arm/third-party/serialization_lib
@@ -0,0 +1 @@
+Subproject commit 187af0d41fe75d08d2a7ec84c1b4d24b9b641ed2
diff --git a/export_aoti.py b/export_aoti.py
index bfcf38408d7..d60c6eccad1 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -143,6 +143,9 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"):
     # edge_program = edge_program.to_backend(AotiPartitioner([]))
     # print("To backend done.")
 
+    # the aoti part should be decomposed by the internal torch._inductor.aot_compile
+    # we should preserve the lowerable part and wait for the aoti backend to handle it
+    # Q: maybe need to turn on fallback_random? 
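+    # (Editor's sketch, not part of the original change: under this flow,
+    #  to_edge_transform_and_lower() consults AotiPartitioner.ops_to_not_decompose()
+    #  and preserves those ops while decomposing the rest, so end to end:
+    #
+    #      ep = export(model, example_inputs)
+    #      edge = to_edge_transform_and_lower(ep, partitioner=[AotiPartitioner([])])
+    #      pte_bytes = edge.to_executorch().buffer
+    #
+    #  ops_to_not_decompose is the method added to the partitioner in this
+    #  patch; the preservation behavior is assumed from that API, not verified.)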
edge_program = to_edge_transform_and_lower( aten_dialect, partitioner=[AotiPartitioner([])] ) From aa94acf6ef97a8be852b05d5d199b176d8412f07 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Fri, 22 Aug 2025 13:48:08 -0700 Subject: [PATCH 23/50] update test script to support raw aoti --- backends/aoti/aoti_partitioner.py | 2 + export_and_run_aoti.sh | 28 ++++--- export_aoti.py | 126 +++++++++++++++++++++++++----- 3 files changed, 129 insertions(+), 27 deletions(-) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index 8a17a5364ae..d490b261da6 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -196,6 +196,8 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: or node.target._op not in inductor_fallback_ops or node.target._op in supported_fallback_operators ) + if supported and node.target != operator.getitem: + print(f"op {node.target._op} is supported: {supported}") return supported diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 7aa4950c790..54f1d0b5092 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -10,7 +10,7 @@ # ./export_and_run_aoti.sh conv2d inference # Uses inference mode # ./export_and_run_aoti.sh conv2d --mode=inference # Alternative syntax # -# Available modes: reinstall_all (default), reinstall_aot, reinstall_runtime, inference +# Available modes: reinstall_all (default), reinstall_aot, reinstall_runtime, inference, export_aoti_only # model_arg: argument to pass to export_aoti.py set -e # Exit on any error @@ -26,7 +26,7 @@ for arg in "$@"; do MODE="${arg#*=}" shift ;; - reinstall_all|reinstall_aot|reinstall_runtime|inference) + reinstall_all|reinstall_aot|reinstall_runtime|inference|export_aoti_only) # If it's the second argument and a valid mode, use it as mode if [[ "$arg" == "$2" ]]; then MODE="$arg" @@ -37,17 +37,18 @@ done # Validate mode case "$MODE" in - reinstall_all|reinstall_aot|reinstall_runtime|inference) + reinstall_all|reinstall_aot|reinstall_runtime|inference|export_aoti_only) # Valid mode, continue ;; *) echo "Error: Unknown mode '$MODE'" - echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference" + echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference, export_aoti_only" echo "" echo "Usage examples:" - echo " ./export_and_run_aoti.sh conv2d # Uses default mode" - echo " ./export_and_run_aoti.sh conv2d inference # Positional mode" - echo " ./export_and_run_aoti.sh conv2d --mode=inference # GNU-style mode" + echo " ./export_and_run_aoti.sh conv2d # Uses default mode" + echo " ./export_and_run_aoti.sh conv2d inference # Positional mode" + echo " ./export_and_run_aoti.sh conv2d --mode=inference # GNU-style mode" + echo " ./export_and_run_aoti.sh conv2d export_aoti_only # Export AOTI only (no runtime)" exit 1 ;; esac @@ -94,8 +95,13 @@ install_executorch() { } export_aoti_model() { + local use_aoti_only=$1 echo "Exporting AOTI model..." 
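+    # (Editor's usage sketch, hedged: "linear" is one of the models registered
+    #  in export_aoti.py, and export_aoti_only is the mode added in this patch,
+    #  e.g.
+    #      ./export_and_run_aoti.sh linear export_aoti_only
+    #  which produces a standalone aoti.so via torch._inductor.aot_compile
+    #  instead of an ExecuTorch .pte.)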
- python export_aoti.py $MODEL_ARG + if [[ "$use_aoti_only" == "--aoti_only" ]]; then + python export_aoti.py $MODEL_ARG --aoti_only + else + python export_aoti.py $MODEL_ARG + fi } clean_install_executorch() { @@ -151,9 +157,13 @@ case "$MODE" in export_aoti_model run_inference ;; + "export_aoti_only") + echo "Mode: export_aoti_only - Export model using pure AOTI only (no runtime or installation)" + export_aoti_model "--aoti_only" + ;; *) echo "Error: Unknown mode '$MODE'" - echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference" + echo "Available modes: reinstall_all, reinstall_aot, reinstall_runtime, inference, export_aoti_only" exit 1 ;; esac diff --git a/export_aoti.py b/export_aoti.py index d60c6eccad1..69cb0782c1b 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -1,7 +1,9 @@ #!/usr/bin/env python3 """ Unified export script for AOTI backend. -Usage: python export_aoti.py +Usage: + python export_aoti.py # Uses export_model_to_et_aoti + python export_aoti.py --aoti_only # Uses export_model_to_pure_aoti Supported models: - mv2: MobileNetV2 model @@ -10,6 +12,7 @@ - add: Simple tensor addition model """ +import argparse import copy import os @@ -66,6 +69,25 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): return x + y +class DepthwiseConv(nn.Module): + def __init__(self): + super().__init__() + # 32 input channels, 32 output channels, groups=32 for depthwise + self.conv = nn.Conv2d( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=32, + bias=False, + ) + + def forward(self, x): + return self.conv(x) + + # Model registry mapping model names to their configurations MODEL_REGISTRY: Dict[str, Dict[str, Any]] = { "mv2": { @@ -86,6 +108,12 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): "device": "cuda", "description": "Single Conv2d layer model", }, + "depthwise_conv": { + "model_class": DepthwiseConv, + "input_shapes": [(1, 32, 112, 112)], + "device": "cuda", + "description": "Single Depthwise Conv2d layer model", + }, "add": { "model_class": Add, "input_shapes": [(10,), (10,)], @@ -120,7 +148,7 @@ def get_model_and_inputs( return model, example_inputs -def export_model(model, example_inputs, output_filename="aoti_model.pte"): +def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.pte"): """Export model through the AOTI pipeline.""" all_one_input = tuple( torch.ones_like(example_input) for example_input in example_inputs @@ -135,14 +163,6 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"): aten_dialect = export(model, example_inputs) # 2. to_edge: Make optimizations for Edge devices - # print("Step 2: Converting to Edge program...") - # edge_program = to_edge(aten_dialect) - # print(edge_program.exported_program().graph.print_tabular()) - - # print("Step 3: Converting to backend...") - # edge_program = edge_program.to_backend(AotiPartitioner([])) - # print("To backend done.") - # aoti part should be decomposed by the internal torch._inductor.aot_compile # we should preserve the lowerable part and waiting for aoti backend handle that # Q: maybe need to turn on fallback_random? @@ -163,21 +183,91 @@ def export_model(model, example_inputs, output_filename="aoti_model.pte"): print(f"Export completed successfully! 
Output saved to {output_filename}") +def export_model_to_pure_aoti(model, example_inputs): + """Export model through the AOTI pipeline.""" + all_one_input = tuple( + torch.ones_like(example_input) for example_input in example_inputs + ) + + print("label", model(*all_one_input)) + + print(f"Starting export process...") + + # 1. torch.export: Defines the program with the ATen operator set. + print("Step 1: Converting to ATen dialect...") + aten_dialect = export(model, example_inputs) + + # 2. torch._inductor.aot_compile to aoti delegate + aten_dialect_module = aten_dialect.module() + + output_path = os.path.join(os.getcwd(), "aoti.so") + + options: dict[str, Any] = { + "aot_inductor.package_constants_in_so": True, + "aot_inductor.output_path": output_path, + "aot_inductor.debug_compile": True, + "aot_inductor.repro_level": 3, + "aot_inductor.debug_intermediate_value_printer": "3", + "max_autotune": True, + "max_autotune_gemm_backends": "TRITON", + "max_autotune_conv_backends": "TRITON", + } + + so_path = torch._inductor.aot_compile(aten_dialect_module, example_inputs, options=options) # type: ignore[arg-type] + + assert so_path == output_path, f"Expected {output_path} but got {so_path}" + + check_call( + f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {output_path}", + shell=True, + ) + + def main(): - if len(sys.argv) != 2: - available_models = ", ".join(MODEL_REGISTRY.keys()) - print("Usage: python export_aoti.py ") - print(f"Available models: {available_models}") + # Set up argument parser + parser = argparse.ArgumentParser( + description="Unified export script for AOTI backend", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # Add model name as positional argument + parser.add_argument( + "model_name", + help="Name of the model to export", + choices=list(MODEL_REGISTRY.keys()), + metavar="model_name", + ) + + # Add the --aoti_only flag + parser.add_argument( + "--aoti_only", + action="store_true", + help="Use export_model_to_pure_aoti instead of export_model_to_et_aoti", + ) + + # Parse arguments + args = parser.parse_args() + + # Show available models and descriptions in help + if len(sys.argv) == 1: + parser.print_help() + print(f"\nAvailable models: {', '.join(MODEL_REGISTRY.keys())}") print("\nModel descriptions:") for name, config in MODEL_REGISTRY.items(): print(f" {name}: {config['description']}") sys.exit(1) - model_name = sys.argv[1] - try: - model, example_inputs = get_model_and_inputs(model_name) - export_model(model, example_inputs) + model, example_inputs = get_model_and_inputs(args.model_name) + + # Choose export function based on --aoti_only flag + if args.aoti_only: + print("Using export_model_to_pure_aoti...") + export_model_to_pure_aoti(model, example_inputs) + else: + print("Using export_model_to_et_aoti...") + export_model_to_et_aoti(model, example_inputs) + except ValueError as e: print(f"Error: {e}") sys.exit(1) From 189871e14c296a3ec8e199de75cf697394d4ec79 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 25 Aug 2025 10:36:16 -0700 Subject: [PATCH 24/50] temp commit --- backends/aoti/aoti_backend.py | 2 ++ backends/aoti/aoti_partitioner.py | 14 +++++------ exir/emit/_emit_program.py | 12 +++++++++ export_aoti.py | 42 ++++++++++++++++++++++++++++++- 4 files changed, 62 insertions(+), 8 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index a0793bdb80f..d248ccfae82 100644 
--- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -62,4 +62,6 @@ def preprocess( shell=True, ) + print("so_path", so_path) + return PreprocessResult(so_path.encode("utf-8")) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index d490b261da6..6dfe888fec8 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -191,13 +191,13 @@ class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: - supported = node.op == "call_function" and ( - node.target == operator.getitem - or node.target._op not in inductor_fallback_ops - or node.target._op in supported_fallback_operators - ) - if supported and node.target != operator.getitem: - print(f"op {node.target._op} is supported: {supported}") + # supported = node.op == "call_function" and ( + # node.target == operator.getitem + # or str(node.target._op) not in inductor_fallback_ops + # or str(node.target._op) in supported_fallback_operators + # ) + + supported = node.op == "call_function" return supported diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index 0618871bd40..61997e97687 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -156,8 +156,13 @@ def emit_program( instruction_id_to_num_outs_map = {} program_state = _ProgramState() + print( + "111111111111111111111111111111111111111111111111111111111111111111111111111111" + ) + # emit each entry point in order according to name. for name, exported_program in sorted(methods.items()): + print(name) # create empty state emitter_state = _EmitterState( values=[], @@ -169,6 +174,8 @@ def emit_program( emit_mutable_buffer_names=emit_mutable_buffer_names, ) + print("222222222222222222222222222222222222222222222222222222222222222222222") + gm = _remove_non_user_outputs(exported_program) emitter = _TopLevelEmitter( @@ -176,6 +183,9 @@ def emit_program( ) emitter.run() + + print("333333333333333333333333333333333333333333333333333333333333333333333") + plans.append(emitter.plan()) debug_handle_map[name] = emitter.debug_handle_map @@ -192,6 +202,8 @@ def emit_program( if prim_getters is not None: plans.extend(emitter._emit_prim_getters(prim_getters)) + print("333333333333333333333333333333333333333333333333333333333333333333333") + return EmitterOutput( debug_handle_map=debug_handle_map, method_to_delegate_debug_id_map=method_to_delegate_debug_id_map, diff --git a/export_aoti.py b/export_aoti.py index 69cb0782c1b..a3924750362 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -24,11 +24,13 @@ import torch from executorch.backends.aoti.aoti_partitioner import AotiPartitioner -from executorch.exir import to_edge_transform_and_lower +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import to_edge_transform_and_lower, to_edge from torch import nn from torch.export import export from torchvision import models from torchvision.models.mobilenetv2 import MobileNet_V2_Weights +from torchvision.models.resnet import ResNet18_Weights # Model classes @@ -41,6 +43,15 @@ def forward(self, x: torch.Tensor): return self.mv2(x) +class ResNet18(torch.nn.Module): + def __init__(self): + super(ResNet18, self).__init__() + self.resnet18 = models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1) + + def forward(self, x: torch.Tensor): + return self.resnet18(x) + + class Linear(torch.nn.Module): def __init__(self): super(Linear, self).__init__() @@ -88,6 +99,15 @@ def 
forward(self, x):
         return self.conv(x)
 
 
+class BatchNorm(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.bn = nn.BatchNorm2d(num_features=16)
+
+    def forward(self, x):
+        return self.bn(x)
+
+
 # Model registry mapping model names to their configurations
 MODEL_REGISTRY: Dict[str, Dict[str, Any]] = {
     "mv2": {
@@ -96,6 +116,12 @@ def forward(self, x):
         "device": "cuda",
         "description": "MobileNetV2 model",
     },
+    "resnet18": {
+        "model_class": ResNet18,
+        "input_shapes": [(1, 3, 224, 224)],
+        "device": "cpu",
+        "description": "ResNet18 model",
+    },
     "linear": {
         "model_class": Linear,
         "input_shapes": [(127, 7)],
@@ -120,6 +146,12 @@ def forward(self, x):
         "device": "cuda",
         "description": "Simple tensor addition model",
     },
+    "batchnorm": {
+        "model_class": BatchNorm,
+        "input_shapes": [(1, 16, 32, 32)],
+        "device": "cuda",
+        "description": "Single BatchNorm2d layer model",
+    },
 }
 
 
@@ -162,14 +194,22 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p
     print("Step 1: Converting to ATen dialect...")
     aten_dialect = export(model, example_inputs)
 
+    # print(aten_dialect)
+    # exit(0)
+
     # 2. to_edge: Make optimizations for Edge devices
     # the aoti part should be decomposed by the internal torch._inductor.aot_compile
     # we should preserve the lowerable part and wait for the aoti backend to handle it
     # Q: maybe need to turn on fallback_random?
+
     edge_program = to_edge_transform_and_lower(
         aten_dialect, partitioner=[AotiPartitioner([])]
     )
 
+    # edge_program = to_edge(aten_dialect)
+
+    print(edge_program.exported_program())
+
     # 3. to_executorch: Convert the graph to an ExecuTorch program
     print("Step 4: Converting to ExecuTorch program...")
     executorch_program = edge_program.to_executorch()
From 3ec8024897fd32f070004bdccebc7d58b8101c12 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Mon, 25 Aug 2025 10:38:16 -0700
Subject: [PATCH 25/50] merge to 0825 main

---
 extension/llm/tokenizers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extension/llm/tokenizers b/extension/llm/tokenizers
index 4ed91cc545e..f09feca1584 160000
--- a/extension/llm/tokenizers
+++ b/extension/llm/tokenizers
@@ -1 +1 @@
-Subproject commit 4ed91cc545e9ed7098e53747656eb7eff24eb305
+Subproject commit f09feca15849a790c05b3b7855e7c62ce26ba94b

From 041f2b618176f49bc3826548f7e5e4de0509c7b6 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Mon, 25 Aug 2025 12:11:26 -0700
Subject: [PATCH 26/50] temp commit

---
 backends/arm/third-party/serialization_lib |  1 -
 install_requirements.py                    | 50 ----------------------
 2 files changed, 51 deletions(-)
 delete mode 160000 backends/arm/third-party/serialization_lib

diff --git a/backends/arm/third-party/serialization_lib b/backends/arm/third-party/serialization_lib
deleted file mode 160000
index 187af0d41fe..00000000000
--- a/backends/arm/third-party/serialization_lib
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 187af0d41fe75d08d2a7ec84c1b4d24b9b641ed2
diff --git a/install_requirements.py b/install_requirements.py
index 7e6d3010f93..0e0084fe3dd 100644
--- a/install_requirements.py
+++ b/install_requirements.py
@@ -12,58 +12,8 @@
 from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible
 
-<<<<<<< HEAD
 # This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled.
 TORCH_NIGHTLY_URL_BASE = "https://download.pytorch.org/whl/nightly"
-=======
-def python_is_compatible():
-    # Scrape the version range from pyproject.toml, which should be in the current directory. 
- version_specifier = None - with open("pyproject.toml", "r") as file: - for line in file: - if line.startswith("requires-python"): - match = re.search(r'"([^"]*)"', line) - if match: - version_specifier = match.group(1) - break - - if not version_specifier: - print( - "WARNING: Skipping python version check: version range not found", - file=sys.stderr, - ) - return False - - # Install the packaging module if necessary. - try: - import packaging - except ImportError: - subprocess.run( - [sys.executable, "-m", "pip", "install", "packaging"], check=True - ) - # Compare the current python version to the range in version_specifier. Exits - # with status 1 if the version is not compatible, or with status 0 if the - # version is compatible or the logic itself fails. - try: - import packaging.specifiers - import packaging.version - - python_version = packaging.version.parse(platform.python_version()) - version_range = packaging.specifiers.SpecifierSet(version_specifier) - if python_version not in version_range: - print( - f'ERROR: ExecuTorch does not support python version {python_version}: must satisfy "{version_specifier}"', - file=sys.stderr, - ) - return False - except Exception as e: - print(f"WARNING: Skipping python version check: {e}", file=sys.stderr) - return True - - -# The pip repository that hosts nightly torch packages. -TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cu126" ->>>>>>> fe438f9c92 (Add export_add.py) # Supported CUDA versions - modify this to add/remove supported versions # Format: tuple of (major, minor) version numbers From 8c5bb3bc6de9efb5c9e435c4b65e222adb0509b7 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 25 Aug 2025 23:17:33 -0700 Subject: [PATCH 27/50] use env var to control debug mode --- backends/aoti/aoti_backend.py | 3 - backends/aoti/runtime/shims/memory.cpp | 417 +++++++++++++++--- backends/aoti/runtime/shims/memory.h | 8 + .../aoti/runtime/shims/tensor_attribute.h | 8 + exir/program/_program.py | 9 + export_and_run_aoti.sh | 42 +- export_aoti.py | 9 +- 7 files changed, 430 insertions(+), 66 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index d248ccfae82..f785da00783 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -45,9 +45,6 @@ def preprocess( options: dict[str, typing.Any] = { "aot_inductor.package_constants_in_so": True, "aot_inductor.output_path": output_path, - "aot_inductor.debug_compile": True, - "aot_inductor.repro_level": 3, - "aot_inductor.debug_intermediate_value_printer": "3", "max_autotune": True, "max_autotune_gemm_backends": "TRITON", "max_autotune_conv_backends": "TRITON", diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 2d2bf940833..77a1d26b040 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -166,8 +166,8 @@ AOTITorchError aoti_torch_empty_strided( throw std::runtime_error( "Need to implement empty_strided for non-CUDA non-CPU"); } - std::cout << "Allocated " << nbytes << " bytes at " << ptr << ", sizes_ptr " - << sizes_ptr << std::endl; + std::cout << "////Allocated " << nbytes << " bytes at " << ptr + << ", sizes_ptr " << sizes_ptr << std::endl; // ETensor sizes std::vector sizes(ndim); @@ -175,8 +175,31 @@ AOTITorchError aoti_torch_empty_strided( sizes[i] = sizes_ptr[i]; } + std::cout << "Sizes: "; + for (int i = 0; i < ndim; i++) { + std::cout << sizes[i] << ", "; + } + + std::cout << std::endl; + + // ETensor strides + std::vector 
<int64_t> strides(ndim);
+  if (strides_ptr != nullptr) {
+    // Use provided strides
+    for (int i = 0; i < ndim; i++) {
+      strides[i] = strides_ptr[i];
+    }
+  } else {
+    // Calculate strides from sizes, assume it is in contiguous memory format
+    strides[ndim - 1] = 1; // Last dimension has stride 1
+    for (int i = ndim - 2; i >= 0; i--) {
+      strides[i] = strides[i + 1] * sizes_ptr[i + 1];
+    }
+  }
+  std::cout << std::endl;
+
   // ETensor creation
-  auto tensor = executorch::extension::make_tensor_ptr(sizes, ptr);
+  auto tensor = executorch::extension::from_blob(ptr, sizes, strides);
 
   // Store the tensor so it doesn't get destroyed
   tensors.insert(tensor);
@@ -269,76 +292,259 @@ AOTITorchError aoti_torch_copy_(
     AOTITensorHandle self,
     AOTITensorHandle src,
     int32_t non_blocking) {
-  // check if size is the same
+  std::cout << "aoti_torch_copy_ called: self=" << self << ", src=" << src
+            << std::endl;
+
+  // assert same dim for now
   if (self->dim() != src->dim()) {
-    std::cout << "self.dim() " << self->dim() << ", src.dim() " << src->dim()
-              << std::endl;
-    throw std::runtime_error("self.dim() != src.dim()");
-  }
-  std::cout << "self->data_ptr(): " << self->data_ptr()
-            << " sizes: " << self->sizes().data() << std::endl;
-  std::cout << "src->data_ptr(): " << src->data_ptr()
-            << " sizes: " << src->sizes().data() << std::endl;
-  for (int i = 0; i < self->dim(); i++) {
-    if (self->sizes()[i] != src->sizes()[i]) {
-      std::cout << "self.sizes()[i] " << self->sizes()[i] << ", src.sizes()[i] "
-                << src->sizes()[i] << std::endl;
-      throw std::runtime_error("size mismatch");
+    std::cout << "Error: dimension mismatch. self.dim()=" << self->dim()
+              << ", src.dim()=" << src->dim() << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // only support float32 for now
+  int32_t self_dtype, src_dtype;
+  aoti_torch_get_dtype(self, &self_dtype);
+  aoti_torch_get_dtype(src, &src_dtype);
+
+  if (self_dtype != 6 || src_dtype != 6) { // 6 = float32
+    std::cout << "Error: Only float32 tensors supported. 
Got self.dtype="
+              << self_dtype << ", src.dtype=" << src_dtype << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Get stride information for layout validation
+  int64_t* self_strides;
+  int64_t* src_strides;
+  aoti_torch_get_strides(self, &self_strides);
+  aoti_torch_get_strides(src, &src_strides);
+
+  auto self_sizes = self->sizes();
+  auto src_sizes = src->sizes();
+
+  // only contiguous or channels-last layouts are allowed for ETensor
+  bool self_is_contiguous = true;
+  bool src_is_contiguous = true;
+  bool self_is_channels_last = false;
+  bool src_is_channels_last = false;
+
+  // Check if contiguous (strides decrease from left to right)
+  int64_t expected_stride = 1;
+  for (int i = self->dim() - 1; i >= 0; i--) {
+    if (self_strides[i] != expected_stride) {
+      self_is_contiguous = false;
     }
+    expected_stride *= self_sizes[i];
   }
-  int size = src->nbytes();
-  // should check for device
+  expected_stride = 1;
+  for (int i = src->dim() - 1; i >= 0; i--) {
+    if (src_strides[i] != expected_stride) {
+      src_is_contiguous = false;
+    }
+    expected_stride *= src_sizes[i];
+  }
+
+  // Check if channels-last (4D: NHWC, strides in order [H*W*C, 1, W*C, C])
+  if (self->dim() == 4 && !self_is_contiguous) {
+    int64_t N = self_sizes[0], H = self_sizes[1], W = self_sizes[2],
+            C = self_sizes[3];
+    if ((self_strides[0] == H * W * C || N <= 1) && (self_strides[1] == W * C || H <= 1) &&
+        (self_strides[2] == C || W <= 1) && (self_strides[3] == 1 || C <= 1)) {
+      self_is_channels_last = true;
+    }
+  }
+
+  if (src->dim() == 4 && !src_is_contiguous) {
+    int64_t N = src_sizes[0], H = src_sizes[1], W = src_sizes[2],
+            C = src_sizes[3];
+    if ((src_strides[0] == H * W * C || N <= 1) && (src_strides[1] == W * C || H <= 1) &&
+        (src_strides[2] == C || W <= 1) && (src_strides[3] == 1 || C <= 1)) {
+      src_is_channels_last = true;
+    }
+  }
+
+  // Validate layout assumptions
+  if (!self_is_contiguous && !self_is_channels_last) {
+    std::cout << "Error: self tensor must be contiguous or channels-last. "
+              << "Got strides: [";
+    for (int i = 0; i < self->dim(); i++) {
+      std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : "");
+    }
+    std::cout << "]" << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  if (!src_is_contiguous && !src_is_channels_last) {
+    std::cout << "Error: src tensor must be contiguous or channels-last. "
+              << "Got strides: [";
+    for (int i = 0; i < src->dim(); i++) {
+      std::cout << src_strides[i] << (i < src->dim() - 1 ? ", " : "");
+    }
+    std::cout << "]" << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Determine device locations
   cudaPointerAttributes srcAttributes, dstAttributes;
   cudaError_t err;
-  // Get attributes of the source pointer
+
   err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr());
   checkCudaError(err, "Failed to get source pointer attributes");
-  // Get attributes of the destination pointer
+
   err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr());
   checkCudaError(err, "Failed to get destination pointer attributes");
+
   bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice;
   bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice;
-  // Determine the memory locations and perform the appropriate copy
-  if (srcIsDevice && dstIsDevice) {
-    // Device to Device copy
-    err = cudaMemcpy(
-        self->mutable_data_ptr(),
-        src->data_ptr(),
-        size,
-        cudaMemcpyDeviceToDevice);
-    checkCudaError(err, "Failed to copy from device to device");
-  } else if (srcIsDevice && !dstIsDevice) {
-    // Device to Host copy
+
+  std::cout << "Copy layout: src="
+            << (src_is_contiguous ? 
"contiguous" : "channels-last") << " (" + << (srcIsDevice ? "GPU" : "CPU") << ") -> " + << "dst=" << (self_is_contiguous ? "contiguous" : "channels-last") + << " (" << (dstIsDevice ? "GPU" : "CPU") << ")" << std::endl; + + size_t total_bytes = src->nbytes(); + + // Check if we can do a simple memcpy (same layout) + bool same_layout = (self_is_contiguous && src_is_contiguous) || + (self_is_channels_last && src_is_channels_last); + + if (same_layout) { + std::cout << "Same layout - doing direct copy of " << total_bytes + << " bytes" << std::endl; + + // Simple copy since layouts match + if (srcIsDevice && dstIsDevice) { + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyDeviceToDevice); + checkCudaError(err, "Failed to copy from device to device"); + } else if (srcIsDevice && !dstIsDevice) { + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyDeviceToHost); + checkCudaError(err, "Failed to copy from device to host"); + } else if (!srcIsDevice && dstIsDevice) { + err = cudaMemcpy( + self->mutable_data_ptr(), + src->data_ptr(), + total_bytes, + cudaMemcpyHostToDevice); + checkCudaError(err, "Failed to copy from host to device"); + } else { + std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes); + } + } else { + // Layout conversion needed (contiguous <-> channels-last) + std::cout << "Layout conversion needed - doing element-wise copy" + << std::endl; + + if (self->dim() != 4) { + std::cout << "Error: Layout conversion only supported for 4D tensors" + << std::endl; + return Error::NotImplemented; + } + + // Get data to host for processing + size_t total_elements = total_bytes / sizeof(float); + float* src_host_data = nullptr; + float* dst_host_data = nullptr; + bool need_free_src = false; + bool need_free_dst = false; + + if (srcIsDevice) { + src_host_data = new float[total_elements]; + err = cudaMemcpy( + src_host_data, src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost); + checkCudaError(err, "Failed to copy src to host"); + need_free_src = true; + } else { + src_host_data = static_cast(src->data_ptr()); + } + + if (dstIsDevice) { + dst_host_data = new float[total_elements]; + need_free_dst = true; + } else { + dst_host_data = static_cast(self->mutable_data_ptr()); + } + + // Perform layout conversion (4D NCHW <-> NHWC) + int64_t N = self_sizes[0], C = self_sizes[1], H = self_sizes[2], + W = self_sizes[3]; + + for (int64_t n = 0; n < N; n++) { + for (int64_t c = 0; c < C; c++) { + for (int64_t h = 0; h < H; h++) { + for (int64_t w = 0; w < W; w++) { + size_t src_offset, dst_offset; + + if (src_is_contiguous) { + // Source is NCHW + src_offset = n * C * H * W + c * H * W + h * W + w; + } else { + // Source is NHWC + src_offset = n * H * W * C + h * W * C + w * C + c; + } + + if (self_is_contiguous) { + // Destination is NCHW + dst_offset = n * C * H * W + c * H * W + h * W + w; + } else { + // Destination is NHWC + dst_offset = n * H * W * C + h * W * C + w * C + c; + } + + dst_host_data[dst_offset] = src_host_data[src_offset]; + } + } + } + } + + // Copy result back to device if needed + if (dstIsDevice) { + err = cudaMemcpy( + self->mutable_data_ptr(), + dst_host_data, + total_bytes, + cudaMemcpyHostToDevice); + checkCudaError(err, "Failed to copy result to device"); + } + + // Clean up temporary buffers + if (need_free_src) + delete[] src_host_data; + if (need_free_dst) + delete[] dst_host_data; + } + + // Verify the copy by checking first element + float src_first, dst_first; + if 
+    for (int64_t n = 0; n < N; n++) {
+      for (int64_t c = 0; c < C; c++) {
+        for (int64_t h = 0; h < H; h++) {
+          for (int64_t w = 0; w < W; w++) {
+            size_t src_offset, dst_offset;
+
+            if (src_is_contiguous) {
+              // Source is NCHW
+              src_offset = n * C * H * W + c * H * W + h * W + w;
+            } else {
+              // Source is NHWC
+              src_offset = n * H * W * C + h * W * C + w * C + c;
+            }
+
+            if (self_is_contiguous) {
+              // Destination is NCHW
+              dst_offset = n * C * H * W + c * H * W + h * W + w;
+            } else {
+              // Destination is NHWC
+              dst_offset = n * H * W * C + h * W * C + w * C + c;
+            }
+
+            dst_host_data[dst_offset] = src_host_data[src_offset];
+          }
+        }
+      }
+    }
+
+    // Copy result back to device if needed
+    if (dstIsDevice) {
+      err = cudaMemcpy(
+          self->mutable_data_ptr(),
+          dst_host_data,
+          total_bytes,
+          cudaMemcpyHostToDevice);
+      checkCudaError(err, "Failed to copy result to device");
+    }
+
+    // Clean up temporary buffers
+    if (need_free_src)
+      delete[] src_host_data;
+    if (need_free_dst)
+      delete[] dst_host_data;
+  }
+
+  // Verify the copy by checking first element
+  float src_first, dst_first;
+  if (srcIsDevice) {
     err = cudaMemcpy(
-        self->mutable_data_ptr(),
-        src->data_ptr(),
-        size,
-        cudaMemcpyDeviceToHost);
-    std::cout << "Device to Host copy, self data: "
-              << ((float*)self->data_ptr())[0] << std::endl;
-    checkCudaError(err, "Failed to copy from device to host");
-  } else if (!srcIsDevice && dstIsDevice) {
-    // Host to Device copy
+        &src_first, src->data_ptr(), sizeof(float), cudaMemcpyDeviceToHost);
+    checkCudaError(err, "Failed to copy first src element");
+  } else {
+    src_first = static_cast<float*>(src->data_ptr())[0];
+  }
+
+  if (dstIsDevice) {
     err = cudaMemcpy(
-        self->mutable_data_ptr(),
-        src->data_ptr(),
-        size,
-        cudaMemcpyHostToDevice);
-    std::cout << "Host to Device copy, src data: "
-              << ((float*)src->data_ptr())[0] << std::endl;
-    checkCudaError(err, "Failed to copy from host to device");
-  } else if (!srcIsDevice && !dstIsDevice) {
-    // Host to Host copy
-    std::cout << "Host to Host copy, src data: " << ((float*)src->data_ptr())[0]
-              << std::endl;
-    std::memcpy(self->mutable_data_ptr(), src->data_ptr(), size);
+        &dst_first, self->data_ptr(), sizeof(float), cudaMemcpyDeviceToHost);
+    checkCudaError(err, "Failed to copy first dst element");
   } else {
-    std::cerr << "Error: Unknown memory type. self: " << dstAttributes.type
-              << ", src: " << srcAttributes.type << std::endl;
-    throw std::runtime_error("Unknown memory type");
+    dst_first = static_cast<float*>(self->data_ptr())[0];
   }
-  // print first value of src and self
+
+  std::cout << "Copy verification: src[0]=" << src_first
+            << ", dst[0]=" << dst_first << std::endl;
+  std::cout << "aoti_torch_copy_ completed successfully" << std::endl;
+
   return Error::Ok;
 }
 
@@ -385,6 +591,103 @@ AOTITorchError aoti_torch_delete_cuda_stream_guard(
   return Error::Ok;
 }
 
+AOTITorchError aoti_torch__reinterpret_tensor(
+    AOTITensorHandle self,
+    int64_t ndim,
+    const int64_t* sizes_ptr,
+    const int64_t* strides_ptr,
+    int64_t storage_offset,
+    AOTITensorHandle* ret_new_tensor) {
+  std::cout << "aoti_torch__reinterpret_tensor called with tensor " << self
+            << ", ndim: " << ndim << ", storage_offset: " << storage_offset
+            << std::endl;
+
+  for (int i = 0; i < ndim; i++) {
+    std::cout << "sizes[" << i << "]: " << sizes_ptr[i] << std::endl;
+  }
+  for (int i = 0; i < ndim; i++) {
+    std::cout << "strides[" << i << "]: " << strides_ptr[i] << std::endl;
+  }
+
+  // Check if storage_offset is not 0 - return error if not
+  if (storage_offset != 0) {
+    std::cout
+        << "Error: aoti_torch__reinterpret_tensor does not support non-zero storage_offset: "
+        << storage_offset << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Check if dimensions match
+  if (self->dim() != ndim) {
+    std::cout << "Error: tensor dimension mismatch. 
self->dim(): " + << self->dim() << ", provided ndim: " << ndim << std::endl; + return Error::InvalidArgument; + } + + // Get tensor properties from the input tensor + int32_t dtype; + AOTITorchError dtype_err = aoti_torch_get_dtype(self, &dtype); + if (dtype_err != Error::Ok) { + std::cout << "Error: failed to get dtype from input tensor" << std::endl; + return dtype_err; + } + + int32_t device_type; + AOTITorchError device_type_err = + aoti_torch_get_device_type(self, &device_type); + if (device_type_err != Error::Ok) { + std::cout << "Error: failed to get device_type from input tensor" + << std::endl; + return device_type_err; + } + + int32_t device_index; + AOTITorchError device_index_err = + aoti_torch_get_device_index(self, &device_index); + if (device_index_err != Error::Ok) { + std::cout << "Error: failed to get device_index from input tensor" + << std::endl; + return device_index_err; + } + + std::cout << "Creating new tensor with dtype: " << dtype + << ", device_type: " << device_type + << ", device_index: " << device_index << std::endl; + + // Create new tensor with the provided sizes and strides using + // aoti_torch_empty_strided + AOTITorchError create_err = aoti_torch_empty_strided( + ndim, + sizes_ptr, + strides_ptr, + dtype, + device_type, + device_index, + ret_new_tensor); + + if (create_err != Error::Ok) { + std::cout << "Error: failed to create new tensor with empty_strided" + << std::endl; + return create_err; + } + + // Copy data from source tensor to new tensor + AOTITorchError copy_err = aoti_torch_copy_(*ret_new_tensor, self, 0); + if (copy_err != Error::Ok) { + std::cout << "Error: failed to copy data from source tensor to new tensor" + << std::endl; + // Clean up the created tensor on failure + aoti_torch_delete_tensor_object(*ret_new_tensor); + *ret_new_tensor = nullptr; + return copy_err; + } + + std::cout << "Successfully created reinterpreted tensor " << *ret_new_tensor + << " from source tensor " << self << std::endl; + + return Error::Ok; +} + // Cleanup function for clearing global state void cleanup_memory() { is_tensor_own_memory.clear(); diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 996c729b4be..0b8af138c90 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -92,6 +92,14 @@ AOTITorchError aoti_torch_create_cuda_stream_guard( AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); +AOTITorchError aoti_torch__reinterpret_tensor( + AOTITensorHandle self, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + AOTITensorHandle* ret_new_tensor); + // Utility functions void checkCudaError(cudaError_t err, const char* msg); void cleanup_memory(); diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h index 3ed966f99dc..f419f7db632 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/runtime/shims/tensor_attribute.h @@ -56,6 +56,14 @@ AOTITorchError aoti_torch_get_storage_size( AOTITensorHandle tensor, int64_t* ret_size); +AOTITorchError aoti_torch_get_device_type( + AOTITensorHandle tensor, + int32_t* ret_device_type); + +AOTITorchError aoti_torch_get_device_index( + AOTITensorHandle tensor, + int32_t* ret_device_index); + // Utility functions for device and layout information int32_t aoti_torch_device_type_cpu(); int32_t aoti_torch_device_type_cuda(); diff --git a/exir/program/_program.py b/exir/program/_program.py 
index af94399a3ed..760056e32bb 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1697,8 +1697,17 @@ def to_executorch( # noqa (FLAKE8) C901 after it has been transformed to the ExecuTorch backend. """ config = config if config else ExecutorchBackendConfig() + + def exported_program_to_device(exported_program, device): + for _, param in exported_program.named_parameters(): + param.data = param.data.to(device) + for _, buffer in exported_program.named_buffers(): + buffer.data = buffer.data.to(device) + return exported_program + execution_programs: Dict[str, ExportedProgram] = {} for name, program in self._edge_programs.items(): + program = exported_program_to_device(program, "cpu") if config.do_quant_fusion_and_const_prop: if program.graph_signature.backward_signature is not None: raise Exception( diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 54f1d0b5092..e850c2bb6bb 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -18,14 +18,19 @@ set -e # Exit on any error # Parse command line arguments MODE="reinstall_all" MODEL_ARG="$1" +DEBUG_MODE=false -# Parse arguments for mode +# Parse arguments for mode and debug flag for arg in "$@"; do case $arg in --mode=*) MODE="${arg#*=}" shift ;; + --debug) + DEBUG_MODE=true + shift + ;; reinstall_all|reinstall_aot|reinstall_runtime|inference|export_aoti_only) # If it's the second argument and a valid mode, use it as mode if [[ "$arg" == "$2" ]]; then @@ -49,6 +54,7 @@ case "$MODE" in echo " ./export_and_run_aoti.sh conv2d inference # Positional mode" echo " ./export_and_run_aoti.sh conv2d --mode=inference # GNU-style mode" echo " ./export_and_run_aoti.sh conv2d export_aoti_only # Export AOTI only (no runtime)" + echo " ./export_and_run_aoti.sh conv2d --mode=inference --debug # With debug options enabled" exit 1 ;; esac @@ -130,10 +136,30 @@ run_inference() { ./cmake-out/executor_runner --model_path aoti_model.pte } +# Set up environment variables based on debug flag +if [[ "$DEBUG_MODE" == true ]]; then + echo "Setting debug environment variables..." 
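+    # (Editor's note, hedged: these variables replace the aot_inductor.debug_compile,
+    #  repro_level, and debug_intermediate_value_printer options removed from
+    #  aoti_backend.py in this same patch; the exact environment variable names
+    #  are taken from this script and are not verified against a given PyTorch
+    #  nightly.)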
+ export AOT_INDUCTOR_DEBUG_COMPILE="1" + export AOTINDUCTOR_REPRO_LEVEL=3 + export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + echo "Debug variables set:" + echo " AOT_INDUCTOR_DEBUG_COMPILE=$AOT_INDUCTOR_DEBUG_COMPILE" + echo " AOTINDUCTOR_REPRO_LEVEL=$AOTINDUCTOR_REPRO_LEVEL" + echo " AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=$AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER" +else + # Ensure debug variables are unset for non-debug modes + unset AOT_INDUCTOR_DEBUG_COMPILE + unset AOTINDUCTOR_REPRO_LEVEL + unset AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER +fi + # Execute based on mode case "$MODE" in "reinstall_all") - echo "Mode: reinstall_all - Full reinstall and run" + echo "Mode: $MODE - Full reinstall and run" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi install_executorch export_aoti_model clean_install_executorch @@ -142,23 +168,35 @@ case "$MODE" in ;; "reinstall_aot") echo "Mode: reinstall_aot - Reinstall AOT components and run e2e" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi install_executorch export_aoti_model run_inference ;; "reinstall_runtime") echo "Mode: reinstall_runtime - Rebuild runtime and run e2e" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi export_aoti_model build_runtime run_inference ;; "inference") echo "Mode: inference - Export model and run inference only" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi export_aoti_model run_inference ;; "export_aoti_only") echo "Mode: export_aoti_only - Export model using pure AOTI only (no runtime or installation)" + if [[ "$DEBUG_MODE" == true ]]; then + echo "Debug options enabled with AOT Inductor debug settings" + fi export_aoti_model "--aoti_only" ;; *) diff --git a/export_aoti.py b/export_aoti.py index a3924750362..2550f33a55a 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -24,8 +24,9 @@ import torch from executorch.backends.aoti.aoti_partitioner import AotiPartitioner -from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner -from executorch.exir import to_edge_transform_and_lower, to_edge + +# from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import to_edge, to_edge_transform_and_lower from torch import nn from torch.export import export from torchvision import models @@ -119,7 +120,7 @@ def forward(self, x): "resnet18": { "model_class": ResNet18, "input_shapes": [(1, 3, 224, 224)], - "device": "cpu", + "device": "cuda", "description": "ResNet18 model", }, "linear": { @@ -247,7 +248,7 @@ def export_model_to_pure_aoti(model, example_inputs): "aot_inductor.output_path": output_path, "aot_inductor.debug_compile": True, "aot_inductor.repro_level": 3, - "aot_inductor.debug_intermediate_value_printer": "3", + "aot_inductor.debug_intermediate_value_printer": "2", "max_autotune": True, "max_autotune_gemm_backends": "TRITON", "max_autotune_conv_backends": "TRITON", From 687bcdf8c939406f03c7bb184efdd5cd9d6ce369 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 26 Aug 2025 14:08:38 -0700 Subject: [PATCH 28/50] enable dump runtime intermediate output for aoti delegated part --- .gitignore | 1 + backends/aoti/CMakeLists.txt | 3 +- backends/aoti/runtime/aoti_backend.cpp | 2 + backends/aoti/runtime/shims/memory.cpp | 135 +++++++----- .../aoti/runtime/shims/tensor_attribute.cpp | 
7 + .../aoti/runtime/shims/tensor_attribute.h | 2 + backends/aoti/runtime/shims/utils.cpp | 202 ++++++++++++++++++ backends/aoti/runtime/shims/utils.h | 39 ++++ backends/aoti/runtime/targets.bzl | 2 + export_and_run_aoti.sh | 64 ++++-- 10 files changed, 393 insertions(+), 64 deletions(-) create mode 100644 backends/aoti/runtime/shims/utils.cpp create mode 100644 backends/aoti/runtime/shims/utils.h diff --git a/.gitignore b/.gitignore index 78268c70d8c..92b68cbc2d7 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ tokenizer.json *kernel.cpp *wrapper_metadata.json *wrapper.cpp +aoti_intermediate_output.txt # Editor temporaries *.idea diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index 6922d5e9356..ca26f30d73e 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -31,7 +31,8 @@ set(_aoti_sources runtime/aoti_backend.cpp runtime/aoti_model_container.cpp runtime/shims/memory.cpp - runtime/shims/tensor_attribute.cpp) + runtime/shims/tensor_attribute.cpp + runtime/shims/utils.cpp) add_library(aoti_backend STATIC ${_aoti_sources}) target_include_directories( aoti_backend diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 453613d47f8..03c46c03bdd 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -29,6 +29,7 @@ #include "aoti_model_container.h" #include "shims/memory.h" #include "shims/tensor_attribute.h" +#include "shims/utils.h" // Include CUDA AOTI shims #include @@ -374,6 +375,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { free(handle); cleanup_memory(); cleanup_tensor_metadata(); + cleanup_aoti_tensor_output(); ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); } }; diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 77a1d26b040..09a773dd43e 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -322,67 +322,106 @@ AOTITorchError aoti_torch_copy_( auto self_sizes = self->sizes(); auto src_sizes = src->sizes(); - // contiguous or channel-last layouts allowed in ettensor + // Check if tensors have the same tensor schema (sizes, strides, dtype) + bool same_schema = true; + + // Check sizes match + for (int i = 0; i < self->dim(); i++) { + if (self_sizes[i] != src_sizes[i]) { + same_schema = false; + break; + } + } + + // Check strides match (only if sizes match) + if (same_schema) { + for (int i = 0; i < self->dim(); i++) { + if (self_strides[i] != src_strides[i]) { + same_schema = false; + break; + } + } + } + + // Declare layout variables for both cases bool self_is_contiguous = true; bool src_is_contiguous = true; bool self_is_channels_last = false; bool src_is_channels_last = false; - // Check if contiguous (strides decrease from left to right) - int64_t expected_stride = 1; - for (int i = self->dim() - 1; i >= 0; i--) { - if (self_strides[i] != expected_stride) { - self_is_contiguous = false; + if (same_schema) { + std::cout << "Same tensor schema detected - enabling naive copy" + << std::endl; + // For same schema, we don't need to check memory formats - just use direct + // copy + } else { + // Different strides: check memory format and only support contiguous <-> + // channels-last conversion + std::cout + << "Different tensor schemas - checking memory format compatibility" + << std::endl; + + // Check if contiguous (strides decrease from left to right) + int64_t expected_stride = 1; + for (int i = 
self->dim() - 1; i >= 0; i--) { + if (self_strides[i] != expected_stride) { + self_is_contiguous = false; + } + expected_stride *= self_sizes[i]; } - expected_stride *= self_sizes[i]; - } - expected_stride = 1; - for (int i = src->dim() - 1; i >= 0; i--) { - if (src_strides[i] != expected_stride) { - src_is_contiguous = false; + expected_stride = 1; + for (int i = src->dim() - 1; i >= 0; i--) { + if (src_strides[i] != expected_stride) { + src_is_contiguous = false; + } + expected_stride *= src_sizes[i]; } - expected_stride *= src_sizes[i]; - } - // Check if channels-last (4D: NHWC, strides in order [H*W*C, 1, W*C, C]) - if (self->dim() == 4 && !self_is_contiguous) { - int64_t N = self_sizes[0], H = self_sizes[1], W = self_sizes[2], - C = self_sizes[3]; - if ((self_strides[0] == H * W * C || N <= 1) && (self_strides[1] == W * C || H <= 1) && - (self_strides[2] == C || W == 1) && (self_strides[3] == 1 || C == 1)) { - self_is_channels_last = true; + // Check if channels-last (4D: NHWC, strides in order [H*W*C, 1, W*C, C]) + if (self->dim() == 4 && !self_is_contiguous) { + int64_t N = self_sizes[0], H = self_sizes[1], W = self_sizes[2], + C = self_sizes[3]; + if ((self_strides[0] == H * W * C || N <= 1) && + (self_strides[1] == W * C || H <= 1) && + (self_strides[2] == C || W == 1) && + (self_strides[3] == 1 || C == 1)) { + self_is_channels_last = true; + } } - } - if (src->dim() == 4 && !src_is_contiguous) { - int64_t N = src_sizes[0], H = src_sizes[1], W = src_sizes[2], - C = src_sizes[3]; - if ((src_strides[0] == H * W * C || N <= 1) &&( src_strides[1] == W * C || H <= 1) && - (src_strides[2] == C || W <= 1) && (src_strides[3] == 1 || C <= 1)) { - src_is_channels_last = true; + if (src->dim() == 4 && !src_is_contiguous) { + int64_t N = src_sizes[0], H = src_sizes[1], W = src_sizes[2], + C = src_sizes[3]; + if ((src_strides[0] == H * W * C || N <= 1) && + (src_strides[1] == W * C || H <= 1) && + (src_strides[2] == C || W <= 1) && (src_strides[3] == 1 || C <= 1)) { + src_is_channels_last = true; + } } - } - // Validate layout assumptions - if (!self_is_contiguous && !self_is_channels_last) { - std::cout << "Error: self tensor must be contiguous or channels-last. " - << "Got strides: ["; - for (int i = 0; i < self->dim(); i++) { - std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : ""); + // Validate layout assumptions only when schemas differ + if (!self_is_contiguous && !self_is_channels_last) { + std::cout + << "Error: self tensor must be contiguous or channels-last for stride conversion. " + << "Got strides: ["; + for (int i = 0; i < self->dim(); i++) { + std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : ""); + } + std::cout << "]" << std::endl; + return Error::InvalidArgument; } - std::cout << "]" << std::endl; - return Error::InvalidArgument; - } - if (!src_is_contiguous && !src_is_channels_last) { - std::cout << "Error: src tensor must be contiguous or channels-last. " - << "Got strides: ["; - for (int i = 0; i < src->dim(); i++) { - std::cout << src_strides[i] << (i < src->dim() - 1 ? ", " : ""); + if (!src_is_contiguous && !src_is_channels_last) { + std::cout + << "Error: src tensor must be contiguous or channels-last for stride conversion. " + << "Got strides: ["; + for (int i = 0; i < src->dim(); i++) { + std::cout << src_strides[i] << (i < src->dim() - 1 ? 
", " : ""); + } + std::cout << "]" << std::endl; + return Error::InvalidArgument; } - std::cout << "]" << std::endl; - return Error::InvalidArgument; } // Determine device locations @@ -406,11 +445,7 @@ AOTITorchError aoti_torch_copy_( size_t total_bytes = src->nbytes(); - // Check if we can do a simple memcpy (same layout) - bool same_layout = (self_is_contiguous && src_is_contiguous) || - (self_is_channels_last && src_is_channels_last); - - if (same_layout) { + if (same_schema) { std::cout << "Same layout - doing direct copy of " << total_bytes << " bytes" << std::endl; diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index eb3d0e22371..dcea848597e 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -123,6 +123,13 @@ AOTITorchError aoti_torch_get_device_index( return Error::Ok; } +AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim) { + *ret_dim = tensor->dim(); + std::cout << "getting dim from tensor " << tensor << " = " << *ret_dim + << std::endl; + return Error::Ok; +} + int32_t aoti_torch_device_type_cpu() { // Let's say cpu is 0 for ET as well return 0; diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h index f419f7db632..ab4f8037ebf 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/runtime/shims/tensor_attribute.h @@ -64,6 +64,8 @@ AOTITorchError aoti_torch_get_device_index( AOTITensorHandle tensor, int32_t* ret_device_index); +AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim); + // Utility functions for device and layout information int32_t aoti_torch_device_type_cpu(); int32_t aoti_torch_device_type_cuda(); diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp new file mode 100644 index 00000000000..10882c16cf4 --- /dev/null +++ b/backends/aoti/runtime/shims/utils.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "utils.h" +#include +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +namespace internal { +// Constants for file operations +const char* const TENSOR_OUTPUT_FILENAME = + "/home/gasoonjia/executorch/aoti_intermediate_output.txt"; +} // namespace internal + +extern "C" { + +void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) { + printf("Printing tensor handle: %p\n", self); + + if (!self) { + throw std::runtime_error("Tensor handle is null"); + } + + printf("Tensor handle is not null\n"); + + // Get dtype and check if it's float32 (dtype 6 in PyTorch) + int32_t dtype = 0; + if (aoti_torch_get_dtype(self, &dtype) != AOTI_TORCH_SUCCESS) { + throw std::runtime_error("Failed to get tensor dtype"); + } + + printf("Tensor dtype is: %d\n", dtype); + + if (dtype != 6) { // 6 is the dtype code for float32 in PyTorch + throw std::runtime_error( + "Tensor dtype is not float32. 
Expected dtype 6, got: " + + std::to_string(dtype)); + } + + printf("Tensor dtype is float32\n"); + + // Get data pointer + void* data_ptr = nullptr; + if (aoti_torch_get_data_ptr(self, &data_ptr) != AOTI_TORCH_SUCCESS || + !data_ptr) { + throw std::runtime_error("Failed to get tensor data pointer"); + } + + printf("Tensor data pointer is %p not null\n", data_ptr); + + // Get dimensions + int64_t dim = 0; + if (aoti_torch_get_dim(self, &dim) != AOTI_TORCH_SUCCESS) { + throw std::runtime_error("Failed to get tensor dimensions"); + } + + printf("Tensor dimensions are: %ld\n", dim); + + // Get sizes + int64_t* sizes = nullptr; + if (aoti_torch_get_sizes(self, &sizes) != AOTI_TORCH_SUCCESS || !sizes) { + throw std::runtime_error("Failed to get tensor sizes"); + } + + printf("Tensor sizes are: %ld\n", sizes); + + // Calculate total number of elements + int64_t total_elements = 1; + for (int i = 0; i < dim; i++) { + total_elements *= sizes[i]; + } + + printf("Total elements in tensor: %ld\n", total_elements); + + // Check device type to handle CUDA tensors properly + int32_t device_type = 0; + if (aoti_torch_get_device_type(self, &device_type) != AOTI_TORCH_SUCCESS) { + throw std::runtime_error("Failed to get tensor device type"); + } + + printf("Tensor device type: %d\n", device_type); + + AtenTensorHandle cpu_tensor = nullptr; + const float* float_data = nullptr; + bool need_cleanup = false; + + // Check if tensor is on CUDA (device_type 1 is CUDA) + if (device_type == 1) { + printf("Tensor is on CUDA, copying to CPU...\n"); + + // Get strides for creating CPU tensor + int64_t* strides = nullptr; + if (aoti_torch_get_strides(self, &strides) != AOTI_TORCH_SUCCESS || + !strides) { + throw std::runtime_error("Failed to get tensor strides"); + } + + // Create a CPU tensor with same shape and layout + if (aoti_torch_empty_strided( + dim, sizes, strides, dtype, 0, -1, &cpu_tensor) != + AOTI_TORCH_SUCCESS) { + throw std::runtime_error("Failed to create CPU tensor"); + } + + // Copy data from CUDA to CPU tensor + if (aoti_torch_copy_(cpu_tensor, self, 0) != AOTI_TORCH_SUCCESS) { + aoti_torch_delete_tensor_object(cpu_tensor); + throw std::runtime_error("Failed to copy tensor from CUDA to CPU"); + } + + // Get CPU data pointer + void* cpu_data_ptr = nullptr; + if (aoti_torch_get_data_ptr(cpu_tensor, &cpu_data_ptr) != + AOTI_TORCH_SUCCESS || + !cpu_data_ptr) { + aoti_torch_delete_tensor_object(cpu_tensor); + throw std::runtime_error("Failed to get CPU tensor data pointer"); + } + + float_data = static_cast(cpu_data_ptr); + need_cleanup = true; + printf("Successfully copied CUDA tensor to CPU\n"); + } else { + // Tensor is already on CPU, use original data pointer + printf("Tensor is on CPU, using original data pointer\n"); + float_data = static_cast(data_ptr); + } + + // Open file for writing (append mode to not overwrite previous outputs) + printf("Writing tensor to file: %s\n", internal::TENSOR_OUTPUT_FILENAME); + + std::ofstream output_file( + internal::TENSOR_OUTPUT_FILENAME, std::ios::out | std::ios::app); + if (!output_file.is_open()) { + if (need_cleanup) { + aoti_torch_delete_tensor_object(cpu_tensor); + } + throw std::runtime_error( + "Failed to open output file: " + + std::string(internal::TENSOR_OUTPUT_FILENAME)); + } + + printf("Successfully opened file for writing\n"); + + // Write message and tensor info to file + output_file << "=== " << msg << " ===" << std::endl; + output_file << "Device type: " << device_type << std::endl; + output_file << "Dimensions: " << dim << std::endl; + 
output_file << "Sizes: ["; + for (int i = 0; i < dim; i++) { + output_file << sizes[i]; + if (i < dim - 1) + output_file << ", "; + } + output_file << "]" << std::endl; + output_file << "Total elements: " << total_elements << std::endl; + output_file << "Data content:" << std::endl; + + // Write tensor data to file (now safe to access) + for (int64_t i = 0; i < total_elements; i++) { + output_file << float_data[i] << " "; + if (i < total_elements - 1) { + output_file << ", "; + // Add newline every 10 elements for readability + if ((i + 1) % 10 == 0) { + output_file << std::endl; + } + } + } + output_file << std::endl << std::endl; + + // Clean up CPU tensor if we created one + if (need_cleanup) { + aoti_torch_delete_tensor_object(cpu_tensor); + printf("Cleaned up temporary CPU tensor\n"); + } + + // File will be automatically closed when output_file goes out of scope +} + +// Function to cleanup the tensor output file (to be called from +// aoti_backend.cpp) +void cleanup_aoti_tensor_output() { + // No cleanup needed since file is opened and closed on each call +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h new file mode 100644 index 00000000000..6bcd34efcfb --- /dev/null +++ b/backends/aoti/runtime/shims/utils.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace executorch { +namespace backends { +namespace aoti { + +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + +extern "C" { + +// Type definitions +using AOTITensorHandle = Tensor*; +using AOTITorchError = Error; + +// Utility function for printing tensor information +void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg); + +// Cleanup function for tensor output file (called during backend destruction) +void cleanup_aoti_tensor_output(); + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl index 28c9e893721..2c87ad68a2c 100644 --- a/backends/aoti/runtime/targets.bzl +++ b/backends/aoti/runtime/targets.bzl @@ -8,11 +8,13 @@ def define_common_targets(): "aoti_model_container.cpp", "shims/memory.cpp", "shims/tensor_attribute.cpp", + "shims/utils.cpp", ], headers = [ "aoti_model_container.h", "shims/memory.h", "shims/tensor_attribute.h", + "shims/utils.h", ], # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) link_whole = True, diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index e850c2bb6bb..ebb1a44239e 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -3,14 +3,19 @@ # Script to export and run AOTI with different modes # Usage: # ./export_and_run_aoti.sh [mode] -# ./export_and_run_aoti.sh --mode= +# ./export_and_run_aoti.sh --mode= [--debug] [--dump] # # Examples: -# ./export_and_run_aoti.sh conv2d # Uses default mode (reinstall_all) -# ./export_and_run_aoti.sh conv2d inference # Uses inference mode -# ./export_and_run_aoti.sh conv2d --mode=inference # Alternative syntax +# ./export_and_run_aoti.sh conv2d # Uses default mode (reinstall_all) +# ./export_and_run_aoti.sh conv2d inference # Uses inference 
mode +# ./export_and_run_aoti.sh conv2d --mode=inference # Alternative syntax +# ./export_and_run_aoti.sh conv2d --mode=inference --dump # With AOTI intermediate output dumping +# ./export_and_run_aoti.sh conv2d --mode=inference --debug --dump # With both debug and dump # # Available modes: reinstall_all (default), reinstall_aot, reinstall_runtime, inference, export_aoti_only +# Flags: +# --debug: Enable debug mode with extensive logging +# --dump: Enable AOTI intermediate output dumping to aoti_intermediate_output.txt # model_arg: argument to pass to export_aoti.py set -e # Exit on any error @@ -19,6 +24,7 @@ set -e # Exit on any error MODE="reinstall_all" MODEL_ARG="$1" DEBUG_MODE=false +DUMP_MODE=false # Parse arguments for mode and debug flag for arg in "$@"; do @@ -31,6 +37,10 @@ for arg in "$@"; do DEBUG_MODE=true shift ;; + --dump) + DUMP_MODE=true + shift + ;; reinstall_all|reinstall_aot|reinstall_runtime|inference|export_aoti_only) # If it's the second argument and a valid mode, use it as mode if [[ "$arg" == "$2" ]]; then @@ -87,6 +97,7 @@ cleanup_temp_files() { rm -f *kernel.cpp rm -f *wrapper_metadata.json rm -f *wrapper.cpp + rm -f aoti_intermediate_output.txt echo "Cleanup completed." } @@ -121,12 +132,25 @@ build_runtime() { rm -rf cmake-out mkdir -p cmake-out cd cmake-out - cmake -DEXECUTORCH_BUILD_AOTI=ON \ - -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ - -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ - -DEXECUTORCH_LOG_LEVEL=Debug \ - -DCMAKE_BUILD_TYPE=Debug \ - .. + + if [[ "$DEBUG_MODE" == true ]]; then + echo "Building with debug configuration..." + cmake -DEXECUTORCH_BUILD_AOTI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_LOG_LEVEL=Debug \ + -DCMAKE_BUILD_TYPE=Debug \ + .. + else + echo "Building with release configuration..." + cmake -DEXECUTORCH_BUILD_AOTI=ON \ + -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ + -DEXECUTORCH_LOG_LEVEL=Info \ + -DCMAKE_BUILD_TYPE=Release \ + .. + fi + cd .. cmake --build cmake-out -j9 } @@ -136,18 +160,32 @@ run_inference() { ./cmake-out/executor_runner --model_path aoti_model.pte } -# Set up environment variables based on debug flag +# Set up environment variables based on debug and dump flags if [[ "$DEBUG_MODE" == true ]]; then echo "Setting debug environment variables..." export AOT_INDUCTOR_DEBUG_COMPILE="1" export AOTINDUCTOR_REPRO_LEVEL=3 - export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + + # Set intermediate value printer based on dump flag + if [[ "$DUMP_MODE" == true ]]; then + export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + echo "AOTI intermediate output dumping enabled (AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2)" + else + export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="3" + fi + echo "Debug variables set:" echo " AOT_INDUCTOR_DEBUG_COMPILE=$AOT_INDUCTOR_DEBUG_COMPILE" echo " AOTINDUCTOR_REPRO_LEVEL=$AOTINDUCTOR_REPRO_LEVEL" echo " AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=$AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER" +elif [[ "$DUMP_MODE" == true ]]; then + # Only dump mode enabled (without debug) + echo "Setting AOTI intermediate output dumping..." 
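+    # Dump-only path: enable the intermediate-value printer without the
+    # compile-debug flags above. Example invocation (from the usage notes):
+    #   ./export_and_run_aoti.sh conv2d --mode=inference --dump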
+ export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + echo "AOTI intermediate output dumping enabled (AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2)" + echo " AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=$AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER" else - # Ensure debug variables are unset for non-debug modes + # Ensure debug variables are unset for non-debug/non-dump modes unset AOT_INDUCTOR_DEBUG_COMPILE unset AOTINDUCTOR_REPRO_LEVEL unset AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER From eca93d1f9f884aadab74f6c6c04a0e2b55cdc4e0 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 28 Aug 2025 00:52:26 -0700 Subject: [PATCH 29/50] resnet18 works --- .gitignore | 2 +- backends/aoti/runtime/shims/memory.cpp | 179 ++++++++++++------ .../aoti/runtime/shims/tensor_attribute.cpp | 16 ++ compare_outputs.py | 154 +++++++++++++++ .../executor_runner/executor_runner.cpp | 145 ++++---------- export_and_run_aoti.sh | 17 ++ export_aoti.py | 82 +++++++- 7 files changed, 427 insertions(+), 168 deletions(-) create mode 100755 compare_outputs.py diff --git a/.gitignore b/.gitignore index 92b68cbc2d7..2e9b9c948a2 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,7 @@ tokenizer.json *kernel.cpp *wrapper_metadata.json *wrapper.cpp -aoti_intermediate_output.txt +aoti_debug_data* # Editor temporaries *.idea diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 09a773dd43e..25e750edb3c 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -26,6 +26,79 @@ namespace aoti { using executorch::runtime::Error; using executorch::runtime::etensor::Tensor; +namespace { // Internal namespace for utility functions + +// Version 1: For use with int64_t sizes (e.g., from blob creation functions) +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +// Contiguous format means strides decrease from left to right: +// For NCHW: strides = [C*H*W, H*W, W, 1] +bool is_tensor_contiguous( + int64_t ndim, + const int64_t* sizes, + const int64_t* strides) { + int64_t expected_stride = 1; + for (int i = ndim - 1; i >= 0; i--) { + if (strides[i] != expected_stride) { + return false; + } + expected_stride *= sizes[i]; + } + return true; +} + +// Check if tensor is in channels-last format (NHWC for 4D tensors) +// Channels-last format for 4D: strides = [H*W*C, 1, W*C, C] +bool is_tensor_channels_last( + int64_t ndim, + const int64_t* sizes, + const int64_t* strides) { + if (ndim != 4) { + return false; // Channels-last only defined for 4D tensors + } + + int64_t N = sizes[0], C = sizes[1], H = sizes[2], W = sizes[3]; + + // Check NHWC format: strides = [H*W*C, 1, W*C, C] + // Handle edge cases where dimensions might be 1 + return (strides[0] == H * W * C || N <= 1) && (strides[1] == 1 || C <= 1) && + (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); +} + +// Version 2: For use with ExecutorTorch tensors (int32_t sizes) +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +bool is_tensor_contiguous( + int64_t ndim, + const int32_t* sizes, + const int64_t* strides) { + int64_t expected_stride = 1; + for (int i = ndim - 1; i >= 0; i--) { + if (strides[i] != expected_stride) { + return false; + } + expected_stride *= sizes[i]; + } + return true; +} + +// Check if tensor is in channels-last format (NHWC for 4D tensors) +bool is_tensor_channels_last( + int64_t ndim, + const int32_t* sizes, + const int64_t* strides) { + if (ndim != 4) { + return false; // Channels-last only defined for 
4D tensors
+  }
+
+  int64_t N = sizes[0], C = sizes[1], H = sizes[2], W = sizes[3];
+
+  // Check NHWC format: strides = [H*W*C, 1, W*C, C]
+  // Handle edge cases where dimensions might be 1
+  return (strides[0] == H * W * C || N <= 1) && (strides[1] == 1 || C <= 1) &&
+      (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1);
+}
+
+} // anonymous namespace
+
 // Global storage for tensors and their metadata
 std::unordered_set<std::shared_ptr<Tensor>> tensors;
 std::unordered_map<Tensor*, bool> is_tensor_own_memory;
@@ -47,7 +120,21 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2(
     int64_t opaque_metadata_size) {
   std::cout << "Creating tensor from data blob " << data << " - ndim: " << ndim
             << ", dtype: " << dtype << ", device_type: " << device_type
-            << std::endl;
+            << ", storage_offset: " << storage_offset << std::endl;
+
+  // Only float32 tensors are supported
+  if (dtype != 6) { // 6 = float32
+    std::cout << "ERROR: Only float32 tensors are supported. Got dtype: "
+              << dtype << " (expected: 6 for float32)" << std::endl;
+    return Error::InvalidArgument;
+  }
+
+  // Storage offset must always be 0
+  if (storage_offset != 0) {
+    std::cout << "ERROR: Storage offset must be 0. Got storage_offset: "
+              << storage_offset << std::endl;
+    return Error::InvalidArgument;
+  }
 
   // Convert sizes to the format expected by ExecuTorch
   std::vector<int32_t> sizes(ndim);
@@ -58,31 +145,15 @@
 
   // check the tensor format
   // Only support contiguous format for now
-  int64_t expected_stride = 1;
-  for (int i = ndim - 1; i >= 0; --i) {
-    if (strides_ptr[i] != expected_stride) {
-      std::cout
-          << "aoti_torch_create_tensor_from_blob_v2 failed since input stride is not in contiguous format. Return with Error"
-          << std::endl;
-      return Error::InvalidArgument;
-    }
-    expected_stride *= sizes_ptr[i];
+  if (!is_tensor_contiguous(ndim, sizes_ptr, strides_ptr)) {
+    std::cout
+        << "aoti_torch_create_tensor_from_blob_v2 failed since input stride is not in contiguous format. 
Return with Error" + << std::endl; + return Error::InvalidArgument; } - // Adjust data pointer by storage_offset if needed + // Since storage_offset is guaranteed to be 0, use data pointer directly void* adjusted_data = data; - if (storage_offset > 0) { - // Calculate byte offset based on dtype size - size_t dtype_size = - 4; // Assuming float32 for now, you may need to handle other dtypes - if (dtype == 6) { // float32 - dtype_size = 4; - } else { - std::cout << "Error: Unhandled dtype " << dtype << std::endl; - return Error::NotImplemented; - } - adjusted_data = static_cast(data) + (storage_offset * dtype_size); - } // Create ExecutorTorch tensor that wraps the existing memory // Note: We're NOT copying the data, just wrapping it @@ -362,42 +433,21 @@ AOTITorchError aoti_torch_copy_( << std::endl; // Check if contiguous (strides decrease from left to right) - int64_t expected_stride = 1; - for (int i = self->dim() - 1; i >= 0; i--) { - if (self_strides[i] != expected_stride) { - self_is_contiguous = false; - } - expected_stride *= self_sizes[i]; - } + self_is_contiguous = + is_tensor_contiguous(self->dim(), self_sizes.data(), self_strides); - expected_stride = 1; - for (int i = src->dim() - 1; i >= 0; i--) { - if (src_strides[i] != expected_stride) { - src_is_contiguous = false; - } - expected_stride *= src_sizes[i]; - } + src_is_contiguous = + is_tensor_contiguous(src->dim(), src_sizes.data(), src_strides); - // Check if channels-last (4D: NHWC, strides in order [H*W*C, 1, W*C, C]) - if (self->dim() == 4 && !self_is_contiguous) { - int64_t N = self_sizes[0], H = self_sizes[1], W = self_sizes[2], - C = self_sizes[3]; - if ((self_strides[0] == H * W * C || N <= 1) && - (self_strides[1] == W * C || H <= 1) && - (self_strides[2] == C || W == 1) && - (self_strides[3] == 1 || C == 1)) { - self_is_channels_last = true; - } + // Check if channels-last (4D: NHWC format) + if (!self_is_contiguous) { + self_is_channels_last = + is_tensor_channels_last(self->dim(), self_sizes.data(), self_strides); } - if (src->dim() == 4 && !src_is_contiguous) { - int64_t N = src_sizes[0], H = src_sizes[1], W = src_sizes[2], - C = src_sizes[3]; - if ((src_strides[0] == H * W * C || N <= 1) && - (src_strides[1] == W * C || H <= 1) && - (src_strides[2] == C || W <= 1) && (src_strides[3] == 1 || C <= 1)) { - src_is_channels_last = true; - } + if (!src_is_contiguous) { + src_is_channels_last = + is_tensor_channels_last(src->dim(), src_sizes.data(), src_strides); } // Validate layout assumptions only when schemas differ @@ -409,17 +459,27 @@ AOTITorchError aoti_torch_copy_( std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : ""); } std::cout << "]" << std::endl; + std::cout << "self_sizes: ["; + for (int i = 0; i < self->dim(); i++) { + std::cout << self_sizes[i] << (i < self->dim() - 1 ? ", " : ""); + } + std::cout << "]" << std::endl; return Error::InvalidArgument; } if (!src_is_contiguous && !src_is_channels_last) { std::cout - << "Error: src tensor must be contiguous or channels-last for stride conversion. " + << "Error: src tensor must be contiguous or channels-last for stride conversion. \n" << "Got strides: ["; for (int i = 0; i < src->dim(); i++) { std::cout << src_strides[i] << (i < src->dim() - 1 ? ", " : ""); } std::cout << "]" << std::endl; + std::cout << "src_sizes: ["; + for (int i = 0; i < self->dim(); i++) { + std::cout << src_sizes[i] << (i < self->dim() - 1 ? 
", " : ""); + } + std::cout << "]" << std::endl; return Error::InvalidArgument; } } @@ -667,6 +727,13 @@ AOTITorchError aoti_torch__reinterpret_tensor( return dtype_err; } + if (dtype != 6) { // 6 = float32 + std::cout + << "ERROR: Only float32 tensors are supported in reinterpret_tensor. Got dtype: " + << dtype << " (expected: 6 for float32)" << std::endl; + return Error::InvalidArgument; + } + int32_t device_type; AOTITorchError device_type_err = aoti_torch_get_device_type(self, &device_type); diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index dcea848597e..1ffcdba381d 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -45,6 +45,14 @@ AOTITorchError aoti_torch_get_storage_offset( int64_t* ret_storage_offset) { // Storage offset is always 0 in ET *ret_storage_offset = 0; + + // ASSERTION: Storage offset must always be 0 + if (*ret_storage_offset != 0) { + std::cout << "ERROR: Storage offset must be 0. Got storage_offset: " + << *ret_storage_offset << std::endl; + return Error::InvalidArgument; + } + return Error::Ok; } @@ -73,6 +81,14 @@ AOTITorchError aoti_torch_get_dtype( AOTITensorHandle tensor, int32_t* ret_dtype) { *ret_dtype = static_cast(tensor->scalar_type()); + + // ASSERTION: Only float32 tensors are supported + if (*ret_dtype != 6) { // 6 = float32 + std::cout << "ERROR: Only float32 tensors are supported. Got dtype: " + << *ret_dtype << " (expected: 6 for float32)" << std::endl; + return Error::InvalidArgument; + } + return Error::Ok; } diff --git a/compare_outputs.py b/compare_outputs.py new file mode 100755 index 00000000000..e83b701f73a --- /dev/null +++ b/compare_outputs.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +Comparison script to calculate max absolute tolerance (atol) and max relative tolerance (rtol) +between runtime outputs and label outputs. 
+"""
+
+import os
+import sys
+
+import numpy as np
+
+
+def read_csv_file(filepath):
+    """Read a comma-separated values file and return as numpy array."""
+    try:
+        with open(filepath, "r") as f:
+            content = f.read().strip()
+            if not content:
+                print(f"Warning: {filepath} is empty")
+                return np.array([])
+
+            # Split by comma and convert to float
+            values = [float(x.strip()) for x in content.split(",") if x.strip()]
+            return np.array(values)
+    except FileNotFoundError:
+        print(f"Error: {filepath} not found")
+        return None
+    except ValueError as e:
+        print(f"Error parsing {filepath}: {e}")
+        return None
+
+
+def calculate_tolerances(runtime_outputs, label_outputs):
+    """Calculate max absolute and relative tolerances."""
+    if runtime_outputs is None or label_outputs is None:
+        return None, None
+
+    if len(runtime_outputs) == 0 or len(label_outputs) == 0:
+        print("Warning: One of the output arrays is empty")
+        return None, None
+
+    if len(runtime_outputs) != len(label_outputs):
+        print(
+            f"Warning: Array lengths don't match: runtime={len(runtime_outputs)}, label={len(label_outputs)}"
+        )
+        # Truncate both arrays to the shorter length before comparing
+        min_len = min(len(runtime_outputs), len(label_outputs))
+        runtime_outputs = runtime_outputs[:min_len]
+        label_outputs = label_outputs[:min_len]
+
+    # Calculate absolute differences
+    abs_diff = np.abs(runtime_outputs - label_outputs)
+    max_atol = np.max(abs_diff)
+
+    # Calculate relative differences (avoid division by zero)
+    # rel_diff = |a - b| / max(|a|, |b|, eps) where eps is a small number
+    eps = 1e-8
+    denominator = np.maximum(
+        np.maximum(np.abs(runtime_outputs), np.abs(label_outputs)), eps
+    )
+    rel_diff = abs_diff / denominator
+    max_rtol = np.max(rel_diff)
+
+    return max_atol, max_rtol
+
+
+def main():
+    """Main function to compare outputs and print tolerances."""
+    # File paths
+    runtime_file = "aoti_debug_data/final_runtime_output.txt"
+    label_file = "aoti_debug_data/label_output.txt"
+
+    print("=" * 60)
+    print("AOTI Runtime vs Label Output Comparison")
+    print("=" * 60)
+
+    # Check if files exist
+    if not os.path.exists(runtime_file):
+        print(f"Error: {runtime_file} not found")
+        sys.exit(1)
+
+    if not os.path.exists(label_file):
+        print(f"Error: {label_file} not found")
+        sys.exit(1)
+
+    # Read the files
+    print(f"Reading runtime outputs from: {runtime_file}")
+    runtime_outputs = read_csv_file(runtime_file)
+
+    print(f"Reading label outputs from: {label_file}")
+    label_outputs = read_csv_file(label_file)
+
+    if runtime_outputs is None or label_outputs is None:
+        print("Failed to read one or both files")
+        sys.exit(1)
+
+    print(f"Runtime outputs shape: {runtime_outputs.shape}")
+    print(f"Label outputs shape: {label_outputs.shape}")
+
+    if runtime_outputs.shape != label_outputs.shape:
+        print("Error: Output shapes don't match")
+        sys.exit(1)
+
+    # Calculate tolerances
+    max_atol, max_rtol = calculate_tolerances(runtime_outputs, label_outputs)
+
+    if max_atol is None or max_rtol is None:
+        print("Failed to calculate tolerances")
+        sys.exit(1)
+
+    # Print results
+    print("-" * 60)
+    print("COMPARISON RESULTS:")
+    print(f"Max Absolute Tolerance (atol): {max_atol:.10f}")
+    print(f"Max Relative Tolerance (rtol): {max_rtol:.10f}")
+    print("-" * 60)
+
+    # Print some statistics
+    print("ADDITIONAL STATISTICS:")
+    print(f"Total elements compared: {len(runtime_outputs)}")
+    print(
+        f"Runtime output range: [{np.min(runtime_outputs):.6f}, {np.max(runtime_outputs):.6f}]"
+    )
+    print(
+        f"Label output range: [{np.min(label_outputs):.6f}, 
{np.max(label_outputs):.6f}]" + ) + + # Calculate mean absolute difference + abs_diff = np.abs(runtime_outputs - label_outputs) + mean_atol = np.mean(abs_diff) + print(f"Mean Absolute Tolerance: {mean_atol:.10f}") + + # Check if outputs are close within common tolerances + is_close_1e5 = np.allclose( + runtime_outputs, + label_outputs, + atol=1e-5, + rtol=1e-5, + ) + is_close_1e6 = np.allclose( + runtime_outputs, + label_outputs, + atol=1e-6, + rtol=1e-6, + ) + + print(f"Close within atol=1e-5, rtol=1e-5: {is_close_1e5}") + print(f"Close within atol=1e-6, rtol=1e-6: {is_close_1e6}") + + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 5ce872eec8e..4a4b659c748 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -1,7 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. * Copyright 2024-2025 Arm Limited and/or its affiliates. + * All rights reserved. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. @@ -50,16 +50,6 @@ DEFINE_string( model_path, "model.pte", "Model serialized in flatbuffer format."); -DEFINE_string(inputs, "", "Comma-separated list of input files"); -DEFINE_string( - output_file, - "", - "Base name of output file. If not empty output will be written to the file(s)."); - -DEFINE_bool( - print_all_output, - false, - "Prints all output. By default only first and last 100 elements are printed."); DEFINE_uint32(num_executions, 1, "Number of times to run the model."); #ifdef ET_EVENT_TRACER_ENABLED DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path."); @@ -69,8 +59,6 @@ DEFINE_int32( -1, "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device."); -using executorch::aten::ScalarType; -using executorch::aten::Tensor; using executorch::extension::FileDataLoader; using executorch::runtime::Error; using executorch::runtime::EValue; @@ -83,8 +71,6 @@ using executorch::runtime::MethodMeta; using executorch::runtime::Program; using executorch::runtime::Result; using executorch::runtime::Span; -using executorch::runtime::Tag; -using executorch::runtime::TensorInfo; /// Helper to manage resources for ETDump generation class EventTraceManager { @@ -171,43 +157,6 @@ int main(int argc, char** argv) { "FileDataLoader::from() failed: 0x%" PRIx32, (uint32_t)loader.error()); - std::vector inputs_storage; - std::vector> input_buffers; - - std::stringstream list_of_input_files(FLAGS_inputs); - std::string path; - - // First reserve memory for number of vector elements to avoid vector - // reallocations when emplacing back. - std::vector file_paths; - while (std::getline(list_of_input_files, path, ',')) { - file_paths.push_back(std::move(path)); - } - inputs_storage.reserve(file_paths.size()); - - for (const auto& file_path : file_paths) { - std::ifstream input_file_handle( - file_path, std::ios::binary | std::ios::ate); - - if (!input_file_handle) { - ET_LOG(Error, "Failed to open input file: %s\n", file_path.c_str()); - return 1; - } - - std::streamsize file_size = input_file_handle.tellg(); - input_file_handle.seekg(0, std::ios::beg); - - // Reserve memory for actual file contents. 
- inputs_storage.emplace_back(file_size, '\0'); - - if (!input_file_handle.read(&inputs_storage.back()[0], file_size)) { - ET_LOG(Error, "Failed to read input file: %s\n", file_path.c_str()); - return 1; - } - - input_buffers.emplace_back(&inputs_storage.back()[0], file_size); - } - // Parse the program file. This is immutable, and can also be reused between // multiple execution invocations across multiple threads. Result program = Program::load(&loader.get()); @@ -306,8 +255,7 @@ int main(int argc, char** argv) { // Run the model. for (uint32_t i = 0; i < FLAGS_num_executions; i++) { ET_LOG(Debug, "Preparing inputs."); - // Allocate input tensors and set all of their elements to 1 or to the - // contents of input_buffers if available. The `inputs` + // Allocate input tensors and set all of their elements to 1. The `inputs` // variable owns the allocated memory and must live past the last call to // `execute()`. // @@ -315,8 +263,7 @@ int main(int argc, char** argv) { // because inputs whose space gets reused by memory planning (if // any such inputs exist) will not be preserved for the next // execution. - auto inputs = executorch::extension::prepare_input_tensors( - *method, {}, input_buffers); + auto inputs = executorch::extension::prepare_input_tensors(*method); ET_CHECK_MSG( inputs.ok(), "Could not prepare inputs: 0x%" PRIx32, @@ -348,69 +295,47 @@ int main(int argc, char** argv) { std::vector outputs(method->outputs_size()); ET_LOG(Info, "%zu outputs: ", outputs.size()); Error status = method->get_outputs(outputs.data(), outputs.size()); + ET_CHECK(status == Error::Ok); - if (FLAGS_output_file.size() > 0) { - for (int i = 0; i < outputs.size(); ++i) { - if (outputs[i].isTensor()) { - Tensor tensor = outputs[i].toTensor(); - - char out_filename[255]; - snprintf(out_filename, 255, "%s-%d.bin", FLAGS_output_file.c_str(), i); - ET_LOG(Info, "Writing output to file: %s", out_filename); - FILE* out_file = fopen(out_filename, "wb"); - fwrite(tensor.const_data_ptr(), 1, tensor.nbytes(), out_file); - fclose(out_file); - } - } + // Open file to dump outputs + std::ofstream output_file("aoti_debug_data/final_runtime_output.txt"); + if (!output_file.is_open()) { + ET_LOG(Error, "Failed to open output file for dumping"); } - if (FLAGS_print_all_output) { - for (int i = 0; i < outputs.size(); ++i) { - if (outputs[i].isTensor()) { - Tensor tensor = outputs[i].toTensor(); - - for (int j = 0; j < tensor.numel(); ++j) { - if (tensor.scalar_type() == ScalarType::Int) { - printf( - "Output[%d][%d]: (int) %d\n", - i, - j, - tensor.const_data_ptr()[j]); - } else if (tensor.scalar_type() == ScalarType::Float) { - printf( - "Output[%d][%d]: (float) %f\n", - i, - j, - tensor.const_data_ptr()[j]); - } else if (tensor.scalar_type() == ScalarType::Char) { - printf( - "Output[%d][%d]: (char) %d\n", - i, - j, - tensor.const_data_ptr()[j]); - } else if (tensor.scalar_type() == ScalarType::Bool) { - printf( - "Output[%d][%d]: (bool) %s (0x%x)\n", - i, - j, - tensor.const_data_ptr()[j] ? "true " : "false", - tensor.const_data_ptr()[j]); - } - } - } else { - printf("Output[%d]: Not Tensor\n", i); + // Print the first and last 100 elements of long lists of scalars. 
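+    // (evalue_edge_items is ExecuTorch's ostream manipulator for EValue
+    // printing; streaming it first bounds each long tensor to its leading
+    // and trailing 100 elements in the std::cout dump below.)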
+    std::cout << executorch::extension::evalue_edge_items(100);
+    for (int i = 0; i < outputs.size(); ++i) {
+      std::cout << "Output " << i << ": " << outputs[i] << std::endl;
+
+      // Also dump to file - extract tensor data and write comma-separated values
+      if (output_file.is_open() && outputs[i].isTensor()) {
+        auto tensor = outputs[i].toTensor();
+        const void* data_ptr = tensor.const_data_ptr();
+
+        // Assumes the output tensor is float32; other dtypes are not handled
+        const float* float_data = static_cast<const float*>(data_ptr);
+        size_t num_elements = tensor.numel();
+
+        for (size_t j = 0; j < num_elements; ++j) {
+          if (j > 0)
+            output_file << ",";
+          output_file << float_data[j];
         }
-      }
-    } else {
-      // Print the first and last 100 elements of long lists of scalars.
-      std::cout << executorch::extension::evalue_edge_items(100);
-      for (int i = 0; i < outputs.size(); ++i) {
-        std::cout << "OutputX " << i << ": " << outputs[i] << std::endl;
+        if (i < outputs.size() - 1)
+          output_file << ",";
       }
     }
+
+    if (output_file.is_open()) {
+      output_file.close();
+      ET_LOG(
+          Info,
+          "Runtime outputs dumped to aoti_debug_data/final_runtime_output.txt");
+    }
+
   if (tracer.get_event_tracer()) {
     // Dump ETDump data containing profiling/debugging data to file specified in
     // command line flag.
diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh
index ebb1a44239e..7a60cb66be5 100644
--- a/export_and_run_aoti.sh
+++ b/export_and_run_aoti.sh
@@ -160,6 +160,11 @@ run_inference() {
     ./cmake-out/executor_runner --model_path aoti_model.pte
 }
 
+compare_outputs() {
+    echo "Comparing runtime outputs with label outputs..."
+    python compare_outputs.py
+}
+
 # Set up environment variables based on debug and dump flags
 if [[ "$DEBUG_MODE" == true ]]; then
     echo "Setting debug environment variables..."
@@ -169,7 +174,10 @@ if [[ "$DEBUG_MODE" == true ]]; then
     # Set intermediate value printer based on dump flag
     if [[ "$DUMP_MODE" == true ]]; then
         export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2"
+        export INDUCTOR_PROVENANCE=1
+        export TORCH_TRACE="/home/gasoonjia/executorch/aoti_debug_data"
         echo "AOTI intermediate output dumping enabled (AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2)"
+        echo "Eager-AOTI relationship extraction enabled (INDUCTOR_PROVENANCE=1), output to $TORCH_TRACE"
     else
         export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="3"
     fi
@@ -182,13 +190,18 @@ elif [[ "$DUMP_MODE" == true ]]; then
     # Only dump mode enabled (without debug)
     echo "Setting AOTI intermediate output dumping..."
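+    # TORCH_TRACE points PyTorch's structured compile logging at a local
+    # directory (readable with tlparse), and INDUCTOR_PROVENANCE=1 records the
+    # eager-op-to-generated-kernel mapping alongside it. The hard-coded path
+    # below is specific to this development machine.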
export AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2" + export INDUCTOR_PROVENANCE=1 + export TORCH_TRACE="/home/gasoonjia/executorch/aoti_debug_data" echo "AOTI intermediate output dumping enabled (AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2)" echo " AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=$AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER" + echo "Eager-AOTI relationship extration enabled (INDUCTOR_PROVENANCE=1), output to $TORCH_TRACE" else # Ensure debug variables are unset for non-debug/non-dump modes unset AOT_INDUCTOR_DEBUG_COMPILE unset AOTINDUCTOR_REPRO_LEVEL unset AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER + unset INDUCTOR_PROVENANCE + unset TORCH_TRACE fi # Execute based on mode @@ -203,6 +216,7 @@ case "$MODE" in clean_install_executorch build_runtime run_inference + compare_outputs ;; "reinstall_aot") echo "Mode: reinstall_aot - Reinstall AOT components and run e2e" @@ -212,6 +226,7 @@ case "$MODE" in install_executorch export_aoti_model run_inference + compare_outputs ;; "reinstall_runtime") echo "Mode: reinstall_runtime - Rebuild runtime and run e2e" @@ -221,6 +236,7 @@ case "$MODE" in export_aoti_model build_runtime run_inference + compare_outputs ;; "inference") echo "Mode: inference - Export model and run inference only" @@ -229,6 +245,7 @@ case "$MODE" in fi export_aoti_model run_inference + compare_outputs ;; "export_aoti_only") echo "Mode: export_aoti_only - Export model using pure AOTI only (no runtime or installation)" diff --git a/export_aoti.py b/export_aoti.py index 2550f33a55a..9f9d4ce8e6c 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -34,6 +34,12 @@ from torchvision.models.resnet import ResNet18_Weights +# for maintaing precision of 32-bit float as much as possible +torch.backends.cuda.matmul.allow_tf32 = False +torch.backends.cudnn.allow_tf32 = False +torch.backends.cudnn.conv.fp32_precision = "fp32" + + # Model classes class MV2(torch.nn.Module): def __init__(self): @@ -109,6 +115,56 @@ def forward(self, x): return self.bn(x) +class SingleResNetBlock(nn.Module): + def __init__(self, in_channels=64, out_channels=64, stride=1): + super().__init__() + self.conv1 = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) + self.bn1 = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False + ) + self.bn2 = nn.BatchNorm2d(out_channels) + + # Skip connection - identity mapping if same channels, 1x1 conv if different + self.skip_connection = None + if stride != 1 or in_channels != out_channels: + self.skip_connection = nn.Sequential( + nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=stride, bias=False + ), + nn.BatchNorm2d(out_channels), + ) + + def forward(self, x): + identity = x + + # First conv block + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + # Second conv block + out = self.conv2(out) + out = self.bn2(out) + + # Skip connection + if self.skip_connection is not None: + identity = self.skip_connection(x) + + out += identity + out = self.relu(out) + + return out + + # Model registry mapping model names to their configurations MODEL_REGISTRY: Dict[str, Dict[str, Any]] = { "mv2": { @@ -153,6 +209,12 @@ def forward(self, x): "device": "cuda", "description": "Single BatchNorm2d layer model", }, + "single_resnet_block": { + "model_class": SingleResNetBlock, + "input_shapes": [(1, 64, 8, 8)], + "device": "cuda", + "description": "Single ResNet block with 
skip connection", + }, } @@ -187,7 +249,25 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p torch.ones_like(example_input) for example_input in example_inputs ) - print("label", model(*all_one_input)) + label_output = model(*all_one_input) + print("label", label_output) + + # Create directory if it doesn't exist + os.makedirs("aoti_debug_data", exist_ok=True) + + # Dump label to file + with open("aoti_debug_data/label_output.txt", "w") as f: + if isinstance(label_output, tuple): + # Multiple outputs + all_elements = [] + for tensor in label_output: + if tensor.numel() > 0: + all_elements.extend(tensor.flatten().tolist()) + f.write(",".join(map(str, all_elements))) + else: + # Single output + if label_output.numel() > 0: + f.write(",".join(map(str, label_output.flatten().tolist()))) print(f"Starting export process...") From 7962fb348fa092e8355aac2edd898d97a38f24e6 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 2 Sep 2025 22:35:01 -0700 Subject: [PATCH 30/50] centralize def type --- backends/aoti/runtime/shims/memory.cpp | 3 - backends/aoti/runtime/shims/memory.h | 67 +++++++------------ .../aoti/runtime/shims/tensor_attribute.cpp | 3 - .../aoti/runtime/shims/tensor_attribute.h | 11 +-- backends/aoti/runtime/shims/types.h | 45 +++++++++++++ backends/aoti/runtime/shims/utils.cpp | 2 +- backends/aoti/runtime/shims/utils.h | 14 +--- backends/aoti/runtime/targets.bzl | 1 + 8 files changed, 76 insertions(+), 70 deletions(-) create mode 100644 backends/aoti/runtime/shims/types.h diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 25e750edb3c..cbf52932268 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -23,9 +23,6 @@ namespace executorch { namespace backends { namespace aoti { -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - namespace { // Internal namespace for utility functions // Version 1: For use with int64_t sizes (e.g., from blob creation functions) diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 0b8af138c90..57058397972 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -8,66 +8,49 @@ #pragma once -#include -#include -#include #include #include #include #include #include #include +#include "types.h" namespace executorch { namespace backends { namespace aoti { -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - extern "C" { -// Type definitions -using AOTITensorHandle = Tensor*; -using AOTIRuntimeError = Error; -using AOTITorchError = Error; - -struct CUDAStreamGuardOpaque { - cudaStream_t original_stream; - int device_index; - cudaEvent_t sync_event; -}; -using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*; - // Global storage declarations extern std::unordered_map is_tensor_own_memory; extern std::unordered_set> tensors; // Memory-related operations -AOTITorchError aoti_torch_create_tensor_from_blob_v2( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor, - int32_t layout, - const uint8_t* opaque_metadata, - int64_t opaque_metadata_size); - -AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, 
- int32_t device_index, - AOTITensorHandle* ret_new_tensor); +// AOTITorchError aoti_torch_create_tensor_from_blob_v2( +// void* data, +// int64_t ndim, +// const int64_t* sizes_ptr, +// const int64_t* strides_ptr, +// int64_t storage_offset, +// int32_t dtype, +// int32_t device_type, +// int32_t device_index, +// AOTITensorHandle* ret_new_tensor, +// int32_t layout, +// const uint8_t* opaque_metadata, +// int64_t opaque_metadata_size); + +// AOTITorchError aoti_torch_create_tensor_from_blob( +// void* data, +// int64_t ndim, +// const int64_t* sizes_ptr, +// const int64_t* strides_ptr, +// int64_t storage_offset, +// int32_t dtype, +// int32_t device_type, +// int32_t device_index, +// AOTITensorHandle* ret_new_tensor); AOTITorchError aoti_torch_empty_strided( int64_t ndim, diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index 1ffcdba381d..8e0097cd8bd 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -13,9 +13,6 @@ namespace executorch { namespace backends { namespace aoti { -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - // Global storage for tensor metadata std::unordered_map> tensor_to_sizes; std::unordered_map> tensor_to_strides; diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h index ab4f8037ebf..387056a30fd 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/runtime/shims/tensor_attribute.h @@ -8,8 +8,7 @@ #pragma once -#include -#include +#include "types.h" #include #include @@ -17,16 +16,8 @@ namespace executorch { namespace backends { namespace aoti { -using executorch::runtime::Error; -using executorch::runtime::etensor::Tensor; - extern "C" { -// Type definitions -using AOTITensorHandle = Tensor*; -using AOTIRuntimeError = Error; -using AOTITorchError = Error; - // Global storage for tensor metadata extern std::unordered_map> tensor_to_sizes; extern std::unordered_map> tensor_to_strides; diff --git a/backends/aoti/runtime/shims/types.h b/backends/aoti/runtime/shims/types.h new file mode 100644 index 00000000000..312d05a4d33 --- /dev/null +++ b/backends/aoti/runtime/shims/types.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#pragma once
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// Common using declarations for ExecuTorch types
+using executorch::runtime::Error;
+using executorch::runtime::etensor::Tensor;
+
+extern "C" {
+
+// Common AOTI type aliases
+// Note: AOTITensorHandle is aliased to Tensor* for ExecuTorch compatibility
+using AOTITensorHandle = Tensor*;
+using AOTIRuntimeError = Error;
+using AOTITorchError = Error;
+
+// CUDA-specific types
+struct CUDAStreamGuardOpaque {
+  cudaStream_t original_stream;
+  int device_index;
+  cudaEvent_t sync_event;
+};
+using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*;
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp
index 10882c16cf4..e81e141e7fd 100644
--- a/backends/aoti/runtime/shims/utils.cpp
+++ b/backends/aoti/runtime/shims/utils.cpp
@@ -25,7 +25,7 @@ const char* const TENSOR_OUTPUT_FILENAME =
 
 extern "C" {
 
-void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg) {
+void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg) {
   printf("Printing tensor handle: %p\n", self);
 
   if (!self) {
diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h
index 6bcd34efcfb..c0c2a59be0a 100644
--- a/backends/aoti/runtime/shims/utils.h
+++ b/backends/aoti/runtime/shims/utils.h
@@ -8,26 +8,18 @@
 
 #pragma once
 
-#include 
-#include 
-#include 
+#include 
 #include 
+#include "types.h"
 
 namespace executorch {
 namespace backends {
 namespace aoti {
 
-using executorch::runtime::Error;
-using executorch::runtime::etensor::Tensor;
-
 extern "C" {
 
-// Type definitions
-using AOTITensorHandle = Tensor*;
-using AOTITorchError = Error;
-
 // Utility function for printing tensor information
-void aoti_torch_print_tensor_handle(AtenTensorHandle self, const char* msg);
+void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg);
 
 // Cleanup function for tensor output file (called during backend destruction)
 void cleanup_aoti_tensor_output();
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/runtime/targets.bzl
index 2c87ad68a2c..d57a187366f 100644
--- a/backends/aoti/runtime/targets.bzl
+++ b/backends/aoti/runtime/targets.bzl
@@ -14,6 +14,7 @@ def define_common_targets():
             "aoti_model_container.h",
             "shims/memory.h",
             "shims/tensor_attribute.h",
+            "shims/types.h",
             "shims/utils.h",
         ],
         # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)

From 23de936d876230c992d3f4d08fb804bf965fbfcc Mon Sep 17 00:00:00 2001
From: gasoonjia 
Date: Wed, 3 Sep 2025 12:12:39 -0700
Subject: [PATCH 31/50] enabling llama31

---
 export_aoti.py       | 31 +++++++++++++++++++++++++++++++
 requirements-dev.txt |  1 +
 2 files changed, 32 insertions(+)

diff --git a/export_aoti.py b/export_aoti.py
index 9f9d4ce8e6c..66824bbc3b6 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -32,6 +32,7 @@
 from torchvision import models
 from torchvision.models.mobilenetv2 import MobileNet_V2_Weights
 from torchvision.models.resnet import ResNet18_Weights
+from transformers import AutoModelForCausalLM
 
 
 # for maintaing precision of 32-bit float as much as possible
@@ -165,6 +166,30 @@ def forward(self, x):
         return out
 
 
+class Llama31(torch.nn.Module):
+    def __init__(self, model_id="meta-llama/Meta-Llama-3.1-8B"):
+        super(Llama31, self).__init__()
+        # Load Llama 3.1 model from HF
+        self.model = 
AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.float32, + device_map="cuda", + # trust_remote_code=True, + use_cache=False, # Turn off KV cache + ) + self.model.eval() + + def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): + # Disable KV cache for inference + with torch.no_grad(): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + use_cache=False, # Explicitly turn off KV cache + ) + return outputs.logits + + # Model registry mapping model names to their configurations MODEL_REGISTRY: Dict[str, Dict[str, Any]] = { "mv2": { @@ -215,6 +240,12 @@ def forward(self, x): "device": "cuda", "description": "Single ResNet block with skip connection", }, + "llama31": { + "model_class": Llama31, + "input_shapes": [(1, 128)], # batch_size=1, sequence_length=128 + "device": "cuda", + "description": "Llama 3.1 model with KV cache disabled", + }, } diff --git a/requirements-dev.txt b/requirements-dev.txt index 8c8f518a5ea..964bdecef76 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,3 +10,4 @@ certifi # Imported by resolve_buck.py. lintrunner==0.12.7 lintrunner-adapters==0.12.6 patchelf +transformers From b792c7d973465cc24081e937f97fa1a73110051f Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 4 Sep 2025 00:07:23 -0700 Subject: [PATCH 32/50] add llama31 for test --- ...jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json | 1 + config.yaml | 5 + export_aoti.py | 15 +- load_saved_config_example.py | 111 +++++++++++++ .../2025-09-03/14-45-08/.hydra/config.yaml | 73 ++++++++ outputs/2025-09-03/14-45-08/.hydra/hydra.yaml | 154 +++++++++++++++++ .../2025-09-03/14-45-08/.hydra/overrides.yaml | 3 + outputs/2025-09-03/14-45-08/export_llm.log | 40 +++++ .../2025-09-03/15-17-23/.hydra/config.yaml | 74 +++++++++ outputs/2025-09-03/15-17-23/.hydra/hydra.yaml | 157 ++++++++++++++++++ .../2025-09-03/15-17-23/.hydra/overrides.yaml | 6 + outputs/2025-09-03/15-17-23/export_llm.log | 38 +++++ .../2025-09-03/15-30-13/.hydra/config.yaml | 74 +++++++++ outputs/2025-09-03/15-30-13/.hydra/hydra.yaml | 157 ++++++++++++++++++ .../2025-09-03/15-30-13/.hydra/overrides.yaml | 6 + outputs/2025-09-03/15-30-13/export_llm.log | 0 .../2025-09-03/16-25-46/.hydra/config.yaml | 74 +++++++++ outputs/2025-09-03/16-25-46/.hydra/hydra.yaml | 157 ++++++++++++++++++ .../2025-09-03/16-25-46/.hydra/overrides.yaml | 6 + outputs/2025-09-03/16-25-46/export_llm.log | 0 .../2025-09-03/16-29-28/.hydra/config.yaml | 74 +++++++++ outputs/2025-09-03/16-29-28/.hydra/hydra.yaml | 157 ++++++++++++++++++ .../2025-09-03/16-29-28/.hydra/overrides.yaml | 6 + outputs/2025-09-03/16-29-28/export_llm.log | 0 .../2025-09-03/16-30-46/.hydra/config.yaml | 73 ++++++++ outputs/2025-09-03/16-30-46/.hydra/hydra.yaml | 156 +++++++++++++++++ .../2025-09-03/16-30-46/.hydra/overrides.yaml | 5 + outputs/2025-09-03/16-30-46/export_llm.log | 40 +++++ saved_llm_config.yaml | 73 ++++++++ 29 files changed, 1732 insertions(+), 3 deletions(-) create mode 100644 clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json create mode 100644 config.yaml create mode 100644 load_saved_config_example.py create mode 100644 outputs/2025-09-03/14-45-08/.hydra/config.yaml create mode 100644 outputs/2025-09-03/14-45-08/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/14-45-08/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/14-45-08/export_llm.log create mode 100644 outputs/2025-09-03/15-17-23/.hydra/config.yaml create mode 100644 outputs/2025-09-03/15-17-23/.hydra/hydra.yaml 
create mode 100644 outputs/2025-09-03/15-17-23/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/15-17-23/export_llm.log create mode 100644 outputs/2025-09-03/15-30-13/.hydra/config.yaml create mode 100644 outputs/2025-09-03/15-30-13/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/15-30-13/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/15-30-13/export_llm.log create mode 100644 outputs/2025-09-03/16-25-46/.hydra/config.yaml create mode 100644 outputs/2025-09-03/16-25-46/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/16-25-46/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/16-25-46/export_llm.log create mode 100644 outputs/2025-09-03/16-29-28/.hydra/config.yaml create mode 100644 outputs/2025-09-03/16-29-28/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/16-29-28/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/16-29-28/export_llm.log create mode 100644 outputs/2025-09-03/16-30-46/.hydra/config.yaml create mode 100644 outputs/2025-09-03/16-30-46/.hydra/hydra.yaml create mode 100644 outputs/2025-09-03/16-30-46/.hydra/overrides.yaml create mode 100644 outputs/2025-09-03/16-30-46/export_llm.log create mode 100644 saved_llm_config.yaml diff --git a/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json b/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json new file mode 100644 index 00000000000..8c1a9f6d812 --- /dev/null +++ b/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json @@ -0,0 +1 @@ +{"nodes": [{"name": "buf6", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf5"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf7"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf8", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf5"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf9"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf12", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf11"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf13"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf14", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf11"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf15"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf18", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, 
"kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf19"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf39", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf38"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf40"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf41", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf38"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf42"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf45", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf44"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf46"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf47", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf44"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf48"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf50", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf51"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf72", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf71"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf73"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf74", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf71"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf75"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf78", "node": {"target": "aten::slice_copy.Tensor", 
"inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf77"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf79"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf80", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf77"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf81"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf83", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf84"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf104", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf103"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf105"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf106", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf103"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf107"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf110", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf109"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf111"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf112", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf109"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf113"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf115", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": 
"buf116"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf137", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf136"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf138"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf139", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf136"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf140"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf143", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf142"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf144"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf145", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf142"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf146"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf148", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf149"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf169", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf168"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf170"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf171", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf168"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf172"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf175", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf174"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 
1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf176"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf177", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf174"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf178"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf180", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf181"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf202", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf201"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf203"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf204", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf201"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf205"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf208", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf207"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf209"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf210", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf207"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf211"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf213", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf214"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf234", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": 
{"as_tensor": {"name": "buf233"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf235"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf236", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf233"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf237"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf240", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf239"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf241"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf242", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf239"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf243"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf245", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf246"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf267", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf266"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf268"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf269", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf266"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf270"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf273", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf272"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": 
"buf274"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf275", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf272"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf276"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf278", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf279"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf299", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf298"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf300"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf301", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf298"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf302"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf305", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf304"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf306"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf307", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf304"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf308"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf310", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf311"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf332", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf331"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", 
"arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf333"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf334", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf331"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf335"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf338", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf337"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf339"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf340", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf337"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf341"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf343", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf344"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf364", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf363"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf365"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf366", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf363"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf367"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf370", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf369"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf371"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf372", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": 
{"as_tensor": {"name": "buf369"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf373"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf375", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf376"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf397", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf396"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf398"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf399", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf396"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf400"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf403", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf402"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf404"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf405", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf402"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf406"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf408", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf409"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf429", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf428"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf430"}}], "metadata": {}, 
"is_hop_single_tensor_return": null}}, {"name": "buf431", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf428"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf432"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf435", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf434"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf436"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf437", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf434"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf438"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf440", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf441"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf462", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf461"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf463"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf464", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf461"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf465"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf468", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf467"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf469"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf470", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf467"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": 
{"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf471"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf473", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf474"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf494", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf493"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf495"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf496", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf493"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf497"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf500", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf499"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf501"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf502", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf499"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf503"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf505", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf506"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf527", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf526"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf528"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf529", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf526"}}, 
"kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf530"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf533", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf532"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf534"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf535", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf532"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf536"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf538", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf539"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf559", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf558"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf560"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf561", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf558"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf562"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf565", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf564"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf566"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf567", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf564"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf568"}}], "metadata": {}, "is_hop_single_tensor_return": 
null}}, {"name": "buf570", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf571"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf592", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf591"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf593"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf594", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf591"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf595"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf598", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf597"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf599"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf600", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf597"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf601"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf603", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf604"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf624", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf623"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf625"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf626", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf623"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": 
"step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf627"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf630", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf629"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf631"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf632", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf629"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf633"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf635", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf636"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf657", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf656"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf658"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf659", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf656"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf660"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf663", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf662"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf664"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf665", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf662"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf666"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf668", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": 
{"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf669"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf689", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf688"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf690"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf691", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf688"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf692"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf695", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf694"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf696"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf697", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf694"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf698"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf700", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf701"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf722", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf721"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf723"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf724", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf721"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf725"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf728", "node": 
{"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf727"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf729"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf730", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf727"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf731"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf733", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf734"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf754", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf753"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf755"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf756", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf753"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf757"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf760", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf759"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf761"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf762", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf759"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf763"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf765", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, 
"kind": 1}], "outputs": [{"as_tensor": {"name": "buf766"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf787", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf786"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf788"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf789", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf786"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf790"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf793", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf792"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf794"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf795", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf792"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf796"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf798", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf799"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf819", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf818"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf820"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf821", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf818"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf822"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf825", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf824"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, 
{"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf826"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf827", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf824"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf828"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf830", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf831"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf852", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf851"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf853"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf854", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf851"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf855"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf858", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf857"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf859"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf860", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf857"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf861"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf863", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf864"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf884", "node": {"target": 
"aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf883"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf885"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf886", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf883"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf887"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf890", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf889"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf891"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf892", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf889"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf893"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf895", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf896"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf917", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf916"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf918"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf919", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf916"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf920"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf923", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf922"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 
1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf924"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf925", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf922"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf926"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf928", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf929"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf949", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf948"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf950"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf951", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf948"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf952"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf955", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf954"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf956"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf957", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf954"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf958"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf960", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf961"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf982", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf981"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", 
"arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf983"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf984", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf981"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf985"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf988", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf987"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf989"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf990", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf987"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf991"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf993", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf994"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1014", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1013"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1015"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1016", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1013"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1017"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1020", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1019"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1021"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1022", "node": {"target": 
"aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1019"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1023"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1025", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1026"}}], "metadata": {}, "is_hop_single_tensor_return": null}}]} \ No newline at end of file diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000000..7fbd565cff5 --- /dev/null +++ b/config.yaml @@ -0,0 +1,5 @@ +base: + model_class: llama3_2 + checkpoint: /home/gasoonjia//consolidated.00.pth + params: /home/gasoonjia/executorch/params.json + metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' diff --git a/export_aoti.py b/export_aoti.py index 66824bbc3b6..c1c24d212ef 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -242,7 +242,7 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): }, "llama31": { "model_class": Llama31, - "input_shapes": [(1, 128)], # batch_size=1, sequence_length=128 + "input_shapes": [(1, 32)], # batch_size=1, sequence_length=128 "device": "cuda", "description": "Llama 3.1 model with KV cache disabled", }, @@ -269,7 +269,14 @@ def get_model_and_inputs( model = model_class().to(device).eval() # Create example inputs (support multiple inputs) - example_inputs = tuple(torch.randn(*shape, device=device) for shape in input_shapes) + example_inputs = tuple( + ( + torch.randint(0, 10000, size=shape, device=device) + if model_name == "llama31" + else torch.randn(*shape, device=device) + ) + for shape in input_shapes + ) return model, example_inputs @@ -304,7 +311,9 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p # 1. torch.export: Defines the program with the ATen operator set. print("Step 1: Converting to ATen dialect...") - aten_dialect = export(model, example_inputs) + with torch.no_grad(): + # from torch.export._trace import _export + aten_dialect = export(model, example_inputs, strict=False) # print(aten_dialect) # exit(0) diff --git a/load_saved_config_example.py b/load_saved_config_example.py new file mode 100644 index 00000000000..95c2c9a07bd --- /dev/null +++ b/load_saved_config_example.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +""" +Example script showing how to load a saved LLM config and use it. 
+""" + +import os +import sys + +# Add the executorch path to import modules +sys.path.append("/home/gasoonjia/executorch") + +from executorch.examples.models.llama.export_llama_lib import export_llama +from executorch.extension.llm.export.config.llm_config import LlmConfig +from executorch.extension.llm.export.export_llm import ( + load_config_from_file, + save_config_to_file, +) + + +def load_and_use_saved_config(): + """Load a previously saved config and use it for export.""" + + # Method 1: Load from a saved YAML file + try: + config_obj = load_config_from_file("used_config_llama3.yaml") + print("✓ Successfully loaded config from used_config_llama3.yaml") + + # Optional: Modify the loaded config + print("Original quantization mode:", config_obj.quantization.qmode) + config_obj.quantization.qmode = "8da4w" # Change quantization + config_obj.debug.verbose = True # Enable verbose logging + print("Modified quantization mode:", config_obj.quantization.qmode) + + # Use the config for export + print("Starting export with loaded config...") + output_file = export_llama(config_obj) + print(f"✓ Export completed! Output: {output_file}") + + except FileNotFoundError: + print("❌ Config file 'used_config_llama3.yaml' not found.") + print("First save a config by running the main export script.") + return False + + return True + + +def create_and_save_custom_config(): + """Create a custom config and save it.""" + + # Create a new config from scratch + custom_config = LlmConfig() + + # Configure the model + custom_config.base.model_class = "llama3" + custom_config.base.checkpoint = ( + "/path/to/your/checkpoint.pth" # Set your checkpoint path + ) + + # Configure model settings + custom_config.model.use_kv_cache = True + custom_config.model.use_sdpa_with_kv_cache = True + custom_config.model.dtype_override = "fp32" + + # Configure export settings + custom_config.export.max_seq_length = 2048 + custom_config.export.output_dir = "./outputs" + + # Configure backend + custom_config.backend.xnnpack.enabled = True + custom_config.backend.xnnpack.extended_ops = True + + # Configure quantization + custom_config.quantization.qmode = "8da4w" + + # Configure debug + custom_config.debug.verbose = True + + # Save the custom config + config_filename = "my_custom_llama_config.yaml" + save_config_to_file(custom_config, config_filename) + print(f"✓ Custom config saved to {config_filename}") + + # Load it back to verify + loaded_config = load_config_from_file(config_filename) + print("✓ Verified: Config loaded successfully") + + return loaded_config + + +def main(): + print("=== LLM Config Load/Save Examples ===\n") + + # Example 1: Try to load a previously saved config + print("1. Attempting to load saved config...") + success = load_and_use_saved_config() + + if not success: + print("\n2. Creating and saving a custom config...") + custom_config = create_and_save_custom_config() + + print("\n3. Using the custom config for export...") + try: + output_file = export_llama(custom_config) + print(f"✓ Export completed with custom config! 
+        except Exception as e:
+            print(f"❌ Export failed: {e}")
+            print("Make sure to set a valid checkpoint path in the config.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/outputs/2025-09-03/14-45-08/.hydra/config.yaml b/outputs/2025-09-03/14-45-08/.hydra/config.yaml new file mode 100644 index 00000000000..34a34cf92f9 --- /dev/null +++ b/outputs/2025-09-03/14-45-08/.hydra/config.yaml @@ -0,0 +1,73 @@ +base: + model_class: llama3_1 + params: /home/gasoonjia/Llama-3.1-8B/original/params.json + checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + checkpoint_dir: null + adapter_checkpoint: null + adapter_config: null + tokenizer_path: null + metadata: null + use_lora: 0 + fairseq2: false + preq_mode: null + preq_group_size: 32 + preq_embedding_quantize: 8,0 +model: + dtype_override: fp32 + enable_dynamic_shape: true + use_shared_embedding: false + use_sdpa_with_kv_cache: false + expand_rope_table: false + use_attention_sink: null + output_prune_map: null + input_prune_map: null + use_kv_cache: false + quantize_kv_cache: false + local_global_attention: null +export: + max_seq_length: 128 + max_context_length: 128 + output_dir: . + output_name: null + so_library: null + export_only: false + foundation_weights_file: null +debug: + profile_memory: false + profile_path: null + generate_etrecord: false + generate_full_logits: false + verbose: false +quantization: + qmode: null + embedding_quantize: null + pt2e_quantize: null + group_size: null + use_spin_quant: null + use_qat: false + calibration_tasks: null + calibration_limit: null + calibration_seq_length: null + calibration_data: Once upon a time +backend: + xnnpack: + enabled: false + extended_ops: false + coreml: + enabled: false + enable_state: false + preserve_sdpa: false + quantize: null + ios: 15 + compute_units: cpu_only + vulkan: + enabled: false + qnn: + enabled: false + use_sha: false + soc_model: SM8650 + use_qnn_sha: false + optimized_rotation_path: null + num_sharding: 0 + mps: + enabled: false diff --git a/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml b/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml new file mode 100644 index 00000000000..c2e16273566 --- /dev/null +++ b/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml @@ -0,0 +1,154 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ???
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - ++base.model_class=llama3_1 + - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json + job: + name: export_llm + chdir: null + override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json + id: ??? + num: ??? + config_name: llm_config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /home/gasoonjia/executorch + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: '' + schema: structured + provider: schema + output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/14-45-08 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml b/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml new file mode 100644 index 00000000000..acc7258c572 --- /dev/null +++ b/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml @@ -0,0 +1,3 @@ +- ++base.model_class=llama3_1 +- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth +- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json diff --git a/outputs/2025-09-03/14-45-08/export_llm.log b/outputs/2025-09-03/14-45-08/export_llm.log new file mode 100644 index 00000000000..574ad77780e --- /dev/null +++ b/outputs/2025-09-03/14-45-08/export_llm.log @@ -0,0 +1,40 @@ +[2025-09-03 14:45:08,888][root][INFO] - Applying quantizers: [] +[2025-09-03 14:45:17,670][root][INFO] - Checkpoint dtype: torch.bfloat16 +[2025-09-03 14:45:17,672][root][INFO] - Model after source transforms: Transformer( + (tok_embeddings): Embedding(128256, 4096) + (layers): ModuleList( + (0-31): 32 x TransformerBlock( + (attention): AttentionMHA( + (wq): Linear(in_features=4096, out_features=4096, bias=False) + (wk): Linear(in_features=4096, out_features=1024, bias=False) + (wv): Linear(in_features=4096, out_features=1024, bias=False) + (wo): Linear(in_features=4096, out_features=4096, bias=False) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + ) + (feed_forward): FeedForward( + (w1): Linear(in_features=4096, out_features=14336, bias=False) + (w2): Linear(in_features=14336, out_features=4096, bias=False) + (w3): Linear(in_features=4096, out_features=14336, bias=False) + ) + 
(attention_norm): RMSNorm() + (ffn_norm): RMSNorm() + ) + ) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + (norm): RMSNorm() + (output): Linear(in_features=4096, out_features=128256, bias=False) +) +[2025-09-03 14:45:17,673][root][INFO] - Exporting with: +[2025-09-03 14:45:17,674][root][INFO] - inputs: (tensor([[1, 2, 3]]),) +[2025-09-03 14:45:17,674][root][INFO] - kwargs: None +[2025-09-03 14:45:17,674][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) +[2025-09-03 14:45:33,074][root][INFO] - Running canonical pass: RemoveRedundantTransposes +[2025-09-03 14:45:33,152][root][INFO] - Using pt2e [] to quantizing the model... +[2025-09-03 14:45:33,152][root][INFO] - No quantizer provided, passing... +[2025-09-03 14:46:55,091][root][INFO] - Lowering model using following partitioner(s): +[2025-09-03 14:47:47,454][root][INFO] - Required memory for activation in bytes: [0, 26074624] +[2025-09-03 14:48:03,642][root][INFO] - Saved exported program to ./llama3_1.pte diff --git a/outputs/2025-09-03/15-17-23/.hydra/config.yaml b/outputs/2025-09-03/15-17-23/.hydra/config.yaml new file mode 100644 index 00000000000..74c7f49c21f --- /dev/null +++ b/outputs/2025-09-03/15-17-23/.hydra/config.yaml @@ -0,0 +1,74 @@ +base: + model_class: llama3_1 + params: /home/gasoonjia/Llama-3.1-8B/original/params.json + checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + checkpoint_dir: null + adapter_checkpoint: null + adapter_config: null + tokenizer_path: null + metadata: null + use_lora: 0 + fairseq2: false + preq_mode: null + preq_group_size: 32 + preq_embedding_quantize: 8,0 +model: + dtype_override: fp32 + enable_dynamic_shape: true + use_shared_embedding: false + use_sdpa_with_kv_cache: false + expand_rope_table: false + use_attention_sink: null + output_prune_map: null + input_prune_map: null + use_kv_cache: false + quantize_kv_cache: false + local_global_attention: null +export: + max_seq_length: 128 + max_context_length: 128 + output_dir: . 
+ output_name: null + so_library: null + export_only: false + foundation_weights_file: null +debug: + profile_memory: false + profile_path: null + generate_etrecord: false + generate_full_logits: false + verbose: false +quantization: + qmode: null + embedding_quantize: null + pt2e_quantize: null + group_size: null + use_spin_quant: null + use_qat: false + calibration_tasks: null + calibration_limit: null + calibration_seq_length: null + calibration_data: Once upon a time +backend: + xnnpack: + enabled: false + extended_ops: false + coreml: + enabled: false + enable_state: false + preserve_sdpa: false + quantize: null + ios: 15 + compute_units: cpu_only + vulkan: + enabled: false + qnn: + enabled: false + use_sha: false + soc_model: SM8650 + use_qnn_sha: false + optimized_rotation_path: null + num_sharding: 0 + mps: + enabled: false +save_exported_program: true diff --git a/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml b/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml new file mode 100644 index 00000000000..d224649ae3a --- /dev/null +++ b/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml @@ -0,0 +1,157 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - ++base.model_class=llama3_1 + - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json + - ++model.use_kv_cache=False + - ++model.use_sdpa_with_kv_cache=False + - ++save_exported_program=True + job: + name: export_llm + chdir: null + override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True + id: ??? + num: ??? + config_name: llm_config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /home/gasoonjia/executorch + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: '' + schema: structured + provider: schema + output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/15-17-23 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml b/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml new file mode 100644 index 00000000000..fccd73d94f1 --- /dev/null +++ b/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml @@ -0,0 +1,6 @@ +- ++base.model_class=llama3_1 +- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth +- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json +- ++model.use_kv_cache=False +- ++model.use_sdpa_with_kv_cache=False +- ++save_exported_program=True diff --git a/outputs/2025-09-03/15-17-23/export_llm.log b/outputs/2025-09-03/15-17-23/export_llm.log new file mode 100644 index 00000000000..9cdb4c31406 --- /dev/null +++ b/outputs/2025-09-03/15-17-23/export_llm.log @@ -0,0 +1,38 @@ +[2025-09-03 15:17:23,719][root][INFO] - Applying quantizers: [] +[2025-09-03 15:17:25,710][root][INFO] - Checkpoint dtype: torch.bfloat16 +[2025-09-03 15:17:25,711][root][INFO] - Model after source transforms: Transformer( + (tok_embeddings): Embedding(128256, 4096) + (layers): ModuleList( + (0-31): 32 x TransformerBlock( + (attention): AttentionMHA( + (wq): Linear(in_features=4096, out_features=4096, bias=False) + (wk): Linear(in_features=4096, out_features=1024, bias=False) + (wv): Linear(in_features=4096, out_features=1024, bias=False) + (wo): Linear(in_features=4096, out_features=4096, bias=False) + 
(rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + ) + (feed_forward): FeedForward( + (w1): Linear(in_features=4096, out_features=14336, bias=False) + (w2): Linear(in_features=14336, out_features=4096, bias=False) + (w3): Linear(in_features=4096, out_features=14336, bias=False) + ) + (attention_norm): RMSNorm() + (ffn_norm): RMSNorm() + ) + ) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + (norm): RMSNorm() + (output): Linear(in_features=4096, out_features=128256, bias=False) +) +[2025-09-03 15:17:25,712][root][INFO] - Exporting with: +[2025-09-03 15:17:25,712][root][INFO] - inputs: (tensor([[1, 2, 3]]),) +[2025-09-03 15:17:25,712][root][INFO] - kwargs: None +[2025-09-03 15:17:25,713][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) +[2025-09-03 15:17:39,308][root][INFO] - Running canonical pass: RemoveRedundantTransposes +[2025-09-03 15:17:39,376][root][INFO] - Using pt2e [] to quantizing the model... +[2025-09-03 15:17:39,377][root][INFO] - No quantizer provided, passing... +[2025-09-03 15:18:45,017][root][INFO] - Lowering model using following partitioner(s):
diff --git a/outputs/2025-09-03/16-30-46/.hydra/config.yaml b/outputs/2025-09-03/16-30-46/.hydra/config.yaml new file mode 100644 index 00000000000..34a34cf92f9 --- /dev/null +++ b/outputs/2025-09-03/16-30-46/.hydra/config.yaml @@ -0,0 +1,73 @@ +base: + model_class: llama3_1 + params: /home/gasoonjia/Llama-3.1-8B/original/params.json + checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + checkpoint_dir: null + adapter_checkpoint: null + adapter_config: null + tokenizer_path: null + metadata: null + use_lora: 0 + fairseq2: false + preq_mode: null + preq_group_size: 32 + preq_embedding_quantize: 8,0 +model: + dtype_override: fp32 + enable_dynamic_shape: true +
use_shared_embedding: false + use_sdpa_with_kv_cache: false + expand_rope_table: false + use_attention_sink: null + output_prune_map: null + input_prune_map: null + use_kv_cache: false + quantize_kv_cache: false + local_global_attention: null +export: + max_seq_length: 128 + max_context_length: 128 + output_dir: . + output_name: null + so_library: null + export_only: false + foundation_weights_file: null +debug: + profile_memory: false + profile_path: null + generate_etrecord: false + generate_full_logits: false + verbose: false +quantization: + qmode: null + embedding_quantize: null + pt2e_quantize: null + group_size: null + use_spin_quant: null + use_qat: false + calibration_tasks: null + calibration_limit: null + calibration_seq_length: null + calibration_data: Once upon a time +backend: + xnnpack: + enabled: false + extended_ops: false + coreml: + enabled: false + enable_state: false + preserve_sdpa: false + quantize: null + ios: 15 + compute_units: cpu_only + vulkan: + enabled: false + qnn: + enabled: false + use_sha: false + soc_model: SM8650 + use_qnn_sha: false + optimized_rotation_path: null + num_sharding: 0 + mps: + enabled: false diff --git a/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml b/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml new file mode 100644 index 00000000000..9960f35db88 --- /dev/null +++ b/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml @@ -0,0 +1,156 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? 
+ hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - ++base.model_class=llama3_1 + - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json + - ++model.use_kv_cache=False + - ++model.use_sdpa_with_kv_cache=False + job: + name: export_llm + chdir: null + override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False + id: ??? + num: ??? + config_name: llm_config + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /home/gasoonjia/executorch + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: '' + schema: structured + provider: schema + output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/16-30-46 + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml b/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml new file mode 100644 index 00000000000..369364d85c9 --- /dev/null +++ b/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml @@ -0,0 +1,5 @@ +- ++base.model_class=llama3_1 +- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth +- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json +- ++model.use_kv_cache=False +- ++model.use_sdpa_with_kv_cache=False diff --git a/outputs/2025-09-03/16-30-46/export_llm.log b/outputs/2025-09-03/16-30-46/export_llm.log new file mode 100644 index 00000000000..ebb9f84e570 --- /dev/null +++ b/outputs/2025-09-03/16-30-46/export_llm.log @@ -0,0 +1,40 @@ +[2025-09-03 16:30:46,353][root][INFO] - Applying quantizers: [] +[2025-09-03 16:30:52,013][root][INFO] - Checkpoint dtype: torch.bfloat16 +[2025-09-03 16:30:52,014][root][INFO] - Model after source transforms: Transformer( + (tok_embeddings): Embedding(128256, 4096) + (layers): ModuleList( + (0-31): 32 x TransformerBlock( + (attention): AttentionMHA( + (wq): Linear(in_features=4096, out_features=4096, bias=False) + (wk): Linear(in_features=4096, out_features=1024, bias=False) + (wv): Linear(in_features=4096, out_features=1024, bias=False) + (wo): Linear(in_features=4096, out_features=4096, bias=False) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + ) + (feed_forward): FeedForward( + (w1): 
Linear(in_features=4096, out_features=14336, bias=False) + (w2): Linear(in_features=14336, out_features=4096, bias=False) + (w3): Linear(in_features=4096, out_features=14336, bias=False) + ) + (attention_norm): RMSNorm() + (ffn_norm): RMSNorm() + ) + ) + (rope): Rope( + (apply_rotary_emb): RotaryEmbedding() + ) + (norm): RMSNorm() + (output): Linear(in_features=4096, out_features=128256, bias=False) +) +[2025-09-03 16:30:52,015][root][INFO] - Exporting with: +[2025-09-03 16:30:52,016][root][INFO] - inputs: (tensor([[1, 2, 3]]),) +[2025-09-03 16:30:52,016][root][INFO] - kwargs: None +[2025-09-03 16:30:52,016][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) +[2025-09-03 16:31:06,978][root][INFO] - Running canonical pass: RemoveRedundantTransposes +[2025-09-03 16:31:07,056][root][INFO] - Using pt2e [] to quantizing the model... +[2025-09-03 16:31:07,056][root][INFO] - No quantizer provided, passing... +[2025-09-03 16:32:22,170][root][INFO] - Lowering model using following partitioner(s): +[2025-09-03 16:33:19,737][root][INFO] - Required memory for activation in bytes: [0, 26074624] +[2025-09-03 16:33:33,215][root][INFO] - Saved exported program to ./llama3_1.pte diff --git a/saved_llm_config.yaml b/saved_llm_config.yaml new file mode 100644 index 00000000000..34a34cf92f9 --- /dev/null +++ b/saved_llm_config.yaml @@ -0,0 +1,73 @@ +base: + model_class: llama3_1 + params: /home/gasoonjia/Llama-3.1-8B/original/params.json + checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth + checkpoint_dir: null + adapter_checkpoint: null + adapter_config: null + tokenizer_path: null + metadata: null + use_lora: 0 + fairseq2: false + preq_mode: null + preq_group_size: 32 + preq_embedding_quantize: 8,0 +model: + dtype_override: fp32 + enable_dynamic_shape: true + use_shared_embedding: false + use_sdpa_with_kv_cache: false + expand_rope_table: false + use_attention_sink: null + output_prune_map: null + input_prune_map: null + use_kv_cache: false + quantize_kv_cache: false + local_global_attention: null +export: + max_seq_length: 128 + max_context_length: 128 + output_dir: . 
+ output_name: null
+ so_library: null
+ export_only: false
+ foundation_weights_file: null
+debug:
+ profile_memory: false
+ profile_path: null
+ generate_etrecord: false
+ generate_full_logits: false
+ verbose: false
+quantization:
+ qmode: null
+ embedding_quantize: null
+ pt2e_quantize: null
+ group_size: null
+ use_spin_quant: null
+ use_qat: false
+ calibration_tasks: null
+ calibration_limit: null
+ calibration_seq_length: null
+ calibration_data: Once upon a time
+backend:
+ xnnpack:
+ enabled: false
+ extended_ops: false
+ coreml:
+ enabled: false
+ enable_state: false
+ preserve_sdpa: false
+ quantize: null
+ ios: 15
+ compute_units: cpu_only
+ vulkan:
+ enabled: false
+ qnn:
+ enabled: false
+ use_sha: false
+ soc_model: SM8650
+ use_qnn_sha: false
+ optimized_rotation_path: null
+ num_sharding: 0
+ mps:
+ enabled: false

From 01b306d2e5522ee9396cdf714eaf86440db0e84b Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 4 Sep 2025 00:11:40 -0700
Subject: [PATCH 33/50] remove unnecessary data from GitHub

Delete the Hydra output directories, saved example configs, and the
config load/save example script that were unnecessarily checked in, and
ignore the generated *wrapper.json and aoti_debug_data* files going
forward.
---
 .gitignore | 2 +
 config.yaml | 5 -
 export_and_run_aoti.sh | 1 +
 load_saved_config_example.py | 111 -------------
 .../2025-09-03/14-45-08/.hydra/config.yaml | 73 --------
 outputs/2025-09-03/14-45-08/.hydra/hydra.yaml | 154 -----------------
 .../2025-09-03/14-45-08/.hydra/overrides.yaml | 3 -
 outputs/2025-09-03/14-45-08/export_llm.log | 40 -----
 .../2025-09-03/15-17-23/.hydra/config.yaml | 74 ---------
 outputs/2025-09-03/15-17-23/.hydra/hydra.yaml | 157 ------------------
 .../2025-09-03/15-17-23/.hydra/overrides.yaml | 6 -
 outputs/2025-09-03/15-17-23/export_llm.log | 38 -----
 .../2025-09-03/15-30-13/.hydra/config.yaml | 74 ---------
 outputs/2025-09-03/15-30-13/.hydra/hydra.yaml | 157 ------------------
 .../2025-09-03/15-30-13/.hydra/overrides.yaml | 6 -
 outputs/2025-09-03/15-30-13/export_llm.log | 0
 .../2025-09-03/16-25-46/.hydra/config.yaml | 74 ---------
 outputs/2025-09-03/16-25-46/.hydra/hydra.yaml | 157 ------------------
 .../2025-09-03/16-25-46/.hydra/overrides.yaml | 6 -
 outputs/2025-09-03/16-25-46/export_llm.log | 0
 .../2025-09-03/16-29-28/.hydra/config.yaml | 74 ---------
 outputs/2025-09-03/16-29-28/.hydra/hydra.yaml | 157 ------------------
 .../2025-09-03/16-29-28/.hydra/overrides.yaml | 6 -
 outputs/2025-09-03/16-29-28/export_llm.log | 0
 .../2025-09-03/16-30-46/.hydra/config.yaml | 73 --------
 outputs/2025-09-03/16-30-46/.hydra/hydra.yaml | 156 -----------------
 .../2025-09-03/16-30-46/.hydra/overrides.yaml | 5 -
 outputs/2025-09-03/16-30-46/export_llm.log | 40 -----
 saved_llm_config.yaml | 73 --------
 29 files changed, 3 insertions(+), 1719 deletions(-)
 delete mode 100644 config.yaml
 delete mode 100644 load_saved_config_example.py
 delete mode 100644 outputs/2025-09-03/14-45-08/.hydra/config.yaml
 delete mode 100644 outputs/2025-09-03/14-45-08/.hydra/hydra.yaml
 delete mode 100644 outputs/2025-09-03/14-45-08/.hydra/overrides.yaml
 delete mode 100644 outputs/2025-09-03/14-45-08/export_llm.log
 delete mode 100644 outputs/2025-09-03/15-17-23/.hydra/config.yaml
 delete mode 100644 outputs/2025-09-03/15-17-23/.hydra/hydra.yaml
 delete mode 100644 outputs/2025-09-03/15-17-23/.hydra/overrides.yaml
 delete mode 100644 outputs/2025-09-03/15-17-23/export_llm.log
 delete mode 100644 outputs/2025-09-03/15-30-13/.hydra/config.yaml
 delete mode 100644 outputs/2025-09-03/15-30-13/.hydra/hydra.yaml
 delete mode 100644 outputs/2025-09-03/15-30-13/.hydra/overrides.yaml
 delete mode 100644 outputs/2025-09-03/15-30-13/export_llm.log
 delete mode 100644 outputs/2025-09-03/16-25-46/.hydra/config.yaml
 delete
mode 100644 outputs/2025-09-03/16-25-46/.hydra/hydra.yaml delete mode 100644 outputs/2025-09-03/16-25-46/.hydra/overrides.yaml delete mode 100644 outputs/2025-09-03/16-25-46/export_llm.log delete mode 100644 outputs/2025-09-03/16-29-28/.hydra/config.yaml delete mode 100644 outputs/2025-09-03/16-29-28/.hydra/hydra.yaml delete mode 100644 outputs/2025-09-03/16-29-28/.hydra/overrides.yaml delete mode 100644 outputs/2025-09-03/16-29-28/export_llm.log delete mode 100644 outputs/2025-09-03/16-30-46/.hydra/config.yaml delete mode 100644 outputs/2025-09-03/16-30-46/.hydra/hydra.yaml delete mode 100644 outputs/2025-09-03/16-30-46/.hydra/overrides.yaml delete mode 100644 outputs/2025-09-03/16-30-46/export_llm.log delete mode 100644 saved_llm_config.yaml diff --git a/.gitignore b/.gitignore index 2e9b9c948a2..295c352adbc 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,8 @@ tokenizer.json *kernel.cpp *wrapper_metadata.json *wrapper.cpp +*wrapper.json + aoti_debug_data* # Editor temporaries diff --git a/config.yaml b/config.yaml deleted file mode 100644 index 7fbd565cff5..00000000000 --- a/config.yaml +++ /dev/null @@ -1,5 +0,0 @@ -base: - model_class: llama3_2 - checkpoint: /home/gasoonjia//consolidated.00.pth - params: /home/gasoonjia/executorch/params.json - metadata: '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index 7a60cb66be5..cb5595fb8b5 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -97,6 +97,7 @@ cleanup_temp_files() { rm -f *kernel.cpp rm -f *wrapper_metadata.json rm -f *wrapper.cpp + rm -f *wrapper.json rm -f aoti_intermediate_output.txt echo "Cleanup completed." diff --git a/load_saved_config_example.py b/load_saved_config_example.py deleted file mode 100644 index 95c2c9a07bd..00000000000 --- a/load_saved_config_example.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python3 -""" -Example script showing how to load a saved LLM config and use it. -""" - -import os -import sys - -# Add the executorch path to import modules -sys.path.append("/home/gasoonjia/executorch") - -from executorch.examples.models.llama.export_llama_lib import export_llama -from executorch.extension.llm.export.config.llm_config import LlmConfig -from executorch.extension.llm.export.export_llm import ( - load_config_from_file, - save_config_to_file, -) - - -def load_and_use_saved_config(): - """Load a previously saved config and use it for export.""" - - # Method 1: Load from a saved YAML file - try: - config_obj = load_config_from_file("used_config_llama3.yaml") - print("✓ Successfully loaded config from used_config_llama3.yaml") - - # Optional: Modify the loaded config - print("Original quantization mode:", config_obj.quantization.qmode) - config_obj.quantization.qmode = "8da4w" # Change quantization - config_obj.debug.verbose = True # Enable verbose logging - print("Modified quantization mode:", config_obj.quantization.qmode) - - # Use the config for export - print("Starting export with loaded config...") - output_file = export_llama(config_obj) - print(f"✓ Export completed! 
Output: {output_file}") - - except FileNotFoundError: - print("❌ Config file 'used_config_llama3.yaml' not found.") - print("First save a config by running the main export script.") - return False - - return True - - -def create_and_save_custom_config(): - """Create a custom config and save it.""" - - # Create a new config from scratch - custom_config = LlmConfig() - - # Configure the model - custom_config.base.model_class = "llama3" - custom_config.base.checkpoint = ( - "/path/to/your/checkpoint.pth" # Set your checkpoint path - ) - - # Configure model settings - custom_config.model.use_kv_cache = True - custom_config.model.use_sdpa_with_kv_cache = True - custom_config.model.dtype_override = "fp32" - - # Configure export settings - custom_config.export.max_seq_length = 2048 - custom_config.export.output_dir = "./outputs" - - # Configure backend - custom_config.backend.xnnpack.enabled = True - custom_config.backend.xnnpack.extended_ops = True - - # Configure quantization - custom_config.quantization.qmode = "8da4w" - - # Configure debug - custom_config.debug.verbose = True - - # Save the custom config - config_filename = "my_custom_llama_config.yaml" - save_config_to_file(custom_config, config_filename) - print(f"✓ Custom config saved to {config_filename}") - - # Load it back to verify - loaded_config = load_config_from_file(config_filename) - print("✓ Verified: Config loaded successfully") - - return loaded_config - - -def main(): - print("=== LLM Config Load/Save Examples ===\n") - - # Example 1: Try to load a previously saved config - print("1. Attempting to load saved config...") - success = load_and_use_saved_config() - - if not success: - print("\n2. Creating and saving a custom config...") - custom_config = create_and_save_custom_config() - - print("\n3. Using the custom config for export...") - try: - output_file = export_llama(custom_config) - print(f"✓ Export completed with custom config! Output: {output_file}") - except Exception as e: - print(f"❌ Export failed: {e}") - print("Make sure to set a valid checkpoint path in the config.") - - -if __name__ == "__main__": - main() diff --git a/outputs/2025-09-03/14-45-08/.hydra/config.yaml b/outputs/2025-09-03/14-45-08/.hydra/config.yaml deleted file mode 100644 index 34a34cf92f9..00000000000 --- a/outputs/2025-09-03/14-45-08/.hydra/config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . 
- output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false diff --git a/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml b/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml deleted file mode 100644 index c2e16273566..00000000000 --- a/outputs/2025-09-03/14-45-08/.hydra/hydra.yaml +++ /dev/null @@ -1,154 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/14-45-08 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml b/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml deleted file mode 100644 index acc7258c572..00000000000 --- a/outputs/2025-09-03/14-45-08/.hydra/overrides.yaml +++ /dev/null @@ -1,3 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json diff --git a/outputs/2025-09-03/14-45-08/export_llm.log b/outputs/2025-09-03/14-45-08/export_llm.log deleted file mode 100644 index 574ad77780e..00000000000 --- a/outputs/2025-09-03/14-45-08/export_llm.log +++ /dev/null @@ -1,40 +0,0 @@ -[2025-09-03 14:45:08,888][root][INFO] - Applying quantizers: [] -[2025-09-03 14:45:17,670][root][INFO] - Checkpoint dtype: torch.bfloat16 -[2025-09-03 14:45:17,672][root][INFO] - Model after source transforms: Transformer( - (tok_embeddings): Embedding(128256, 4096) - (layers): ModuleList( - (0-31): 32 x TransformerBlock( - (attention): AttentionMHA( - (wq): Linear(in_features=4096, out_features=4096, bias=False) - (wk): Linear(in_features=4096, out_features=1024, bias=False) - (wv): Linear(in_features=4096, out_features=1024, bias=False) - (wo): Linear(in_features=4096, out_features=4096, bias=False) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - ) - (feed_forward): FeedForward( - (w1): Linear(in_features=4096, out_features=14336, bias=False) - (w2): Linear(in_features=14336, out_features=4096, bias=False) - (w3): Linear(in_features=4096, out_features=14336, bias=False) - ) - 
(attention_norm): RMSNorm() - (ffn_norm): RMSNorm() - ) - ) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - (norm): RMSNorm() - (output): Linear(in_features=4096, out_features=128256, bias=False) -) -[2025-09-03 14:45:17,673][root][INFO] - Exporting with: -[2025-09-03 14:45:17,674][root][INFO] - inputs: (tensor([[1, 2, 3]]),) -[2025-09-03 14:45:17,674][root][INFO] - kwargs: None -[2025-09-03 14:45:17,674][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) -[2025-09-03 14:45:33,074][root][INFO] - Running canonical pass: RemoveRedundantTransposes -[2025-09-03 14:45:33,152][root][INFO] - Using pt2e [] to quantizing the model... -[2025-09-03 14:45:33,152][root][INFO] - No quantizer provided, passing... -[2025-09-03 14:46:55,091][root][INFO] - Lowering model using following partitioner(s): -[2025-09-03 14:47:47,454][root][INFO] - Required memory for activation in bytes: [0, 26074624] -[2025-09-03 14:48:03,642][root][INFO] - Saved exported program to ./llama3_1.pte diff --git a/outputs/2025-09-03/15-17-23/.hydra/config.yaml b/outputs/2025-09-03/15-17-23/.hydra/config.yaml deleted file mode 100644 index 74c7f49c21f..00000000000 --- a/outputs/2025-09-03/15-17-23/.hydra/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . 
- output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false -save_exported_program: true diff --git a/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml b/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml deleted file mode 100644 index d224649ae3a..00000000000 --- a/outputs/2025-09-03/15-17-23/.hydra/hydra.yaml +++ /dev/null @@ -1,157 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - - ++save_exported_program=True - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/15-17-23 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml b/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml deleted file mode 100644 index fccd73d94f1..00000000000 --- a/outputs/2025-09-03/15-17-23/.hydra/overrides.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False -- ++save_exported_program=True diff --git a/outputs/2025-09-03/15-17-23/export_llm.log b/outputs/2025-09-03/15-17-23/export_llm.log deleted file mode 100644 index 9cdb4c31406..00000000000 --- a/outputs/2025-09-03/15-17-23/export_llm.log +++ /dev/null @@ -1,38 +0,0 @@ -[2025-09-03 15:17:23,719][root][INFO] - Applying quantizers: [] -[2025-09-03 15:17:25,710][root][INFO] - Checkpoint dtype: torch.bfloat16 -[2025-09-03 15:17:25,711][root][INFO] - Model after source transforms: Transformer( - (tok_embeddings): Embedding(128256, 4096) - (layers): ModuleList( - (0-31): 32 x TransformerBlock( - (attention): AttentionMHA( - (wq): Linear(in_features=4096, out_features=4096, bias=False) - (wk): Linear(in_features=4096, out_features=1024, bias=False) - (wv): Linear(in_features=4096, out_features=1024, bias=False) - (wo): Linear(in_features=4096, out_features=4096, bias=False) 
- (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - ) - (feed_forward): FeedForward( - (w1): Linear(in_features=4096, out_features=14336, bias=False) - (w2): Linear(in_features=14336, out_features=4096, bias=False) - (w3): Linear(in_features=4096, out_features=14336, bias=False) - ) - (attention_norm): RMSNorm() - (ffn_norm): RMSNorm() - ) - ) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - (norm): RMSNorm() - (output): Linear(in_features=4096, out_features=128256, bias=False) -) -[2025-09-03 15:17:25,712][root][INFO] - Exporting with: -[2025-09-03 15:17:25,712][root][INFO] - inputs: (tensor([[1, 2, 3]]),) -[2025-09-03 15:17:25,712][root][INFO] - kwargs: None -[2025-09-03 15:17:25,713][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) -[2025-09-03 15:17:39,308][root][INFO] - Running canonical pass: RemoveRedundantTransposes -[2025-09-03 15:17:39,376][root][INFO] - Using pt2e [] to quantizing the model... -[2025-09-03 15:17:39,377][root][INFO] - No quantizer provided, passing... -[2025-09-03 15:18:45,017][root][INFO] - Lowering model using following partitioner(s): diff --git a/outputs/2025-09-03/15-30-13/.hydra/config.yaml b/outputs/2025-09-03/15-30-13/.hydra/config.yaml deleted file mode 100644 index 74c7f49c21f..00000000000 --- a/outputs/2025-09-03/15-30-13/.hydra/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . 
- output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false -save_exported_program: true diff --git a/outputs/2025-09-03/15-30-13/.hydra/hydra.yaml b/outputs/2025-09-03/15-30-13/.hydra/hydra.yaml deleted file mode 100644 index e13edc3e222..00000000000 --- a/outputs/2025-09-03/15-30-13/.hydra/hydra.yaml +++ /dev/null @@ -1,157 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - - ++save_exported_program=True - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/15-30-13 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/15-30-13/.hydra/overrides.yaml b/outputs/2025-09-03/15-30-13/.hydra/overrides.yaml deleted file mode 100644 index fccd73d94f1..00000000000 --- a/outputs/2025-09-03/15-30-13/.hydra/overrides.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False -- ++save_exported_program=True diff --git a/outputs/2025-09-03/15-30-13/export_llm.log b/outputs/2025-09-03/15-30-13/export_llm.log deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/outputs/2025-09-03/16-25-46/.hydra/config.yaml b/outputs/2025-09-03/16-25-46/.hydra/config.yaml deleted file mode 100644 index 74c7f49c21f..00000000000 --- a/outputs/2025-09-03/16-25-46/.hydra/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - 
use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . - output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false -save_exported_program: true diff --git a/outputs/2025-09-03/16-25-46/.hydra/hydra.yaml b/outputs/2025-09-03/16-25-46/.hydra/hydra.yaml deleted file mode 100644 index f3b218f45ca..00000000000 --- a/outputs/2025-09-03/16-25-46/.hydra/hydra.yaml +++ /dev/null @@ -1,157 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - - ++save_exported_program=True - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/16-25-46 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/16-25-46/.hydra/overrides.yaml b/outputs/2025-09-03/16-25-46/.hydra/overrides.yaml deleted file mode 100644 index fccd73d94f1..00000000000 --- a/outputs/2025-09-03/16-25-46/.hydra/overrides.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False -- ++save_exported_program=True diff --git a/outputs/2025-09-03/16-25-46/export_llm.log b/outputs/2025-09-03/16-25-46/export_llm.log deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/outputs/2025-09-03/16-29-28/.hydra/config.yaml b/outputs/2025-09-03/16-29-28/.hydra/config.yaml deleted file mode 100644 index 74c7f49c21f..00000000000 --- a/outputs/2025-09-03/16-29-28/.hydra/config.yaml +++ /dev/null @@ -1,74 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - 
use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . - output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false -save_exported_program: true diff --git a/outputs/2025-09-03/16-29-28/.hydra/hydra.yaml b/outputs/2025-09-03/16-29-28/.hydra/hydra.yaml deleted file mode 100644 index 8490cd4d2cd..00000000000 --- a/outputs/2025-09-03/16-29-28/.hydra/hydra.yaml +++ /dev/null @@ -1,157 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - - ++save_exported_program=True - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False,++save_exported_program=True - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/16-29-28 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/16-29-28/.hydra/overrides.yaml b/outputs/2025-09-03/16-29-28/.hydra/overrides.yaml deleted file mode 100644 index fccd73d94f1..00000000000 --- a/outputs/2025-09-03/16-29-28/.hydra/overrides.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False -- ++save_exported_program=True diff --git a/outputs/2025-09-03/16-29-28/export_llm.log b/outputs/2025-09-03/16-29-28/export_llm.log deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/outputs/2025-09-03/16-30-46/.hydra/config.yaml b/outputs/2025-09-03/16-30-46/.hydra/config.yaml deleted file mode 100644 index 34a34cf92f9..00000000000 --- a/outputs/2025-09-03/16-30-46/.hydra/config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - 
use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . - output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false diff --git a/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml b/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml deleted file mode 100644 index 9960f35db88..00000000000 --- a/outputs/2025-09-03/16-30-46/.hydra/hydra.yaml +++ /dev/null @@ -1,156 +0,0 @@ -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} - sweep: - dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: null - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][HYDRA] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - loggers: - logging_example: - level: DEBUG - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: simple - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: RUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=RUN - task: - - ++base.model_class=llama3_1 - - ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - - ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json - - ++model.use_kv_cache=False - - ++model.use_sdpa_with_kv_cache=False - job: - name: export_llm - chdir: null - override_dirname: ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth,++base.model_class=llama3_1,++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json,++model.use_kv_cache=False,++model.use_sdpa_with_kv_cache=False - id: ??? - num: ??? - config_name: llm_config - env_set: {} - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /home/gasoonjia/executorch - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: '' - schema: structured - provider: schema - output_dir: /home/gasoonjia/executorch/outputs/2025-09-03/16-30-46 - choices: - hydra/env: default - hydra/callbacks: null - hydra/job_logging: default - hydra/hydra_logging: default - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml b/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml deleted file mode 100644 index 369364d85c9..00000000000 --- a/outputs/2025-09-03/16-30-46/.hydra/overrides.yaml +++ /dev/null @@ -1,5 +0,0 @@ -- ++base.model_class=llama3_1 -- ++base.checkpoint=/home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth -- ++base.params=/home/gasoonjia/Llama-3.1-8B/original/params.json -- ++model.use_kv_cache=False -- ++model.use_sdpa_with_kv_cache=False diff --git a/outputs/2025-09-03/16-30-46/export_llm.log b/outputs/2025-09-03/16-30-46/export_llm.log deleted file mode 100644 index ebb9f84e570..00000000000 --- a/outputs/2025-09-03/16-30-46/export_llm.log +++ /dev/null @@ -1,40 +0,0 @@ -[2025-09-03 16:30:46,353][root][INFO] - Applying quantizers: [] -[2025-09-03 16:30:52,013][root][INFO] - Checkpoint dtype: torch.bfloat16 -[2025-09-03 16:30:52,014][root][INFO] - Model after source transforms: Transformer( - (tok_embeddings): Embedding(128256, 4096) - (layers): ModuleList( - (0-31): 32 x TransformerBlock( - (attention): AttentionMHA( - (wq): Linear(in_features=4096, out_features=4096, bias=False) - (wk): Linear(in_features=4096, out_features=1024, bias=False) - (wv): Linear(in_features=4096, out_features=1024, bias=False) - (wo): Linear(in_features=4096, out_features=4096, bias=False) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - ) - (feed_forward): FeedForward( 
- (w1): Linear(in_features=4096, out_features=14336, bias=False) - (w2): Linear(in_features=14336, out_features=4096, bias=False) - (w3): Linear(in_features=4096, out_features=14336, bias=False) - ) - (attention_norm): RMSNorm() - (ffn_norm): RMSNorm() - ) - ) - (rope): Rope( - (apply_rotary_emb): RotaryEmbedding() - ) - (norm): RMSNorm() - (output): Linear(in_features=4096, out_features=128256, bias=False) -) -[2025-09-03 16:30:52,015][root][INFO] - Exporting with: -[2025-09-03 16:30:52,016][root][INFO] - inputs: (tensor([[1, 2, 3]]),) -[2025-09-03 16:30:52,016][root][INFO] - kwargs: None -[2025-09-03 16:30:52,016][root][INFO] - dynamic shapes: ({1: Dim('token_dim', min=0, max=127)},) -[2025-09-03 16:31:06,978][root][INFO] - Running canonical pass: RemoveRedundantTransposes -[2025-09-03 16:31:07,056][root][INFO] - Using pt2e [] to quantizing the model... -[2025-09-03 16:31:07,056][root][INFO] - No quantizer provided, passing... -[2025-09-03 16:32:22,170][root][INFO] - Lowering model using following partitioner(s): -[2025-09-03 16:33:19,737][root][INFO] - Required memory for activation in bytes: [0, 26074624] -[2025-09-03 16:33:33,215][root][INFO] - Saved exported program to ./llama3_1.pte diff --git a/saved_llm_config.yaml b/saved_llm_config.yaml deleted file mode 100644 index 34a34cf92f9..00000000000 --- a/saved_llm_config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -base: - model_class: llama3_1 - params: /home/gasoonjia/Llama-3.1-8B/original/params.json - checkpoint: /home/gasoonjia/Llama-3.1-8B/original/consolidated.00.pth - checkpoint_dir: null - adapter_checkpoint: null - adapter_config: null - tokenizer_path: null - metadata: null - use_lora: 0 - fairseq2: false - preq_mode: null - preq_group_size: 32 - preq_embedding_quantize: 8,0 -model: - dtype_override: fp32 - enable_dynamic_shape: true - use_shared_embedding: false - use_sdpa_with_kv_cache: false - expand_rope_table: false - use_attention_sink: null - output_prune_map: null - input_prune_map: null - use_kv_cache: false - quantize_kv_cache: false - local_global_attention: null -export: - max_seq_length: 128 - max_context_length: 128 - output_dir: . 
- output_name: null - so_library: null - export_only: false - foundation_weights_file: null -debug: - profile_memory: false - profile_path: null - generate_etrecord: false - generate_full_logits: false - verbose: false -quantization: - qmode: null - embedding_quantize: null - pt2e_quantize: null - group_size: null - use_spin_quant: null - use_qat: false - calibration_tasks: null - calibration_limit: null - calibration_seq_length: null - calibration_data: Once upon a time -backend: - xnnpack: - enabled: false - extended_ops: false - coreml: - enabled: false - enable_state: false - preserve_sdpa: false - quantize: null - ios: 15 - compute_units: cpu_only - vulkan: - enabled: false - qnn: - enabled: false - use_sha: false - soc_model: SM8650 - use_qnn_sha: false - optimized_rotation_path: null - num_sharding: 0 - mps: - enabled: false

From 3f22996f7f3bb911ade7d0def9402ccf678351ca Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 4 Sep 2025 00:12:14 -0700
Subject: [PATCH 34/50] remove unnecessary data from GitHub

---
 ...akzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json

diff --git a/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json b/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json
deleted file mode 100644
index 8c1a9f6d812..00000000000
--- a/clqzakzgm66sr5ylwpqob22jytv5wftwk7kydy3xl34trmmzc3sj.wrapper.json
+++ /dev/null
@@ -1 +0,0 @@
-{"nodes": [...]}  (single-line generated graph dump elided: dozens of near-identical "aten::slice_copy.Tensor" node entries slicing intermediate buffers buf5 through buf825 along dim 3 at offsets 0/32/64; the blob is truncated mid-entry in the source)
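The export_llm.log removed above records the export call: example inputs (tensor([[1, 2, 3]]),) with dynamic shapes ({1: Dim('token_dim', min=0, max=127)},), i.e. dimension 1 of the token tensor may vary up to max_seq_length - 1. For readers unfamiliar with that notation, here is a minimal, self-contained sketch of how such a dynamic dimension is declared with torch.export; TinyModel is a toy stand-in for illustration, not the llama3_1 model exported in the log.

import torch
from torch.export import Dim, export

# Toy stand-in for the exported model; the real log exported llama3_1.
class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(128, 16)
        self.out = torch.nn.Linear(16, 128, bias=False)

    def forward(self, tokens):  # tokens: int64 tensor of shape (1, seq_len)
        return self.out(self.emb(tokens))

# Mirrors the logged call: example input (tensor([[1, 2, 3]]),) and
# dynamic shapes ({1: Dim('token_dim', min=0, max=127)},) -- dim 1 of the
# first positional argument may range up to max_seq_length - 1 = 127.
token_dim = Dim("token_dim", min=0, max=127)
ep = export(
    TinyModel(),
    (torch.tensor([[1, 2, 3]]),),
    dynamic_shapes=({1: token_dim},),
)
print(ep)  # the ExportedProgram, with token_dim left symbolic in the graph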
"kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf19"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf39", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf38"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf40"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf41", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf38"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf42"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf45", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf44"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf46"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf47", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf44"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf48"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf50", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf51"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf72", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf71"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf73"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf74", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf71"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf75"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf78", "node": {"target": "aten::slice_copy.Tensor", 
"inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf77"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf79"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf80", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf77"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf81"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf83", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf84"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf104", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf103"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf105"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf106", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf103"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf107"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf110", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf109"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf111"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf112", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf109"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf113"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf115", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": 
"buf116"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf137", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf136"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf138"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf139", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf136"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf140"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf143", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf142"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf144"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf145", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf142"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf146"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf148", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf149"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf169", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf168"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf170"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf171", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf168"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf172"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf175", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf174"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 
1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf176"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf177", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf174"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf178"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf180", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf181"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf202", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf201"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf203"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf204", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf201"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf205"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf208", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf207"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf209"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf210", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf207"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf211"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf213", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf214"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf234", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": 
{"as_tensor": {"name": "buf233"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf235"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf236", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf233"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf237"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf240", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf239"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf241"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf242", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf239"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf243"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf245", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf246"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf267", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf266"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf268"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf269", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf266"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf270"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf273", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf272"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": 
"buf274"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf275", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf272"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf276"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf278", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf279"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf299", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf298"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf300"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf301", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf298"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf302"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf305", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf304"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf306"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf307", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf304"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf308"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf310", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf311"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf332", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf331"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", 
"arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf333"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf334", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf331"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf335"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf338", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf337"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf339"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf340", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf337"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf341"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf343", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf344"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf364", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf363"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf365"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf366", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf363"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf367"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf370", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf369"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf371"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf372", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": 
{"as_tensor": {"name": "buf369"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf373"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf375", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf376"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf397", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf396"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf398"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf399", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf396"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf400"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf403", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf402"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf404"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf405", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf402"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf406"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf408", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf409"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf429", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf428"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf430"}}], "metadata": {}, 
"is_hop_single_tensor_return": null}}, {"name": "buf431", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf428"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf432"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf435", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf434"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf436"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf437", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf434"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf438"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf440", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf441"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf462", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf461"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf463"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf464", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf461"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf465"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf468", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf467"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf469"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf470", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf467"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": 
{"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf471"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf473", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf474"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf494", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf493"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf495"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf496", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf493"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf497"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf500", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf499"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf501"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf502", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf499"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf503"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf505", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf506"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf527", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf526"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf528"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf529", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf526"}}, 
"kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf530"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf533", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf532"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf534"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf535", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf532"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf536"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf538", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf539"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf559", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf558"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf560"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf561", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf558"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf562"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf565", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf564"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf566"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf567", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf564"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf568"}}], "metadata": {}, "is_hop_single_tensor_return": 
null}}, {"name": "buf570", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf571"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf592", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf591"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf593"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf594", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf591"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf595"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf598", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf597"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf599"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf600", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf597"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf601"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf603", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf604"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf624", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf623"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf625"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf626", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf623"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": 
"step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf627"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf630", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf629"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf631"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf632", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf629"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf633"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf635", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf636"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf657", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf656"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf658"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf659", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf656"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf660"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf663", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf662"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf664"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf665", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf662"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf666"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf668", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": 
{"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf669"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf689", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf688"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf690"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf691", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf688"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf692"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf695", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf694"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf696"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf697", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf694"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf698"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf700", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf701"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf722", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf721"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf723"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf724", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf721"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf725"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf728", "node": 
{"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf727"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf729"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf730", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf727"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf731"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf733", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf734"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf754", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf753"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf755"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf756", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf753"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf757"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf760", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf759"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf761"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf762", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf759"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf763"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf765", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, 
"kind": 1}], "outputs": [{"as_tensor": {"name": "buf766"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf787", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf786"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf788"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf789", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf786"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf790"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf793", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf792"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf794"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf795", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf792"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf796"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf798", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf799"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf819", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf818"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf820"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf821", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf818"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf822"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf825", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf824"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, 
{"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf826"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf827", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf824"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf828"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf830", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf831"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf852", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf851"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf853"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf854", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf851"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf855"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf858", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf857"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf859"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf860", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf857"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf861"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf863", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf864"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf884", "node": {"target": 
"aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf883"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf885"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf886", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf883"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf887"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf890", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf889"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf891"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf892", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf889"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf893"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf895", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf896"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf917", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf916"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf918"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf919", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf916"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf920"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf923", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf922"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 
1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf924"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf925", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf922"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf926"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf928", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf929"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf949", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf948"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf950"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf951", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf948"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf952"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf955", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf954"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf956"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf957", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf954"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf958"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf960", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf961"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf982", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf981"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", 
"arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf983"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf984", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf981"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf985"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf988", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf987"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf989"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf990", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf987"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf991"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf993", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf994"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1014", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1013"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1015"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1016", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1013"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1017"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1020", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1019"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 64}, "kind": 1}, {"name": "end", "arg": {"as_int": 9223372036854775807}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1021"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1022", "node": {"target": 
"aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf1019"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 64}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1023"}}], "metadata": {}, "is_hop_single_tensor_return": null}}, {"name": "buf1025", "node": {"target": "aten::slice_copy.Tensor", "inputs": [{"name": "self", "arg": {"as_tensor": {"name": "buf17"}}, "kind": 1}, {"name": "dim", "arg": {"as_int": 3}, "kind": 1}, {"name": "start", "arg": {"as_int": 0}, "kind": 1}, {"name": "end", "arg": {"as_int": 32}, "kind": 1}, {"name": "step", "arg": {"as_int": 1}, "kind": 1}], "outputs": [{"as_tensor": {"name": "buf1026"}}], "metadata": {}, "is_hop_single_tensor_return": null}}]} \ No newline at end of file From cc06edacd6be47ba917757f7d4d7e3825664c0a4 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 4 Sep 2025 16:30:55 -0700 Subject: [PATCH 35/50] add support fallback kernels check --- backends/aoti/aoti_backend.py | 54 ++++- backends/aoti/runtime/shims/utils.cpp | 326 +++++++++++++------------- backends/aoti/runtime/shims/utils.h | 4 +- requirements-dev.txt | 1 - requirements-examples.txt | 2 +- 5 files changed, 218 insertions(+), 169 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index f785da00783..d4d30773fb9 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -4,13 +4,14 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import contextlib import copy import os import shutil import typing from subprocess import check_call -from typing import final, List +from typing import Any, Dict, final, List, Optional, Set import torch from executorch.exir.backend.backend_details import ( @@ -19,6 +20,48 @@ PreprocessResult, ) from executorch.exir.backend.compile_spec_schema import CompileSpec +from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu + + +# exist fallback operators in et namespace; +supported_fallback_kernels: Dict[str, Any] = {} + +# required fallback kernels but not supported +missing_fallback_kernels: Set[str] = set() + + +# context manager for non-fallback guarantee +# it will raise exception when generating fallback kernels during aoti compile +@contextlib.contextmanager +def raise_on_generate_fall_back_call(): + original_generate_c_shim_extern_kernel_call = ( + CppWrapperCpu.generate_c_shim_extern_kernel_call + ) + + def generate_supported_c_shim_extern_kernel_call( + self, + kernel: str, + args: list[str], + device: str, + *, + debug_args: Optional[list[str]] = None, + ): + if kernel in supported_fallback_kernels: + original_generate_c_shim_extern_kernel_call( + self, kernel, args, device, debug_args=debug_args + ) + else: + missing_fallback_kernels.add(kernel) + + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( + generate_supported_c_shim_extern_kernel_call + ) + try: + yield + finally: + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( + original_generate_c_shim_extern_kernel_call + ) @final @@ -50,7 +93,14 @@ def preprocess( "max_autotune_conv_backends": "TRITON", } - so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] + with raise_on_generate_fall_back_call(): + so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # 
type: ignore[arg-type] + if len(missing_fallback_kernels) > 0: + formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels)) + raise RuntimeError( + f"Missing fallback kernels ({len(missing_fallback_kernels)} total):\n - {formatted_kernels}\n" + "Please add them to the AOTI backend." + ) assert so_path == output_path, f"Expected {output_path} but got {so_path}" diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp index e81e141e7fd..a9dc5c84eb7 100644 --- a/backends/aoti/runtime/shims/utils.cpp +++ b/backends/aoti/runtime/shims/utils.cpp @@ -25,169 +25,169 @@ const char* const TENSOR_OUTPUT_FILENAME = extern "C" { -void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg) { - printf("Printing tensor handle: %p\n", self); - - if (!self) { - throw std::runtime_error("Tensor handle is null"); - } - - printf("Tensor handle is not null\n"); - - // Get dtype and check if it's float32 (dtype 6 in PyTorch) - int32_t dtype = 0; - if (aoti_torch_get_dtype(self, &dtype) != AOTI_TORCH_SUCCESS) { - throw std::runtime_error("Failed to get tensor dtype"); - } - - printf("Tensor dtype is: %d\n", dtype); - - if (dtype != 6) { // 6 is the dtype code for float32 in PyTorch - throw std::runtime_error( - "Tensor dtype is not float32. Expected dtype 6, got: " + - std::to_string(dtype)); - } - - printf("Tensor dtype is float32\n"); - - // Get data pointer - void* data_ptr = nullptr; - if (aoti_torch_get_data_ptr(self, &data_ptr) != AOTI_TORCH_SUCCESS || - !data_ptr) { - throw std::runtime_error("Failed to get tensor data pointer"); - } - - printf("Tensor data pointer is %p not null\n", data_ptr); - - // Get dimensions - int64_t dim = 0; - if (aoti_torch_get_dim(self, &dim) != AOTI_TORCH_SUCCESS) { - throw std::runtime_error("Failed to get tensor dimensions"); - } - - printf("Tensor dimensions are: %ld\n", dim); - - // Get sizes - int64_t* sizes = nullptr; - if (aoti_torch_get_sizes(self, &sizes) != AOTI_TORCH_SUCCESS || !sizes) { - throw std::runtime_error("Failed to get tensor sizes"); - } - - printf("Tensor sizes are: %ld\n", sizes); - - // Calculate total number of elements - int64_t total_elements = 1; - for (int i = 0; i < dim; i++) { - total_elements *= sizes[i]; - } - - printf("Total elements in tensor: %ld\n", total_elements); - - // Check device type to handle CUDA tensors properly - int32_t device_type = 0; - if (aoti_torch_get_device_type(self, &device_type) != AOTI_TORCH_SUCCESS) { - throw std::runtime_error("Failed to get tensor device type"); - } - - printf("Tensor device type: %d\n", device_type); - - AtenTensorHandle cpu_tensor = nullptr; - const float* float_data = nullptr; - bool need_cleanup = false; - - // Check if tensor is on CUDA (device_type 1 is CUDA) - if (device_type == 1) { - printf("Tensor is on CUDA, copying to CPU...\n"); - - // Get strides for creating CPU tensor - int64_t* strides = nullptr; - if (aoti_torch_get_strides(self, &strides) != AOTI_TORCH_SUCCESS || - !strides) { - throw std::runtime_error("Failed to get tensor strides"); - } - - // Create a CPU tensor with same shape and layout - if (aoti_torch_empty_strided( - dim, sizes, strides, dtype, 0, -1, &cpu_tensor) != - AOTI_TORCH_SUCCESS) { - throw std::runtime_error("Failed to create CPU tensor"); - } - - // Copy data from CUDA to CPU tensor - if (aoti_torch_copy_(cpu_tensor, self, 0) != AOTI_TORCH_SUCCESS) { - aoti_torch_delete_tensor_object(cpu_tensor); - throw std::runtime_error("Failed to copy tensor from CUDA to CPU"); - } - - // Get CPU data pointer - 
void* cpu_data_ptr = nullptr; - if (aoti_torch_get_data_ptr(cpu_tensor, &cpu_data_ptr) != - AOTI_TORCH_SUCCESS || - !cpu_data_ptr) { - aoti_torch_delete_tensor_object(cpu_tensor); - throw std::runtime_error("Failed to get CPU tensor data pointer"); - } - - float_data = static_cast(cpu_data_ptr); - need_cleanup = true; - printf("Successfully copied CUDA tensor to CPU\n"); - } else { - // Tensor is already on CPU, use original data pointer - printf("Tensor is on CPU, using original data pointer\n"); - float_data = static_cast(data_ptr); - } - - // Open file for writing (append mode to not overwrite previous outputs) - printf("Writing tensor to file: %s\n", internal::TENSOR_OUTPUT_FILENAME); - - std::ofstream output_file( - internal::TENSOR_OUTPUT_FILENAME, std::ios::out | std::ios::app); - if (!output_file.is_open()) { - if (need_cleanup) { - aoti_torch_delete_tensor_object(cpu_tensor); - } - throw std::runtime_error( - "Failed to open output file: " + - std::string(internal::TENSOR_OUTPUT_FILENAME)); - } - - printf("Successfully opened file for writing\n"); - - // Write message and tensor info to file - output_file << "=== " << msg << " ===" << std::endl; - output_file << "Device type: " << device_type << std::endl; - output_file << "Dimensions: " << dim << std::endl; - output_file << "Sizes: ["; - for (int i = 0; i < dim; i++) { - output_file << sizes[i]; - if (i < dim - 1) - output_file << ", "; - } - output_file << "]" << std::endl; - output_file << "Total elements: " << total_elements << std::endl; - output_file << "Data content:" << std::endl; - - // Write tensor data to file (now safe to access) - for (int64_t i = 0; i < total_elements; i++) { - output_file << float_data[i] << " "; - if (i < total_elements - 1) { - output_file << ", "; - // Add newline every 10 elements for readability - if ((i + 1) % 10 == 0) { - output_file << std::endl; - } - } - } - output_file << std::endl << std::endl; - - // Clean up CPU tensor if we created one - if (need_cleanup) { - aoti_torch_delete_tensor_object(cpu_tensor); - printf("Cleaned up temporary CPU tensor\n"); - } - - // File will be automatically closed when output_file goes out of scope -} +// void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg) { +// printf("Printing tensor handle: %p\n", self); + +// if (!self) { +// throw std::runtime_error("Tensor handle is null"); +// } + +// printf("Tensor handle is not null\n"); + +// // Get dtype and check if it's float32 (dtype 6 in PyTorch) +// int32_t dtype = 0; +// if (aoti_torch_get_dtype(self, &dtype) != AOTI_TORCH_SUCCESS) { +// throw std::runtime_error("Failed to get tensor dtype"); +// } + +// printf("Tensor dtype is: %d\n", dtype); + +// if (dtype != 6) { // 6 is the dtype code for float32 in PyTorch +// throw std::runtime_error( +// "Tensor dtype is not float32. 
Expected dtype 6, got: " + +// std::to_string(dtype)); +// } + +// printf("Tensor dtype is float32\n"); + +// // Get data pointer +// void* data_ptr = nullptr; +// if (aoti_torch_get_data_ptr(self, &data_ptr) != AOTI_TORCH_SUCCESS || +// !data_ptr) { +// throw std::runtime_error("Failed to get tensor data pointer"); +// } + +// printf("Tensor data pointer is %p not null\n", data_ptr); + +// // Get dimensions +// int64_t dim = 0; +// if (aoti_torch_get_dim(self, &dim) != AOTI_TORCH_SUCCESS) { +// throw std::runtime_error("Failed to get tensor dimensions"); +// } + +// printf("Tensor dimensions are: %ld\n", dim); + +// // Get sizes +// int64_t* sizes = nullptr; +// if (aoti_torch_get_sizes(self, &sizes) != AOTI_TORCH_SUCCESS || !sizes) { +// throw std::runtime_error("Failed to get tensor sizes"); +// } + +// printf("Tensor sizes are: %ld\n", sizes); + +// // Calculate total number of elements +// int64_t total_elements = 1; +// for (int i = 0; i < dim; i++) { +// total_elements *= sizes[i]; +// } + +// printf("Total elements in tensor: %ld\n", total_elements); + +// // Check device type to handle CUDA tensors properly +// int32_t device_type = 0; +// if (aoti_torch_get_device_type(self, &device_type) != AOTI_TORCH_SUCCESS) { +// throw std::runtime_error("Failed to get tensor device type"); +// } + +// printf("Tensor device type: %d\n", device_type); + +// AtenTensorHandle cpu_tensor = nullptr; +// const float* float_data = nullptr; +// bool need_cleanup = false; + +// // Check if tensor is on CUDA (device_type 1 is CUDA) +// if (device_type == 1) { +// printf("Tensor is on CUDA, copying to CPU...\n"); + +// // Get strides for creating CPU tensor +// int64_t* strides = nullptr; +// if (aoti_torch_get_strides(self, &strides) != AOTI_TORCH_SUCCESS || +// !strides) { +// throw std::runtime_error("Failed to get tensor strides"); +// } + +// // Create a CPU tensor with same shape and layout +// if (aoti_torch_empty_strided( +// dim, sizes, strides, dtype, 0, -1, &cpu_tensor) != +// AOTI_TORCH_SUCCESS) { +// throw std::runtime_error("Failed to create CPU tensor"); +// } + +// // Copy data from CUDA to CPU tensor +// if (aoti_torch_copy_(cpu_tensor, self, 0) != AOTI_TORCH_SUCCESS) { +// aoti_torch_delete_tensor_object(cpu_tensor); +// throw std::runtime_error("Failed to copy tensor from CUDA to CPU"); +// } + +// // Get CPU data pointer +// void* cpu_data_ptr = nullptr; +// if (aoti_torch_get_data_ptr(cpu_tensor, &cpu_data_ptr) != +// AOTI_TORCH_SUCCESS || +// !cpu_data_ptr) { +// aoti_torch_delete_tensor_object(cpu_tensor); +// throw std::runtime_error("Failed to get CPU tensor data pointer"); +// } + +// float_data = static_cast(cpu_data_ptr); +// need_cleanup = true; +// printf("Successfully copied CUDA tensor to CPU\n"); +// } else { +// // Tensor is already on CPU, use original data pointer +// printf("Tensor is on CPU, using original data pointer\n"); +// float_data = static_cast(data_ptr); +// } + +// // Open file for writing (append mode to not overwrite previous outputs) +// printf("Writing tensor to file: %s\n", internal::TENSOR_OUTPUT_FILENAME); + +// std::ofstream output_file( +// internal::TENSOR_OUTPUT_FILENAME, std::ios::out | std::ios::app); +// if (!output_file.is_open()) { +// if (need_cleanup) { +// aoti_torch_delete_tensor_object(cpu_tensor); +// } +// throw std::runtime_error( +// "Failed to open output file: " + +// std::string(internal::TENSOR_OUTPUT_FILENAME)); +// } + +// printf("Successfully opened file for writing\n"); + +// // Write message and tensor info to file +// 
output_file << "=== " << msg << " ===" << std::endl; +// output_file << "Device type: " << device_type << std::endl; +// output_file << "Dimensions: " << dim << std::endl; +// output_file << "Sizes: ["; +// for (int i = 0; i < dim; i++) { +// output_file << sizes[i]; +// if (i < dim - 1) +// output_file << ", "; +// } +// output_file << "]" << std::endl; +// output_file << "Total elements: " << total_elements << std::endl; +// output_file << "Data content:" << std::endl; + +// // Write tensor data to file (now safe to access) +// for (int64_t i = 0; i < total_elements; i++) { +// output_file << float_data[i] << " "; +// if (i < total_elements - 1) { +// output_file << ", "; +// // Add newline every 10 elements for readability +// if ((i + 1) % 10 == 0) { +// output_file << std::endl; +// } +// } +// } +// output_file << std::endl << std::endl; + +// // Clean up CPU tensor if we created one +// if (need_cleanup) { +// aoti_torch_delete_tensor_object(cpu_tensor); +// printf("Cleaned up temporary CPU tensor\n"); +// } + +// // File will be automatically closed when output_file goes out of scope +// } // Function to cleanup the tensor output file (to be called from // aoti_backend.cpp) diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h index c0c2a59be0a..06d2edce212 100644 --- a/backends/aoti/runtime/shims/utils.h +++ b/backends/aoti/runtime/shims/utils.h @@ -18,8 +18,8 @@ namespace aoti { extern "C" { -// Utility function for printing tensor information -void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg); +// // Utility function for printing tensor information +// void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg); // Cleanup function for tensor output file (called during backend destruction) void cleanup_aoti_tensor_output(); diff --git a/requirements-dev.txt b/requirements-dev.txt index 964bdecef76..8c8f518a5ea 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -10,4 +10,3 @@ certifi # Imported by resolve_buck.py. lintrunner==0.12.7 lintrunner-adapters==0.12.6 patchelf -transformers diff --git a/requirements-examples.txt b/requirements-examples.txt index 0923cf8fefc..26ac1ad9279 100644 --- a/requirements-examples.txt +++ b/requirements-examples.txt @@ -4,4 +4,4 @@ datasets == 3.6.0 # 4.0.0 deprecates trust_remote_code and load scripts. 
For now timm == 1.0.7 torchsr == 1.0.4 torchtune >= 0.6.1 -transformers == 4.53.1 +transformers == 4.52.4 From ef191c4c3cd177e55c78b599dfaba058da3c2449 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 4 Sep 2025 17:03:17 -0700 Subject: [PATCH 36/50] collect missing kernels while always generated fallback --- backends/aoti/aoti_backend.py | 18 ++-- backends/aoti/aoti_partitioner.py | 164 ------------------------------ export_aoti.py | 8 +- 3 files changed, 13 insertions(+), 177 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index d4d30773fb9..e6244ae9346 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -33,12 +33,12 @@ # context manager for non-fallback guarantee # it will raise exception when generating fallback kernels during aoti compile @contextlib.contextmanager -def raise_on_generate_fall_back_call(): +def collect_unsupported_fallback_kernels(): original_generate_c_shim_extern_kernel_call = ( CppWrapperCpu.generate_c_shim_extern_kernel_call ) - def generate_supported_c_shim_extern_kernel_call( + def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels( self, kernel: str, args: list[str], @@ -46,15 +46,15 @@ def generate_supported_c_shim_extern_kernel_call( *, debug_args: Optional[list[str]] = None, ): - if kernel in supported_fallback_kernels: - original_generate_c_shim_extern_kernel_call( - self, kernel, args, device, debug_args=debug_args - ) - else: + if kernel not in supported_fallback_kernels: missing_fallback_kernels.add(kernel) + original_generate_c_shim_extern_kernel_call( + self, kernel, args, device, debug_args=debug_args + ) + CppWrapperCpu.generate_c_shim_extern_kernel_call = ( - generate_supported_c_shim_extern_kernel_call + generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels ) try: yield @@ -93,7 +93,7 @@ def preprocess( "max_autotune_conv_backends": "TRITON", } - with raise_on_generate_fall_back_call(): + with collect_unsupported_fallback_kernels(): so_path = torch._inductor.aot_compile(edge_program_module, args, kwargs, options=options) # type: ignore[arg-type] if len(missing_fallback_kernels) > 0: formatted_kernels = "\n - ".join(sorted(missing_fallback_kernels)) diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py index 6dfe888fec8..6aeb63f959d 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/aoti_partitioner.py @@ -24,170 +24,6 @@ from torch.fx.passes.operator_support import OperatorSupportBase -# exist fallback operators in et namespace; should map to inductor_fallback_ops -supported_fallback_operators: Dict[str, Dict[str, List[str]]] = {} - -inductor_fallback_ops: Set[str] = { - "aten._adaptive_avg_pool2d_backward.default", - "aten._adaptive_avg_pool2d.default", - "aten._adaptive_avg_pool3d_backward.default", - "aten._adaptive_avg_pool3d.default", - "aten._addmm_activation.default", - "aten._cdist_backward.default", - "aten._cdist_forward.default", - "aten._cudnn_rnn.default", - "aten._dyn_quant_matmul_4bit.default", - "aten._dyn_quant_pack_4bit_weight.default", - "aten._efficient_attention_backward.default", - "aten._efficient_attention_forward.default", - "aten._efficientzerotensor.default", - "aten._embedding_bag_dense_backward.default", - "aten._embedding_bag_forward_only.default", - "aten._embedding_bag_per_sample_weights_backward.default", - "aten._embedding_bag.default", - "aten._fft_c2c.default", - "aten._fft_r2c.default", - "aten._flash_attention_backward.default", - 
"aten._flash_attention_forward.default", - "aten._fused_moving_avg_obs_fq_helper_functional.default", - "aten._fused_moving_avg_obs_fq_helper.default", - "aten._fused_rms_norm.default", - "aten._histogramdd_from_bin_cts.default", - "aten._int_mm.out", - "aten._pdist_backward.default", - "aten._pdist_forward.default", - "aten._scaled_dot_product_attention_math_for_mps.default", - "aten._scaled_dot_product_cudnn_attention_backward.default", - "aten._scaled_dot_product_cudnn_attention.default", - "aten._scaled_dot_product_efficient_attention_backward.default", - "aten._scaled_dot_product_efficient_attention.default", - "aten._scaled_dot_product_flash_attention_backward.default", - "aten._scaled_dot_product_flash_attention_for_cpu_backward.default", - "aten._scaled_dot_product_flash_attention_for_cpu.default", - "aten._scaled_dot_product_flash_attention.default", - "aten._scaled_dot_product_fused_attention_overrideable_backward.default", - "aten._scaled_dot_product_fused_attention_overrideable.default", - "aten._scaled_mm.default", - "aten._scaled_mm.out", - "aten._segment_reduce_backward.default", - "aten._thnn_fused_lstm_cell.default", - "aten._to_sparse.default", - "aten._trilinear.default", - "aten._weight_int4pack_mm.default", - "aten._weight_int8pack_mm.default", - "aten.abs.default", - "aten.adaptive_max_pool2d_backward.default", - "aten.adaptive_max_pool2d.default", - "aten.adaptive_max_pool3d_backward.default", - "aten.adaptive_max_pool3d.default", - "aten.add.Scalar", - "aten.add.Tensor", - "aten.addbmm.default", - "aten.addmm.out", - "aten.addmv.default", - "aten.angle.default", - "aten.avg_pool2d_backward.default", - "aten.avg_pool2d.default", - "aten.avg_pool3d_backward.default", - "aten.avg_pool3d.default", - "aten.baddbmm.out", - "aten.bernoulli_.float", - "aten.bernoulli_.Tensor", - "aten.bmm.out", - "aten.bucketize.Tensor", - "aten.cat.default", - "aten.cholesky_inverse.default", - "aten.cholesky_solve.default", - "aten.convolution_backward.default", - "aten.convolution.default", - "aten.cummax.default", - "aten.cummin.default", - "aten.cumprod.default", - "aten.cumsum.default", - "aten.exponential.default", - "aten.fill_.Scalar", - "aten.fractional_max_pool2d_backward.default", - "aten.fractional_max_pool2d.default", - "aten.fractional_max_pool3d_backward.default", - "aten.fractional_max_pool3d.default", - "aten.gcd.default", - "aten.geqrf.default", - "aten.grid_sampler_2d_backward.default", - "aten.hann_window.default", - "aten.histc.default", - "aten.histogram.bin_ct", - "aten.index_put.default", - "aten.index_reduce.default", - "aten.index.Tensor", - "aten.kthvalue.default", - "aten.logcumsumexp.default", - "aten.lu_unpack.default", - "aten.masked_scatter_backward.default", - "aten.masked_scatter.default", - "aten.masked_select.default", - "aten.max_pool2d_with_indices_backward.default", - "aten.max_pool2d_with_indices.default", - "aten.max_pool3d_with_indices_backward.default", - "aten.max_pool3d_with_indices.default", - "aten.max_unpool2d.default", - "aten.max_unpool3d.default", - "aten.median.default", - "aten.mm.out", - "aten.mode.default", - "aten.mul.Scalar", - "aten.mul.Tensor", - "aten.nanmedian.default", - "aten.narrow.default", - "aten.native_dropout.default", - "aten.nonzero.default", - "aten.normal_functional.default", - "aten.ormqr.default", - "aten.pad.default", - "aten.permute.default", - "aten.polar.default", - "aten.pow.Scalar", - "aten.pow.Tensor_Scalar", - "aten.pow.Tensor_Tensor", - "aten.rand.default", - "aten.rand.generator", - "aten.randint.default", 
- "aten.randint.generator", - "aten.randint.low_out", - "aten.randint.low", - "aten.randn.default", - "aten.randn.generator", - "aten.randperm.default", - "aten.repeat_interleave.Tensor", - "aten.replication_pad1d_backward.default", - "aten.replication_pad2d_backward.default", - "aten.reshape.default", - "aten.resize_.default", - "aten.resize_as_.default", - "aten.scatter_reduce.two_out", - "aten.scatter.src_out", - "aten.scatter.value_out", - "aten.searchsorted.Scalar", - "aten.searchsorted.Tensor", - "aten.segment_reduce.default", - "aten.set_.source_Tensor", - "aten.slice.Tensor", - "aten.soft_margin_loss_backward.default", - "aten.sort.default", - "aten.sort.stable", - "aten.squeeze.dim", - "aten.to_sparse.default", - "aten.topk.default", - "aten.triangular_solve.default", - "aten.uniform.default", - "aten.upsample_bicubic2d_backward.default", - "aten.upsample_linear1d_backward.default", - "aten.upsample_trilinear3d_backward.default", - "aten.view_as_complex.default", - "aten.view_as_real.default", - "aten.view.dtype", - "aten._weight_int4pack_mm_with_scales_and_zeros.default", -} - class AOTISupportedOperators(OperatorSupportBase): def is_node_supported(self, submodules, node: torch.fx.Node) -> bool: diff --git a/export_aoti.py b/export_aoti.py index c1c24d212ef..d720086ac0f 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -167,15 +167,15 @@ def forward(self, x): class Llama31(torch.nn.Module): - def __init__(self, model_id="meta-llama/Meta-Llama-3.1-8B"): + def __init__(self, model_id="meta-llama/Meta-Llama-3.1-8B", use_cache=False): super(Llama31, self).__init__() # Load Llama 3.1 model from HF + self.use_cache = use_cache self.model = AutoModelForCausalLM.from_pretrained( model_id, torch_dtype=torch.float32, device_map="cuda", - # trust_remote_code=True, - use_cache=False, # Turn off KV cache + use_cache=self.use_cache, # Turn off KV cache ) self.model.eval() @@ -185,7 +185,7 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): outputs = self.model( input_ids=input_ids, attention_mask=attention_mask, - use_cache=False, # Explicitly turn off KV cache + use_cache=self.use_cache, # Explicitly turn off KV cache ) return outputs.logits From 62fbd92df41c76dc2dea24a69238cc484d5cee67 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Tue, 9 Sep 2025 16:32:38 -0700 Subject: [PATCH 37/50] use ptd pipeline on .so file --- CMakeLists.txt | 6 + backends/aoti/aoti_backend.py | 15 +- backends/aoti/runtime/aoti_backend.cpp | 28 +++- .../executor_runner/executor_runner.cpp | 43 +++++- examples/portable/executor_runner/targets.bzl | 2 + export_and_run_aoti.sh | 6 +- export_aoti.py | 135 +++++++++++++++--- 7 files changed, 209 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f98246e1851..e3debc9fcf5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,6 +107,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_SKIP_BUILD_RPATH OFF) # Don't use the install-rpath during the build phase set(CMAKE_BUILD_WITH_INSTALL_RPATH ON) + # Automatically add all linked folders that are NOT in the build directory to # the rpath (per library?) 
#
@@ -1014,6 +1015,11 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
       extension_runner_util gflags executorch_backends
   )
+  # Add flat tensor extension if it's built
+  if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+    list(APPEND _executor_runner_libs extension_flat_tensor)
+  endif()
+
   if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   elseif(EXECUTORCH_BUILD_CADENCE)
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index e6244ae9346..986aa938888 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -7,13 +7,13 @@
 import contextlib
 import copy
 import os
-import shutil
 import typing
 from subprocess import check_call
 from typing import Any, Dict, final, List, Optional, Set

 import torch
+from executorch.exir._serialize._named_data_store import NamedDataStore
 from executorch.exir.backend.backend_details import (
     BackendDetails,
     ExportedProgram,
@@ -72,6 +72,7 @@ def preprocess(
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
         print("entering the lowerable parts in AotiBackend.preprocess....")
+        named_data_store = NamedDataStore()

         # print("here", edge_program.example_inputs)
         copy_edge_program = copy.deepcopy(edge_program)
@@ -88,6 +89,7 @@ def preprocess(
         options: dict[str, typing.Any] = {
             "aot_inductor.package_constants_in_so": True,
             "aot_inductor.output_path": output_path,
+            "aot_inductor.force_mmap_weights": False,
             "max_autotune": True,
             "max_autotune_gemm_backends": "TRITON",
             "max_autotune_conv_backends": "TRITON",
@@ -111,4 +113,13 @@ def preprocess(

         print("so_path", so_path)

-        return PreprocessResult(so_path.encode("utf-8"))
+        with open(so_path, "rb") as f:
+            so_data = f.read()
+
+        named_data_store.add_named_data("so_blob", so_data, 1, "aoti_cuda_blob")
+
+        return PreprocessResult(
+            processed_bytes=b"",
+            debug_handle_map={},
+            data_store_output=named_data_store.get_named_data_store_output(),
+        )
diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp
index 03c46c03bdd..24d935d579e 100644
--- a/backends/aoti/runtime/aoti_backend.cpp
+++ b/backends/aoti/runtime/aoti_backend.cpp
@@ -51,6 +51,7 @@ using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::FreeableBuffer;
 using executorch::runtime::MemoryAllocator;
+using executorch::runtime::NamedDataMap;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 using executorch::runtime::etensor::Tensor;
@@ -69,15 +70,34 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
   // Once per loaded binary blob
   Result<DelegateHandle*> init(
       BackendInitContext& context,
-      FreeableBuffer* processed, // This will be the buffer from aoti_backend
+      FreeableBuffer* processed, // This will be an empty buffer
       ArrayRef<CompileSpec> compile_specs // This will be my empty list
   ) const override {
-    const char* so_path = static_cast<const char*>(processed->data());
+    // const char* so_path = static_cast<const char*>(processed->data());

-    printf("so path: %s\n", so_path);
+    // printf("so path: %s\n", so_path);
+
+    const NamedDataMap* named_data_map = context.get_named_data_map();
+
+    std::string so_path = "/tmp/test.so";
+    std::string so_blob_key = "so_blob";
+
+    Result<FreeableBuffer> aoti_cuda_buffer =
+        named_data_map->get_data(aoti_cuda_blob_name.c_str());
+
+    // Create a temporary file
+    std::ofstream outfile(so_path.c_str(), std::ios::binary);
+
+    // Write the ELF buffer to the temporary file; size() is already a byte
+    // count, so it must not be scaled by sizeof(void*)
+    outfile.write(
+        (char*)aoti_cuda_buffer->data(),
+        aoti_cuda_buffer->size());
+
+    // Finish writing the file to disk
+    outfile.close();

     // Load the ELF using dlopen
-    void* so_handle = dlopen(so_path, RTLD_LAZY | RTLD_LOCAL);
+    void* so_handle = dlopen(so_path.c_str(), RTLD_LAZY | RTLD_LOCAL);
     if (so_handle == nullptr) {
       std::cout << dlerror() << std::endl;
       return Error::AccessFailed;
diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp
index 4a4b659c748..37029d150b8 100644
--- a/examples/portable/executor_runner/executor_runner.cpp
+++ b/examples/portable/executor_runner/executor_runner.cpp
@@ -26,6 +26,7 @@
 #include
 #include
+#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
 #include
 #include
 #include
@@ -50,6 +51,10 @@ DEFINE_string(
     model_path,
     "model.pte",
     "Model serialized in flatbuffer format.");
+DEFINE_string(
+    data_path,
+    "",
+    "Path to external tensor data file (.ptd format). Optional.");
 DEFINE_uint32(num_executions, 1, "Number of times to run the model.");
 #ifdef ET_EVENT_TRACER_ENABLED
 DEFINE_string(etdump_path, "model.etdump", "Write ETDump data to this path.");
@@ -60,6 +65,7 @@ DEFINE_int32(
     "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");

 using executorch::extension::FileDataLoader;
+using executorch::extension::FlatTensorDataMap;
 using executorch::runtime::Error;
 using executorch::runtime::EValue;
 using executorch::runtime::EventTracer;
@@ -242,8 +248,43 @@ int main(int argc, char** argv) {
   // be used by a single thread at a time, but it can be reused.
   //
   EventTraceManager tracer;
+
+  // Handle optional external tensor data loading
+  std::unique_ptr<FileDataLoader> data_loader;
+  std::unique_ptr<FlatTensorDataMap> data_map;
+
+  if (!FLAGS_data_path.empty()) {
+    ET_LOG(
+        Info, "Loading external tensor data from %s", FLAGS_data_path.c_str());
+
+    // Create FileDataLoader for the PTD file
+    Result<FileDataLoader> data_loader_result =
+        FileDataLoader::from(FLAGS_data_path.c_str());
+    ET_CHECK_MSG(
+        data_loader_result.ok(),
+        "Failed to create FileDataLoader for data path %s: 0x%" PRIx32,
+        FLAGS_data_path.c_str(),
+        (uint32_t)data_loader_result.error());
+
+    data_loader =
+        std::make_unique<FileDataLoader>(std::move(data_loader_result.get()));
+
+    // Create FlatTensorDataMap from the loaded blob
+    Result<FlatTensorDataMap> data_map_result =
+        FlatTensorDataMap::load(data_loader.get());
+    ET_CHECK_MSG(
+        data_map_result.ok(),
+        "Failed to load FlatTensorDataMap from %s: 0x%" PRIx32,
+        FLAGS_data_path.c_str(),
+        (uint32_t)data_map_result.error());
+
+    data_map =
+        std::make_unique<FlatTensorDataMap>(std::move(data_map_result.get()));
+    ET_LOG(Info, "External tensor data loaded successfully");
+  }
+
   Result<Method> method = program->load_method(
-      method_name, &memory_manager, tracer.get_event_tracer());
+      method_name, &memory_manager, tracer.get_event_tracer(), data_map.get());
   ET_CHECK_MSG(
       method.ok(),
       "Loading of method %s failed with status 0x%" PRIx32,
diff --git a/examples/portable/executor_runner/targets.bzl b/examples/portable/executor_runner/targets.bzl
index 0af45d85075..d1304a84bcb 100644
--- a/examples/portable/executor_runner/targets.bzl
+++ b/examples/portable/executor_runner/targets.bzl
@@ -19,6 +19,7 @@ def define_common_targets():
             "//executorch/devtools/etdump:etdump_flatcc",
             "//executorch/extension/data_loader:file_data_loader",
             "//executorch/extension/evalue_util:print_evalue",
+            "//executorch/extension/flat_tensor:flat_tensor_data_map",
             "//executorch/extension/runner_util:inputs",
         ],
         external_deps = [
@@ -38,6 +39,7 @@ def define_common_targets():
             "//executorch/runtime/executor:program",
             "//executorch/extension/data_loader:file_data_loader",
"//executorch/extension/evalue_util:print_evalue", + "//executorch/extension/flat_tensor:flat_tensor_data_map", "//executorch/extension/runner_util:inputs", "//executorch/extension/threadpool:cpuinfo_utils", "//executorch/extension/threadpool:threadpool", diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh index cb5595fb8b5..a971df35b13 100644 --- a/export_and_run_aoti.sh +++ b/export_and_run_aoti.sh @@ -141,6 +141,8 @@ build_runtime() { -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_LOG_LEVEL=Debug \ -DCMAKE_BUILD_TYPE=Debug \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ .. else echo "Building with release configuration..." @@ -149,6 +151,8 @@ build_runtime() { -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \ -DEXECUTORCH_LOG_LEVEL=Info \ -DCMAKE_BUILD_TYPE=Release \ + -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \ + -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ .. fi @@ -158,7 +162,7 @@ build_runtime() { run_inference() { echo "Running executor_runner with debug logging enabled..." - ./cmake-out/executor_runner --model_path aoti_model.pte + ./cmake-out/executor_runner --model_path aoti_model.pte --data_path aoti_cuda_blob.ptd } compare_outputs() { diff --git a/export_aoti.py b/export_aoti.py index d720086ac0f..8be26d0d258 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -29,10 +29,11 @@ from executorch.exir import to_edge, to_edge_transform_and_lower from torch import nn from torch.export import export +from torch.nn.attention import SDPBackend from torchvision import models from torchvision.models.mobilenetv2 import MobileNet_V2_Weights from torchvision.models.resnet import ResNet18_Weights -from transformers import AutoModelForCausalLM +from transformers import AutoModelForCausalLM, WhisperModel # for maintaing precision of 32-bit float as much as possible @@ -190,6 +191,74 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): return outputs.logits +class Whisper(torch.nn.Module): + def __init__(self, model_name="openai/whisper-tiny"): + super(Whisper, self).__init__() + # 1. 
Load pre-trained Whisper model (tiny version is lightweight) + self.model = WhisperModel.from_pretrained(model_name) + self.model.eval() + + def forward(self, input_features: torch.Tensor): + outputs = self.model.encoder(input_features=input_features) + + # Return both encoder and decoder hidden states for compatibility + return outputs.last_hidden_state + + +class MockConv1d(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv1d( + in_channels=80, + out_channels=384, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + bias=True, + ) + + def forward(self, x): + return self.conv(x) + + +class TransformerBlock(nn.Module): + def __init__(self, embed_dim=256, num_heads=8, ff_dim=1024, dropout=0.1): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + + # Multi-head self-attention + self.self_attn = nn.MultiheadAttention( + embed_dim=embed_dim, num_heads=num_heads, dropout=dropout, batch_first=True + ) + + # Layer normalization layers + self.norm1 = nn.LayerNorm(embed_dim) + self.norm2 = nn.LayerNorm(embed_dim) + + # Feed-forward network + self.ffn = nn.Sequential( + nn.Linear(embed_dim, ff_dim), + nn.ReLU(), + nn.Dropout(dropout), + nn.Linear(ff_dim, embed_dim), + nn.Dropout(dropout), + ) + + def forward(self, x): + # Self-attention block with residual connection + attn_output, _ = self.self_attn(x, x, x) + x = self.norm1(x + attn_output) + + # Feed-forward block with residual connection + ff_output = self.ffn(x) + x = self.norm2(x + ff_output) + + return x + + # Model registry mapping model names to their configurations MODEL_REGISTRY: Dict[str, Dict[str, Any]] = { "mv2": { @@ -246,6 +315,24 @@ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None): "device": "cuda", "description": "Llama 3.1 model with KV cache disabled", }, + "whisper": { + "model_class": Whisper, + "input_shapes": [(1, 80, 3000)], + "device": "cuda", + "description": "OpenAI Whisper ASR model. now is encoder only", + }, + "conv1d": { + "model_class": MockConv1d, + "input_shapes": [(1, 80, 3000)], + "device": "cuda", + "description": "Conv1d layer with 80 input channels, 384 output channels", + }, + "transformer_block": { + "model_class": TransformerBlock, + "input_shapes": [(4, 32, 256)], # batch_size=4, seq_len=32, embed_dim=256 + "device": "cuda", + "description": "Single transformer block with multi-head attention and feed-forward network", + }, } @@ -253,7 +340,7 @@ def get_model_and_inputs( model_name: str, ) -> Tuple[torch.nn.Module, Tuple[torch.Tensor, ...]]: """Get model and example inputs based on model name.""" - + # if model_name not in MODEL_REGISTRY: available_models = ", ".join(MODEL_REGISTRY.keys()) raise ValueError( @@ -281,7 +368,9 @@ def get_model_and_inputs( return model, example_inputs -def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.pte"): +def export_model_to_et_aoti( + model, example_inputs, output_pte_path="aoti_model.pte", output_data_dir=None +): """Export model through the AOTI pipeline.""" all_one_input = tuple( torch.ones_like(example_input) for example_input in example_inputs @@ -309,23 +398,24 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p print(f"Starting export process...") - # 1. torch.export: Defines the program with the ATen operator set. 
print("Step 1: Converting to ATen dialect...") - with torch.no_grad(): - # from torch.export._trace import _export + with torch.nn.attention.sdpa_kernel( + [SDPBackend.MATH] # pyre-fixme[16] + ), torch.no_grad(): + # 1. torch.export: Defines the program with the ATen operator set. aten_dialect = export(model, example_inputs, strict=False) - # print(aten_dialect) - # exit(0) + # print(aten_dialect) + # exit(0) - # 2. to_edge: Make optimizations for Edge devices - # aoti part should be decomposed by the internal torch._inductor.aot_compile - # we should preserve the lowerable part and waiting for aoti backend handle that - # Q: maybe need to turn on fallback_random? + # 2. to_edge: Make optimizations for Edge devices + # aoti part should be decomposed by the internal torch._inductor.aot_compile + # we should preserve the lowerable part and waiting for aoti backend handle that + # Q: maybe need to turn on fallback_random? - edge_program = to_edge_transform_and_lower( - aten_dialect, partitioner=[AotiPartitioner([])] - ) + edge_program = to_edge_transform_and_lower( + aten_dialect, partitioner=[AotiPartitioner([])] + ) # edge_program = to_edge(aten_dialect) @@ -337,11 +427,20 @@ def export_model_to_et_aoti(model, example_inputs, output_filename="aoti_model.p print("To executorch done.") # 4. Save the compiled .pte program - print(f"Step 5: Saving to {output_filename}...") - with open(output_filename, "wb") as file: + if output_data_dir is None: + output_data_dir = os.getcwd() + + print(f"Step 5: Saving pte to {output_pte_path} and ptd to {output_data_dir}") + with open(output_pte_path, "wb") as file: file.write(executorch_program.buffer) - print(f"Export completed successfully! Output saved to {output_filename}") + print(f"size of Named Data: {len(executorch_program._tensor_data)}") + + executorch_program.write_tensor_data_to_file(output_data_dir) + + print( + f"Export completed successfully! 
PTE saved to {output_pte_path} and ptd saved to {output_data_dir}" + ) def export_model_to_pure_aoti(model, example_inputs): From 057f1fad13d3ab92294b7112ab65498cac3c377f Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 10 Sep 2025 00:22:42 -0700 Subject: [PATCH 38/50] use cpu model as input --- backends/aoti/aoti_backend.py | 20 ++++++++++++++++++++ backends/aoti/runtime/aoti_backend.cpp | 2 +- exir/program/_program.py | 2 +- export_aoti.py | 14 +------------- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index 986aa938888..a64bb9c5cc5 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -21,6 +21,7 @@ ) from executorch.exir.backend.compile_spec_schema import CompileSpec from torch._inductor.codegen.cpp_wrapper_cpu import CppWrapperCpu +from torch.export.passes import move_to_device_pass # exist fallback operators in et namespace; @@ -71,14 +72,33 @@ def preprocess( edge_program: ExportedProgram, compile_specs: List[CompileSpec], ) -> PreprocessResult: + print("entering the lowerable parts in AotiBackend.preprocess....") named_data_store = NamedDataStore() # print("here", edge_program.example_inputs) copy_edge_program = copy.deepcopy(edge_program) + + # Move the edge_program from CPU to CUDA using move_to_device_pass + copy_edge_program = move_to_device_pass(copy_edge_program, "cuda") # graph_module = copy_edge_program.graph_module edge_program_module = copy_edge_program.module() args, kwargs = copy_edge_program.example_inputs + + # Deep copy args and move tensors to CUDA for aot_compile + def move_to_cuda(obj): + if isinstance(obj, torch.Tensor): + return obj.cuda() + elif isinstance(obj, (list, tuple)): + return type(obj)(move_to_cuda(item) for item in obj) + elif isinstance(obj, dict): + return {key: move_to_cuda(value) for key, value in obj.items()} + else: + return obj + + args = move_to_cuda(copy.deepcopy(args)) + kwargs = move_to_cuda(copy.deepcopy(kwargs)) + # print("args, kwargs", args, kwargs) print("len(args)", len(args)) print("args[0].shape", args[0].shape) diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 24d935d579e..6ccd099da0f 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -83,7 +83,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { std::string so_blob_key = "so_blob"; Result aoti_cuda_buffer = - named_data_map->get_data(aoti_cuda_blob_name.c_str()); + named_data_map->get_data(so_blob_key.c_str()); // Create a temporary file std::ofstream outfile(so_path.c_str(), std::ios::binary); diff --git a/exir/program/_program.py b/exir/program/_program.py index 760056e32bb..e3ada9301b7 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1707,7 +1707,7 @@ def exported_program_to_device(exported_program, device): execution_programs: Dict[str, ExportedProgram] = {} for name, program in self._edge_programs.items(): - program = exported_program_to_device(program, "cpu") + # program = exported_program_to_device(program, "cpu") if config.do_quant_fusion_and_const_prop: if program.graph_signature.backward_signature is not None: raise Exception( diff --git a/export_aoti.py b/export_aoti.py index 8be26d0d258..e644177568c 100644 --- a/export_aoti.py +++ b/export_aoti.py @@ -264,73 +264,61 @@ def forward(self, x): "mv2": { "model_class": MV2, "input_shapes": [(1, 3, 224, 224)], - "device": "cuda", "description": "MobileNetV2 model", 
},
     "resnet18": {
         "model_class": ResNet18,
         "input_shapes": [(1, 3, 224, 224)],
-        "device": "cuda",
         "description": "ResNet18 model",
     },
     "linear": {
         "model_class": Linear,
         "input_shapes": [(127, 7)],
-        "device": "cuda",
         "description": "Simple linear layer model",
     },
     "conv2d": {
         "model_class": SingleConv2d,
         "input_shapes": [(4, 3, 8, 8)],
-        "device": "cuda",
         "description": "Single Conv2d layer model",
     },
     "depthwise_conv": {
         "model_class": DepthwiseConv,
         "input_shapes": [(1, 32, 112, 112)],
-        "device": "cuda",
         "description": "Single Depthwise Conv2d layer model",
     },
     "add": {
         "model_class": Add,
         "input_shapes": [(10,), (10,)],
-        "device": "cuda",
         "description": "Simple tensor addition model",
     },
     "batchnorm": {
         "model_class": BatchNorm,
         "input_shapes": [(1, 16, 32, 32)],
-        "device": "cuda",
         "description": "Single BatchNorm2d layer model",
     },
     "single_resnet_block": {
         "model_class": SingleResNetBlock,
         "input_shapes": [(1, 64, 8, 8)],
-        "device": "cuda",
         "description": "Single ResNet block with skip connection",
     },
     "llama31": {
         "model_class": Llama31,
         "input_shapes": [(1, 32)],  # batch_size=1, sequence_length=128
-        "device": "cuda",
         "description": "Llama 3.1 model with KV cache disabled",
     },
     "whisper": {
         "model_class": Whisper,
         "input_shapes": [(1, 80, 3000)],
-        "device": "cuda",
         "description": "OpenAI Whisper ASR model (encoder only for now)",
     },
     "conv1d": {
         "model_class": MockConv1d,
         "input_shapes": [(1, 80, 3000)],
-        "device": "cuda",
         "description": "Conv1d layer with 80 input channels, 384 output channels",
     },
     "transformer_block": {
         "model_class": TransformerBlock,
         "input_shapes": [(4, 32, 256)],  # batch_size=4, seq_len=32, embed_dim=256
-        "device": "cuda",
         "description": "Single transformer block with multi-head attention and feed-forward network",
     },
 }
@@ -350,7 +338,7 @@ def get_model_and_inputs(
     model_config = MODEL_REGISTRY[model_name]
     model_class = model_config["model_class"]
     input_shapes = model_config["input_shapes"]
-    device = model_config["device"]
+    device = "cpu"
 
     # Create model instance
     model = model_class().to(device).eval()

From 034359affaf9ae69f97b0f986b86fa00e3205b40 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 10 Sep 2025 21:57:38 -0700
Subject: [PATCH 39/50] remove mis-introduced libtorch header

---
 backends/aoti/runtime/aoti_backend.cpp | 7 -------
 backends/aoti/runtime/shims/memory.cpp | 1 -
 backends/aoti/runtime/shims/memory.h   | 1 -
 backends/aoti/runtime/shims/types.h    | 1 -
 4 files changed, 10 deletions(-)

diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp
index 6ccd099da0f..efb96c2f363 100644
--- a/backends/aoti/runtime/aoti_backend.cpp
+++ b/backends/aoti/runtime/aoti_backend.cpp
@@ -31,9 +31,6 @@
 #include "shims/tensor_attribute.h"
 #include "shims/utils.h"
 
-// Include CUDA AOTI shims
-#include
-
 namespace executorch {
 namespace backends {
 namespace aoti {
@@ -73,10 +70,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface {
       FreeableBuffer* processed, // This will be a empty buffer
       ArrayRef compile_specs // This will be my empty list
       ) const override {
-    // const char* so_path = static_cast(processed->data());
-
-    // printf("so path: %s\n", so_path);
-
     const NamedDataMap* named_data_map = context.get_named_data_map();
 
     std::string so_path = "/tmp/test.so";
diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp
index cbf52932268..afe13fe8616 100644
--- a/backends/aoti/runtime/shims/memory.cpp
+++ b/backends/aoti/runtime/shims/memory.cpp
@@ -61,7 +61,6 @@ bool
is_tensor_channels_last( (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); } -// Version 2: For use with ExecutorTorch tensors (int32_t sizes) // Check if tensor is in contiguous memory format (NCHW for 4D tensors) bool is_tensor_contiguous( int64_t ndim, diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 57058397972..87639d9d8e4 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -8,7 +8,6 @@ #pragma once -#include #include #include #include diff --git a/backends/aoti/runtime/shims/types.h b/backends/aoti/runtime/shims/types.h index 312d05a4d33..27b4394d1b6 100644 --- a/backends/aoti/runtime/shims/types.h +++ b/backends/aoti/runtime/shims/types.h @@ -11,7 +11,6 @@ #include #include #include -#include #include namespace executorch { From bc559a6664726bb2af067499df770406e69bad0b Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 10 Sep 2025 22:14:09 -0700 Subject: [PATCH 40/50] remove unnecessary cuda stream functions --- backends/aoti/runtime/aoti_backend.cpp | 32 +--------------- backends/aoti/runtime/shims/memory.cpp | 43 --------------------- backends/aoti/runtime/shims/memory.h | 53 +++++++++++--------------- 3 files changed, 24 insertions(+), 104 deletions(-) diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index efb96c2f363..6160670042b 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -288,19 +287,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "AOTIBackend output generated"); - // Create a CUDA stream for this execution - cudaStream_t cuda_stream; - cudaError_t stream_err = cudaStreamCreate(&cuda_stream); - if (stream_err != cudaSuccess) { - ET_LOG( - Error, - "Failed to create CUDA stream: %s", - cudaGetErrorString(stream_err)); - return Error::Internal; - } - - ET_LOG(Debug, "Created CUDA stream: %p", cuda_stream); - // Run AOTI container with GPU tensors AOTIRuntimeError error = AOTInductorModelContainerRun( handle->container_handle, @@ -308,7 +294,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { n_inputs, gpu_outputs.data(), // Use GPU output tensors n_outputs, - cuda_stream, // Pass the actual CUDA stream! + nullptr, // Pass the actual CUDA stream! 
nullptr); // proxy_executor_handle can remain nullptr if (error != Error::Ok) { @@ -321,18 +307,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "AOTIBackend running done"); - // Synchronize the CUDA stream to ensure kernels complete - cudaError_t sync_err = cudaStreamSynchronize(cuda_stream); - if (sync_err != cudaSuccess) { - ET_LOG( - Error, - "Failed to synchronize CUDA stream: %s", - cudaGetErrorString(sync_err)); - return Error::Internal; - } - - ET_LOG(Debug, "CUDA stream synchronized"); - // Copy GPU output results back to CPU output tensors for (int i = 0; i < n_outputs; i++) { auto cpu_output_tensor = &(args[i + n_inputs]->toTensor()); @@ -356,10 +330,6 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { aoti_torch_delete_tensor_object(gpu_outputs[i]); } - // Destroy the CUDA stream - cudaStreamDestroy(cuda_stream); - ET_LOG(Debug, "CUDA stream destroyed and GPU tensors cleaned up"); - ET_LOG(Debug, "AOTIBackend execution completed successfully"); return Error::Ok; diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index afe13fe8616..2b03468ddb3 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -639,49 +639,6 @@ AOTITorchError aoti_torch_copy_( return Error::Ok; } -AOTITorchError aoti_torch_create_cuda_stream_guard( - void* stream, - int32_t device_index, - CUDAStreamGuardHandle* ret_guard) { - std::cout << "Entering stream guard for device " << device_index - << " with stream " << stream << std::endl; - - // Set device - cudaError_t err = cudaSetDevice(device_index); - if (err != cudaSuccess) { - std::cerr << "Failed to set device " << device_index << ": " - << cudaGetErrorString(err) << std::endl; - return Error::Internal; - } - - // Create minimal guard structure - CUDAStreamGuardOpaque* guard = new CUDAStreamGuardOpaque(); - guard->device_index = device_index; - guard->original_stream = static_cast(stream); - guard->sync_event = nullptr; - - std::cout << "Stream guard created successfully for stream " << stream - << std::endl; - - *ret_guard = guard; - return Error::Ok; -} - -AOTITorchError aoti_torch_delete_cuda_stream_guard( - CUDAStreamGuardHandle guard) { - std::cout << "Exiting stream guard" << std::endl; - - if (guard == nullptr) { - return Error::Ok; - } - - // Clean up the guard structure - delete guard; - - std::cout << "Stream guard cleanup completed" << std::endl; - return Error::Ok; -} - AOTITorchError aoti_torch__reinterpret_tensor( AOTITensorHandle self, int64_t ndim, diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 87639d9d8e4..e0a83109932 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -26,30 +26,30 @@ extern std::unordered_map is_tensor_own_memory; extern std::unordered_set> tensors; // Memory-related operations -// AOTITorchError aoti_torch_create_tensor_from_blob_v2( -// void* data, -// int64_t ndim, -// const int64_t* sizes_ptr, -// const int64_t* strides_ptr, -// int64_t storage_offset, -// int32_t dtype, -// int32_t device_type, -// int32_t device_index, -// AOTITensorHandle* ret_new_tensor, -// int32_t layout, -// const uint8_t* opaque_metadata, -// int64_t opaque_metadata_size); +AOTITorchError aoti_torch_create_tensor_from_blob_v2( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t 
device_index, + AOTITensorHandle* ret_new_tensor, + int32_t layout, + const uint8_t* opaque_metadata, + int64_t opaque_metadata_size); -// AOTITorchError aoti_torch_create_tensor_from_blob( -// void* data, -// int64_t ndim, -// const int64_t* sizes_ptr, -// const int64_t* strides_ptr, -// int64_t storage_offset, -// int32_t dtype, -// int32_t device_type, -// int32_t device_index, -// AOTITensorHandle* ret_new_tensor); +AOTITorchError aoti_torch_create_tensor_from_blob( + void* data, + int64_t ndim, + const int64_t* sizes_ptr, + const int64_t* strides_ptr, + int64_t storage_offset, + int32_t dtype, + int32_t device_type, + int32_t device_index, + AOTITensorHandle* ret_new_tensor); AOTITorchError aoti_torch_empty_strided( int64_t ndim, @@ -67,13 +67,6 @@ AOTITorchError aoti_torch_copy_( AOTITensorHandle src, int32_t non_blocking); -AOTITorchError aoti_torch_create_cuda_stream_guard( - void* stream, - int32_t device_index, - CUDAStreamGuardHandle* ret_guard); - -AOTITorchError aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard); - AOTITorchError aoti_torch__reinterpret_tensor( AOTITensorHandle self, int64_t ndim, From 490a2b294900c47db710ba566c1450ff034c862e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 11 Sep 2025 15:10:57 -0700 Subject: [PATCH 41/50] remove debug print in c++ --- backends/aoti/runtime/shims/memory.cpp | 205 +++++------------- backends/aoti/runtime/shims/memory.h | 12 +- .../aoti/runtime/shims/tensor_attribute.cpp | 27 +-- .../aoti/runtime/shims/tensor_attribute.h | 3 +- backends/aoti/runtime/shims/types.h | 8 - backends/aoti/runtime/shims/utils.cpp | 22 ++ backends/aoti/runtime/shims/utils.h | 6 + 7 files changed, 97 insertions(+), 186 deletions(-) diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 2b03468ddb3..bf5336e9867 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -18,6 +18,7 @@ #include #include #include "tensor_attribute.h" +#include "utils.h" namespace executorch { namespace backends { @@ -25,6 +26,20 @@ namespace aoti { namespace { // Internal namespace for utility functions +// Utility function to print array values in format [val1, val2, ...] +// For use with pointer-based arrays (e.g., int64_t* strides, int64_t* sizes) +template +void print_array_values( + const ValueType* values, + int64_t count, + const std::string& name = "values") { + std::cout << name << ": ["; + for (int i = 0; i < count; i++) { + std::cout << values[i] << (i < count - 1 ? 
", " : ""); + } + std::cout << "]" << std::endl; +} + // Version 1: For use with int64_t sizes (e.g., from blob creation functions) // Check if tensor is in contiguous memory format (NCHW for 4D tensors) // Contiguous format means strides decrease from left to right: @@ -61,7 +76,8 @@ bool is_tensor_channels_last( (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); } -// Check if tensor is in contiguous memory format (NCHW for 4D tensors) +// Check if tensor is in contiguous memory format (NCHW for 4D tensors) for +// int32_t sizes bool is_tensor_contiguous( int64_t ndim, const int32_t* sizes, @@ -114,29 +130,22 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( int32_t layout, const uint8_t* opaque_metadata, int64_t opaque_metadata_size) { - std::cout << "Creating tensor from data blob " << data << " - ndim: " << ndim - << ", dtype: " << dtype << ", device_type: " << device_type - << ", storage_offset: " << storage_offset << std::endl; - // Only float32 tensors are supported - if (dtype != 6) { // 6 = float32 - std::cout << "ERROR: Only float32 tensors are supported. Got dtype: " - << dtype << " (expected: 6 for float32)" << std::endl; - return Error::InvalidArgument; + AOTITorchError dtype_error = validate_dtype(dtype); + if (dtype_error != Error::Ok) { + return dtype_error; } // Storage offset must always be 0 - if (storage_offset != 0) { - std::cout << "ERROR: Storage offset must be 0. Got storage_offset: " - << storage_offset << std::endl; - return Error::InvalidArgument; + AOTITorchError storage_offset_error = validate_storage_offset(storage_offset); + if (storage_offset_error != Error::Ok) { + return storage_offset_error; } // Convert sizes to the format expected by ExecutorTorch std::vector sizes(ndim); for (int i = 0; i < ndim; i++) { sizes[i] = static_cast(sizes_ptr[i]); - std::cout << "Size[" << i << "] = " << sizes[i] << std::endl; } // check the tensor format @@ -168,28 +177,11 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( tensors.insert(tensor); *ret_new_tensor = tensor.get(); - is_tensor_own_memory[tensor.get()] = false; - std::cout << "Successfully created tensor from blob: " << tensor.get() - << " wrapping data at: " << adjusted_data << std::endl; - return Error::Ok; } -AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor) { - throw std::runtime_error("Should never create from blob"); -} - AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, @@ -205,14 +197,14 @@ AOTITorchError aoti_torch_empty_strided( numel *= sizes_ptr[i]; } - if (dtype != 6) { // throw if not float32 - throw std::runtime_error("Need to implement empty_strided for non-float32"); + AOTITorchError dtype_error = validate_dtype(dtype); + if (dtype_error != Error::Ok) { + return dtype_error; } int64_t nbytes = numel * 4; if (device_type == 1) { // cuda - std::cout << "Allocating " << nbytes << " bytes on CUDA " << std::endl; cudaError_t err = cudaMalloc(&ptr, nbytes); if (err != cudaSuccess) { std::cout << "failed to allocate " << nbytes @@ -220,8 +212,8 @@ AOTITorchError aoti_torch_empty_strided( throw std::runtime_error("Failed to call cudaMalloc"); } } else if (device_type == 0) { // cpu - std::cout << "Allocating " << nbytes << " bytes on CPU " << std::endl; // Ensure 16-byte alignment for CPU memory to match CUDA requirements + // do we 
need to do this in cuda backend? int result = posix_memalign(&ptr, 16, nbytes); if (result != 0) { throw std::runtime_error("Failed to allocate aligned CPU memory"); @@ -233,8 +225,6 @@ AOTITorchError aoti_torch_empty_strided( throw std::runtime_error( "Need to implement empty_strided for non-CUDA non-CPU"); } - std::cout << "////Allocated " << nbytes << " bytes at " << ptr - << ", sizes_ptr " << sizes_ptr << std::endl; // ETensor sizes std::vector sizes(ndim); @@ -242,13 +232,6 @@ AOTITorchError aoti_torch_empty_strided( sizes[i] = sizes_ptr[i]; } - std::cout << "Sizes: "; - for (int i = 0; i < ndim; i++) { - std::cout << sizes[i] << ", "; - } - - std::cout << std::endl; - // ETensor strides std::vector strides(ndim); if (strides_ptr != nullptr) { @@ -263,7 +246,6 @@ AOTITorchError aoti_torch_empty_strided( strides[i] = strides[i + 1] * sizes_ptr[i + 1]; } } - std::cout << std::endl; // ETensor creation auto tensor = executorch::extension::from_blob(ptr, sizes, strides); @@ -273,22 +255,10 @@ AOTITorchError aoti_torch_empty_strided( *ret_new_tensor = tensor.get(); is_tensor_own_memory[tensor.get()] = true; - std::cout << "Finished. Created tensor " << tensor.get() << " with sizes " - << std::endl - << "sizes.data(): " << sizes.data() - << ", tensor->sizes().data(): " << tensor->sizes().data() - << std::endl; - std::cout << "Size[0] of tensor " << tensor.get() << " is " - << tensor->sizes()[0] << std::endl - << std::endl; - return Error::Ok; } AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { - std::cout << "Called aoti_torch_delete_tensor_object for tensor " << tensor - << std::endl; - // Check ownership before cleaning up metadata auto ownership_it = is_tensor_own_memory.find(tensor); bool owns_memory = (ownership_it != is_tensor_own_memory.end()) @@ -301,8 +271,7 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { is_tensor_own_memory.erase(tensor); if (!owns_memory) { - std::cout << "Tensor " << tensor << " does not own memory. Skipped \n\n" - << std::endl; + // Don't free memory since the tensor doesn't own it return Error::Ok; } @@ -320,26 +289,16 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { // et tensor does not own data; need to free them manually. if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) { // This is GPU memory - free with proper synchronization - std::cout << "Freeing GPU memory at " << data_ptr << std::endl; cudaDeviceSynchronize(); // Wait for all operations to complete BEFORE // freeing cudaFree(data_ptr); - std::cout << "GPU memory freed successfully" << std::endl; } else { // This is CPU memory - free immediately - std::cout << "Freeing CPU memory at " << data_ptr << std::endl; free(data_ptr); - std::cout << "CPU memory freed successfully" << std::endl; } - - std::cout << "Memory freed. Now erasing tensor " << tensor << std::endl; - // Remove from set (this will call the destructor if it's the last // reference) tensors.erase(it); - - std::cout << "Tensor erased. Now returning \n\n" << std::endl; - return Error::Ok; } } @@ -374,10 +333,14 @@ AOTITorchError aoti_torch_copy_( aoti_torch_get_dtype(self, &self_dtype); aoti_torch_get_dtype(src, &src_dtype); - if (self_dtype != 6 || src_dtype != 6) { // 6 = float32 - std::cout << "Error: Only float32 tensors supported. 
Got self.dtype=" - << self_dtype << ", src.dtype=" << src_dtype << std::endl; - return Error::InvalidArgument; + AOTITorchError self_dtype_error = validate_dtype(self_dtype); + if (self_dtype_error != Error::Ok) { + return self_dtype_error; + } + + AOTITorchError src_dtype_error = validate_dtype(src_dtype); + if (src_dtype_error != Error::Ok) { + return src_dtype_error; } // Get stride information for layout validation @@ -386,8 +349,10 @@ AOTITorchError aoti_torch_copy_( aoti_torch_get_strides(self, &self_strides); aoti_torch_get_strides(src, &src_strides); - auto self_sizes = self->sizes(); - auto src_sizes = src->sizes(); + int64_t* self_sizes; + int64_t* src_sizes; + aoti_torch_get_sizes(self, &self_sizes); + aoti_torch_get_sizes(src, &src_sizes); // Check if tensors have the same tensor schema (sizes, strides, dtype) bool same_schema = true; @@ -416,66 +381,46 @@ AOTITorchError aoti_torch_copy_( bool self_is_channels_last = false; bool src_is_channels_last = false; - if (same_schema) { - std::cout << "Same tensor schema detected - enabling naive copy" - << std::endl; - // For same schema, we don't need to check memory formats - just use direct - // copy - } else { + // For same schema, we don't need to check memory formats - just use direct + // copy + if (!same_schema) { // Different strides: check memory format and only support contiguous <-> // channels-last conversion - std::cout - << "Different tensor schemas - checking memory format compatibility" - << std::endl; // Check if contiguous (strides decrease from left to right) self_is_contiguous = - is_tensor_contiguous(self->dim(), self_sizes.data(), self_strides); + is_tensor_contiguous(self->dim(), self_sizes, self_strides); src_is_contiguous = - is_tensor_contiguous(src->dim(), src_sizes.data(), src_strides); + is_tensor_contiguous(src->dim(), src_sizes, src_strides); // Check if channels-last (4D: NHWC format) if (!self_is_contiguous) { self_is_channels_last = - is_tensor_channels_last(self->dim(), self_sizes.data(), self_strides); + is_tensor_channels_last(self->dim(), self_sizes, self_strides); } if (!src_is_contiguous) { src_is_channels_last = - is_tensor_channels_last(src->dim(), src_sizes.data(), src_strides); + is_tensor_channels_last(src->dim(), src_sizes, src_strides); } // Validate layout assumptions only when schemas differ if (!self_is_contiguous && !self_is_channels_last) { std::cout << "Error: self tensor must be contiguous or channels-last for stride conversion. " - << "Got strides: ["; - for (int i = 0; i < self->dim(); i++) { - std::cout << self_strides[i] << (i < self->dim() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << "self_sizes: ["; - for (int i = 0; i < self->dim(); i++) { - std::cout << self_sizes[i] << (i < self->dim() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; + << std::endl; + print_array_values(self_strides, self->dim(), "self strides"); + print_array_values(self_sizes, self->dim(), "self_sizes"); return Error::InvalidArgument; } if (!src_is_contiguous && !src_is_channels_last) { std::cout - << "Error: src tensor must be contiguous or channels-last for stride conversion. \n" - << "Got strides: ["; - for (int i = 0; i < src->dim(); i++) { - std::cout << src_strides[i] << (i < src->dim() - 1 ? ", " : ""); - } - std::cout << "]" << std::endl; - std::cout << "src_sizes: ["; - for (int i = 0; i < self->dim(); i++) { - std::cout << src_sizes[i] << (i < self->dim() - 1 ? 
", " : ""); - } - std::cout << "]" << std::endl; + << "Error: src tensor must be contiguous or channels-last for stride conversion." + << std::endl; + print_array_values(src_strides, src->dim(), "self strides"); + print_array_values(src_sizes, src->dim(), "src_sizes"); return Error::InvalidArgument; } } @@ -493,18 +438,9 @@ AOTITorchError aoti_torch_copy_( bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; - std::cout << "Copy layout: src=" - << (src_is_contiguous ? "contiguous" : "channels-last") << " (" - << (srcIsDevice ? "GPU" : "CPU") << ") -> " - << "dst=" << (self_is_contiguous ? "contiguous" : "channels-last") - << " (" << (dstIsDevice ? "GPU" : "CPU") << ")" << std::endl; - size_t total_bytes = src->nbytes(); if (same_schema) { - std::cout << "Same layout - doing direct copy of " << total_bytes - << " bytes" << std::endl; - // Simple copy since layouts match if (srcIsDevice && dstIsDevice) { err = cudaMemcpy( @@ -646,23 +582,10 @@ AOTITorchError aoti_torch__reinterpret_tensor( const int64_t* strides_ptr, int64_t storage_offset, AOTITensorHandle* ret_new_tensor) { - std::cout << "aoti_torch__reinterpret_tensor called with tensor " << self - << ", ndim: " << ndim << ", storage_offset: " << storage_offset - << std::endl; - - for (int i = 0; i < ndim; i++) { - std::cout << "sizes[" << i << "]: " << sizes_ptr[i] << std::endl; - } - for (int i = 0; i < ndim; i++) { - std::cout << "strides[" << i << "]: " << strides_ptr[i] << std::endl; - } - // Check if storage_offset is not 0 - return error if not - if (storage_offset != 0) { - std::cout - << "Error: aoti_torch__reinterpret_tensor does not support non-zero storage_offset: " - << storage_offset << std::endl; - return Error::InvalidArgument; + AOTITorchError storage_offset_error = validate_storage_offset(storage_offset); + if (storage_offset_error != Error::Ok) { + return storage_offset_error; } // Check if dimensions match @@ -680,13 +603,6 @@ AOTITorchError aoti_torch__reinterpret_tensor( return dtype_err; } - if (dtype != 6) { // 6 = float32 - std::cout - << "ERROR: Only float32 tensors are supported in reinterpret_tensor. 
Got dtype: " - << dtype << " (expected: 6 for float32)" << std::endl; - return Error::InvalidArgument; - } - int32_t device_type; AOTITorchError device_type_err = aoti_torch_get_device_type(self, &device_type); @@ -705,10 +621,6 @@ AOTITorchError aoti_torch__reinterpret_tensor( return device_index_err; } - std::cout << "Creating new tensor with dtype: " << dtype - << ", device_type: " << device_type - << ", device_index: " << device_index << std::endl; - // Create new tensor with the provided sizes and strides using // aoti_torch_empty_strided AOTITorchError create_err = aoti_torch_empty_strided( @@ -737,9 +649,6 @@ AOTITorchError aoti_torch__reinterpret_tensor( return copy_err; } - std::cout << "Successfully created reinterpreted tensor " << *ret_new_tensor - << " from source tensor " << self << std::endl; - return Error::Ok; } diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index e0a83109932..37c5a5796f5 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -40,17 +41,6 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( const uint8_t* opaque_metadata, int64_t opaque_metadata_size); -AOTITorchError aoti_torch_create_tensor_from_blob( - void* data, - int64_t ndim, - const int64_t* sizes_ptr, - const int64_t* strides_ptr, - int64_t storage_offset, - int32_t dtype, - int32_t device_type, - int32_t device_index, - AOTITensorHandle* ret_new_tensor); - AOTITorchError aoti_torch_empty_strided( int64_t ndim, const int64_t* sizes_ptr, diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index 8e0097cd8bd..955beebd0ed 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -8,6 +8,7 @@ #include "tensor_attribute.h" #include +#include "utils.h" namespace executorch { namespace backends { @@ -44,10 +45,10 @@ AOTITorchError aoti_torch_get_storage_offset( *ret_storage_offset = 0; // ASSERTION: Storage offset must always be 0 - if (*ret_storage_offset != 0) { - std::cout << "ERROR: Storage offset must be 0. Got storage_offset: " - << *ret_storage_offset << std::endl; - return Error::InvalidArgument; + AOTITorchError storage_offset_error = + validate_storage_offset(*ret_storage_offset); + if (storage_offset_error != Error::Ok) { + return storage_offset_error; } return Error::Ok; @@ -66,11 +67,7 @@ AOTITorchError aoti_torch_get_strides( it = tensor_to_strides.emplace(tensor, std::move(strides)).first; } *ret_strides = it->second.data(); - std::cout << "getting strides from tensor " << tensor << " with dim " - << tensor->dim() << std::endl; - for (int i = 0; i < tensor->dim(); i++) { - std::cout << "strides " << i << " = " << (*ret_strides)[i] << std::endl; - } + return Error::Ok; } @@ -80,10 +77,9 @@ AOTITorchError aoti_torch_get_dtype( *ret_dtype = static_cast(tensor->scalar_type()); // ASSERTION: Only float32 tensors are supported - if (*ret_dtype != 6) { // 6 = float32 - std::cout << "ERROR: Only float32 tensors are supported. 
Got dtype: " - << *ret_dtype << " (expected: 6 for float32)" << std::endl; - return Error::InvalidArgument; + AOTITorchError dtype_error = validate_dtype(*ret_dtype); + if (dtype_error != Error::Ok) { + return dtype_error; } return Error::Ok; @@ -102,11 +98,6 @@ AOTITorchError aoti_torch_get_sizes( it = tensor_to_sizes.emplace(tensor, std::move(sizes)).first; } *ret_sizes = it->second.data(); - std::cout << "getting sizes from tensor " << tensor << " with dim " - << tensor->dim() << std::endl; - for (int i = 0; i < tensor->dim(); i++) { - std::cout << "size " << i << " = " << (*ret_sizes)[i] << std::endl; - } return Error::Ok; } diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/runtime/shims/tensor_attribute.h index 387056a30fd..20ea3d487a0 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/runtime/shims/tensor_attribute.h @@ -8,9 +8,10 @@ #pragma once -#include "types.h" +#include #include #include +#include "types.h" namespace executorch { namespace backends { diff --git a/backends/aoti/runtime/shims/types.h b/backends/aoti/runtime/shims/types.h index 27b4394d1b6..1bcae2058ca 100644 --- a/backends/aoti/runtime/shims/types.h +++ b/backends/aoti/runtime/shims/types.h @@ -29,14 +29,6 @@ using AOTITensorHandle = Tensor*; using AOTIRuntimeError = Error; using AOTITorchError = Error; -// CUDA-specific types -struct CUDAStreamGuardOpaque { - cudaStream_t original_stream; - int device_index; - cudaEvent_t sync_event; -}; -using CUDAStreamGuardHandle = CUDAStreamGuardOpaque*; - } // extern "C" } // namespace aoti diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp index a9dc5c84eb7..441cd719fa9 100644 --- a/backends/aoti/runtime/shims/utils.cpp +++ b/backends/aoti/runtime/shims/utils.cpp @@ -195,6 +195,28 @@ void cleanup_aoti_tensor_output() { // No cleanup needed since file is opened and closed on each call } +// Dtype validation utility function +AOTITorchError validate_dtype(int32_t dtype) { + // Only float32 tensors are supported (dtype 6) + if (dtype != 6) { + std::cout << "ERROR: Only float32 tensors are supported. Got dtype: " + << dtype << " (expected: 6 for float32)" << std::endl; + return Error::InvalidArgument; + } + return Error::Ok; +} + +// Storage offset validation utility function +AOTITorchError validate_storage_offset(int64_t storage_offset) { + // Storage offset must always be 0 + if (storage_offset != 0) { + std::cout << "ERROR: Storage offset must be 0. 
Got storage_offset: " + << storage_offset << std::endl; + return Error::InvalidArgument; + } + return Error::Ok; +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h index 06d2edce212..630bfa3d74c 100644 --- a/backends/aoti/runtime/shims/utils.h +++ b/backends/aoti/runtime/shims/utils.h @@ -24,6 +24,12 @@ extern "C" { // Cleanup function for tensor output file (called during backend destruction) void cleanup_aoti_tensor_output(); +// Dtype validation utility function +AOTITorchError validate_dtype(int32_t dtype); + +// Storage offset validation utility function +AOTITorchError validate_storage_offset(int64_t storage_offset); + } // extern "C" } // namespace aoti From 558d0c2ae06f29273bb9f43709a4e44251a46d5a Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 11 Sep 2025 17:12:54 -0700 Subject: [PATCH 42/50] remove debug print in c++ - 2 --- backends/aoti/runtime/shims/memory.cpp | 57 +------------------ .../aoti/runtime/shims/tensor_attribute.cpp | 18 +----- 2 files changed, 4 insertions(+), 71 deletions(-) diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index bf5336e9867..62f08ba8444 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -40,7 +40,6 @@ void print_array_values( std::cout << "]" << std::endl; } -// Version 1: For use with int64_t sizes (e.g., from blob creation functions) // Check if tensor is in contiguous memory format (NCHW for 4D tensors) // Contiguous format means strides decrease from left to right: // For NCHW: strides = [C*H*W, H*W, W, 1] @@ -76,39 +75,6 @@ bool is_tensor_channels_last( (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); } -// Check if tensor is in contiguous memory format (NCHW for 4D tensors) for -// int32_t sizes -bool is_tensor_contiguous( - int64_t ndim, - const int32_t* sizes, - const int64_t* strides) { - int64_t expected_stride = 1; - for (int i = ndim - 1; i >= 0; i--) { - if (strides[i] != expected_stride) { - return false; - } - expected_stride *= sizes[i]; - } - return true; -} - -// Check if tensor is in channels-last format (NHWC for 4D tensors) -bool is_tensor_channels_last( - int64_t ndim, - const int32_t* sizes, - const int64_t* strides) { - if (ndim != 4) { - return false; // Channels-last only defined for 4D tensors - } - - int64_t N = sizes[0], C = sizes[1], H = sizes[2], W = sizes[3]; - - // Check NHWC format: strides = [H*W*C, 1, W*C, C] - // Handle edge cases where dimensions might be 1 - return (strides[0] == H * W * C || N <= 1) && (strides[1] == 1 || C <= 1) && - (strides[2] == W * C || H <= 1) && (strides[3] == C || W <= 1); -} - } // anonymous namespace // Global storage for tensors and their metadata @@ -318,9 +284,6 @@ AOTITorchError aoti_torch_copy_( AOTITensorHandle self, AOTITensorHandle src, int32_t non_blocking) { - std::cout << "aoti_torch_copy_ called: self=" << self << ", src=" << src - << std::endl; - // assert same dim for now if (self->dim() != src->dim()) { std::cout << "Error: dimension mismatch. 
self.dim()=" << self->dim() @@ -357,24 +320,14 @@ AOTITorchError aoti_torch_copy_( // Check if tensors have the same tensor schema (sizes, strides, dtype) bool same_schema = true; - // Check sizes match + // Check schema match for (int i = 0; i < self->dim(); i++) { - if (self_sizes[i] != src_sizes[i]) { + if (self_sizes[i] != src_sizes[i] || self_strides[i] != src_strides[i]) { same_schema = false; break; } } - // Check strides match (only if sizes match) - if (same_schema) { - for (int i = 0; i < self->dim(); i++) { - if (self_strides[i] != src_strides[i]) { - same_schema = false; - break; - } - } - } - // Declare layout variables for both cases bool self_is_contiguous = true; bool src_is_contiguous = true; @@ -468,8 +421,6 @@ AOTITorchError aoti_torch_copy_( } } else { // Layout conversion needed (contiguous <-> channels-last) - std::cout << "Layout conversion needed - doing element-wise copy" - << std::endl; if (self->dim() != 4) { std::cout << "Error: Layout conversion only supported for 4D tensors" @@ -568,10 +519,6 @@ AOTITorchError aoti_torch_copy_( dst_first = static_cast(self->data_ptr())[0]; } - std::cout << "Copy verification: src[0]=" << src_first - << ", dst[0]=" << dst_first << std::endl; - std::cout << "aoti_torch_copy_ completed successfully" << std::endl; - return Error::Ok; } diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index 955beebd0ed..57a3805100f 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -44,13 +44,6 @@ AOTITorchError aoti_torch_get_storage_offset( // Storage offset is always 0 in ET *ret_storage_offset = 0; - // ASSERTION: Storage offset must always be 0 - AOTITorchError storage_offset_error = - validate_storage_offset(*ret_storage_offset); - if (storage_offset_error != Error::Ok) { - return storage_offset_error; - } - return Error::Ok; } @@ -110,10 +103,8 @@ AOTITorchError aoti_torch_get_storage_size( AOTITorchError aoti_torch_get_device_type( AOTITensorHandle tensor, int32_t* ret_device_type) { - // Let's assume all tensors AOTI using are on CUDA device + // All tensors in aoti-cuda delegate are on CUDA *ret_device_type = aoti_torch_device_type_cuda(); - std::cout << "getting device_type from tensor " << tensor << " = " - << *ret_device_type << std::endl; return Error::Ok; } @@ -122,15 +113,11 @@ AOTITorchError aoti_torch_get_device_index( int32_t* ret_device_index) { // Let's assume all tensors AOTI using are on CUDA:0 *ret_device_index = 0; - std::cout << "getting device_index from tensor " << tensor << " = " - << *ret_device_index << std::endl; return Error::Ok; } AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim) { - *ret_dim = tensor->dim(); - std::cout << "getting dim from tensor " << tensor << " = " << *ret_dim - << std::endl; + *ret_dim = static_cast(tensor->dim()); return Error::Ok; } @@ -152,7 +139,6 @@ aoti_torch_device_type_cuda() { } __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() { - // Let assume the dtype here is all we will support return 6; } From 5609a5d499cf4945dff083607f53832df1f89d04 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Thu, 11 Sep 2025 17:34:37 -0700 Subject: [PATCH 43/50] use et_log and et::error for err and its msg --- backends/aoti/runtime/shims/memory.cpp | 173 ++++++++++++++++--------- backends/aoti/runtime/shims/memory.h | 2 +- backends/aoti/runtime/shims/utils.cpp | 13 +- 3 files changed, 124 insertions(+), 64 deletions(-) 
diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index 62f08ba8444..ebc6a0012a0 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -7,6 +7,7 @@ */ #include "memory.h" +#include #include #include #include @@ -26,18 +27,28 @@ namespace aoti { namespace { // Internal namespace for utility functions -// Utility function to print array values in format [val1, val2, ...] +// Utility function to log array values as error msg in format [val1, val2, ...] // For use with pointer-based arrays (e.g., int64_t* strides, int64_t* sizes) -template -void print_array_values( - const ValueType* values, +void et_error_log_array_values( + const int64_t* values, int64_t count, const std::string& name = "values") { - std::cout << name << ": ["; - for (int i = 0; i < count; i++) { - std::cout << values[i] << (i < count - 1 ? ", " : ""); + if (count <= 0) { + ET_LOG(Error, "%s: empty array", name.c_str()); + return; } - std::cout << "]" << std::endl; + + // Build array string representation + std::string array_str = "["; + for (int64_t i = 0; i < count; i++) { + array_str += std::to_string(values[i]); + if (i < count - 1) { + array_str += ", "; + } + } + array_str += "]"; + + ET_LOG(Error, "%s: %s", name.c_str(), array_str.c_str()); } // Check if tensor is in contiguous memory format (NCHW for 4D tensors) @@ -117,9 +128,9 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( // check the tensor format // Only support contiguous format for now if (!is_tensor_contiguous(ndim, sizes_ptr, strides_ptr)) { - std::cout - << "aoti_torch_create_tensor_from_blob_v2 failed since input stride is not in contiguous format. Return with Error" - << std::endl; + ET_LOG( + Error, + "aoti_torch_create_tensor_from_blob_v2 failed since input stride is not in contiguous format"); return Error::InvalidArgument; } @@ -135,7 +146,7 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( ); if (!tensor) { - std::cerr << "Failed to create tensor from blob" << std::endl; + ET_LOG(Error, "Failed to create tensor from blob"); return Error::InvalidArgument; } @@ -173,23 +184,31 @@ AOTITorchError aoti_torch_empty_strided( if (device_type == 1) { // cuda cudaError_t err = cudaMalloc(&ptr, nbytes); if (err != cudaSuccess) { - std::cout << "failed to allocate " << nbytes - << " error: " << cudaGetErrorString(err) << std::endl; - throw std::runtime_error("Failed to call cudaMalloc"); + ET_LOG( + Error, + "failed to allocate %ld bytes: %s", + nbytes, + cudaGetErrorString(err)); + return Error::MemoryAllocationFailed; } } else if (device_type == 0) { // cpu // Ensure 16-byte alignment for CPU memory to match CUDA requirements // do we need to do this in cuda backend? 
int result = posix_memalign(&ptr, 16, nbytes); if (result != 0) { - throw std::runtime_error("Failed to allocate aligned CPU memory"); + ET_LOG(Error, "Failed to allocate aligned CPU memory"); + return Error::MemoryAllocationFailed; } if (ptr == nullptr) { - throw std::runtime_error("Failed to call posix_memalign"); + ET_LOG(Error, "Failed to call posix_memalign"); + return Error::MemoryAllocationFailed; } } else { - throw std::runtime_error( - "Need to implement empty_strided for non-CUDA non-CPU"); + ET_LOG( + Error, + "Need to implement empty_strided for non-CUDA non-CPU device type %d", + device_type); + return Error::NotImplemented; } // ETensor sizes @@ -268,16 +287,16 @@ AOTITorchError aoti_torch_delete_tensor_object(AOTITensorHandle tensor) { return Error::Ok; } } - std::cout << "Error: Didn't find tensor " << tensor << std::endl; + ET_LOG(Error, "Didn't find tensor %p", tensor); return Error::InvalidArgument; } -void checkCudaError(cudaError_t err, const char* msg) { +AOTITorchError checkCudaError(cudaError_t err, const char* msg) { if (err != cudaSuccess) { - std::cerr << "Error: " << msg << " (" << cudaGetErrorString(err) << ")" - << std::endl; - exit(EXIT_FAILURE); + ET_LOG(Error, "%s (%s)", msg, cudaGetErrorString(err)); + return Error::Internal; } + return Error::Ok; } AOTITorchError aoti_torch_copy_( @@ -286,8 +305,11 @@ AOTITorchError aoti_torch_copy_( int32_t non_blocking) { // assert same dim for now if (self->dim() != src->dim()) { - std::cout << "Error: dimension mismatch. self.dim()=" << self->dim() - << ", src.dim()=" << src->dim() << std::endl; + ET_LOG( + Error, + "dimension mismatch. self.dim()=%d, src.dim()=%d", + self->dim(), + src->dim()); return Error::InvalidArgument; } @@ -360,20 +382,20 @@ AOTITorchError aoti_torch_copy_( // Validate layout assumptions only when schemas differ if (!self_is_contiguous && !self_is_channels_last) { - std::cout - << "Error: self tensor must be contiguous or channels-last for stride conversion. " - << std::endl; - print_array_values(self_strides, self->dim(), "self strides"); - print_array_values(self_sizes, self->dim(), "self_sizes"); + ET_LOG( + Error, + "self tensor must be contiguous or channels-last for stride conversion"); + et_error_log_array_values(self_strides, self->dim(), "self strides"); + et_error_log_array_values(self_sizes, self->dim(), "self_sizes"); return Error::InvalidArgument; } if (!src_is_contiguous && !src_is_channels_last) { - std::cout - << "Error: src tensor must be contiguous or channels-last for stride conversion." 
- << std::endl; - print_array_values(src_strides, src->dim(), "self strides"); - print_array_values(src_sizes, src->dim(), "src_sizes"); + ET_LOG( + Error, + "src tensor must be contiguous or channels-last for stride conversion"); + et_error_log_array_values(src_strides, src->dim(), "self strides"); + et_error_log_array_values(src_sizes, src->dim(), "src_sizes"); return Error::InvalidArgument; } } @@ -383,10 +405,18 @@ AOTITorchError aoti_torch_copy_( cudaError_t err; err = cudaPointerGetAttributes(&srcAttributes, src->data_ptr()); - checkCudaError(err, "Failed to get source pointer attributes"); + AOTITorchError cuda_err = + checkCudaError(err, "Failed to get source pointer attributes"); + if (cuda_err != Error::Ok) { + return cuda_err; + } err = cudaPointerGetAttributes(&dstAttributes, self->data_ptr()); - checkCudaError(err, "Failed to get destination pointer attributes"); + cuda_err = + checkCudaError(err, "Failed to get destination pointer attributes"); + if (cuda_err != Error::Ok) { + return cuda_err; + } bool srcIsDevice = srcAttributes.type == cudaMemoryTypeDevice; bool dstIsDevice = dstAttributes.type == cudaMemoryTypeDevice; @@ -401,21 +431,30 @@ AOTITorchError aoti_torch_copy_( src->data_ptr(), total_bytes, cudaMemcpyDeviceToDevice); - checkCudaError(err, "Failed to copy from device to device"); + cuda_err = checkCudaError(err, "Failed to copy from device to device"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else if (srcIsDevice && !dstIsDevice) { err = cudaMemcpy( self->mutable_data_ptr(), src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost); - checkCudaError(err, "Failed to copy from device to host"); + cuda_err = checkCudaError(err, "Failed to copy from device to host"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else if (!srcIsDevice && dstIsDevice) { err = cudaMemcpy( self->mutable_data_ptr(), src->data_ptr(), total_bytes, cudaMemcpyHostToDevice); - checkCudaError(err, "Failed to copy from host to device"); + cuda_err = checkCudaError(err, "Failed to copy from host to device"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else { std::memcpy(self->mutable_data_ptr(), src->data_ptr(), total_bytes); } @@ -423,8 +462,7 @@ AOTITorchError aoti_torch_copy_( // Layout conversion needed (contiguous <-> channels-last) if (self->dim() != 4) { - std::cout << "Error: Layout conversion only supported for 4D tensors" - << std::endl; + ET_LOG(Error, "Layout conversion only supported for 4D tensors"); return Error::NotImplemented; } @@ -439,7 +477,11 @@ AOTITorchError aoti_torch_copy_( src_host_data = new float[total_elements]; err = cudaMemcpy( src_host_data, src->data_ptr(), total_bytes, cudaMemcpyDeviceToHost); - checkCudaError(err, "Failed to copy src to host"); + cuda_err = checkCudaError(err, "Failed to copy src to host"); + if (cuda_err != Error::Ok) { + delete[] src_host_data; + return cuda_err; + } need_free_src = true; } else { src_host_data = static_cast(src->data_ptr()); @@ -491,7 +533,15 @@ AOTITorchError aoti_torch_copy_( dst_host_data, total_bytes, cudaMemcpyHostToDevice); - checkCudaError(err, "Failed to copy result to device"); + cuda_err = checkCudaError(err, "Failed to copy result to device"); + if (cuda_err != Error::Ok) { + // Clean up temporary buffers before returning + if (need_free_src) + delete[] src_host_data; + if (need_free_dst) + delete[] dst_host_data; + return cuda_err; + } } // Clean up temporary buffers @@ -506,7 +556,10 @@ AOTITorchError aoti_torch_copy_( if (srcIsDevice) { err = cudaMemcpy( &src_first, 
src->data_ptr(), sizeof(float), cudaMemcpyDeviceToHost); - checkCudaError(err, "Failed to copy first src element"); + cuda_err = checkCudaError(err, "Failed to copy first src element"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else { src_first = static_cast(src->data_ptr())[0]; } @@ -514,7 +567,10 @@ AOTITorchError aoti_torch_copy_( if (dstIsDevice) { err = cudaMemcpy( &dst_first, self->data_ptr(), sizeof(float), cudaMemcpyDeviceToHost); - checkCudaError(err, "Failed to copy first dst element"); + cuda_err = checkCudaError(err, "Failed to copy first dst element"); + if (cuda_err != Error::Ok) { + return cuda_err; + } } else { dst_first = static_cast(self->data_ptr())[0]; } @@ -537,8 +593,11 @@ AOTITorchError aoti_torch__reinterpret_tensor( // Check if dimensions match if (self->dim() != ndim) { - std::cout << "Error: tensor dimension mismatch. self->dim(): " - << self->dim() << ", provided ndim: " << ndim << std::endl; + ET_LOG( + Error, + "tensor dimension mismatch. self->dim(): %d, provided ndim: %ld", + self->dim(), + ndim); return Error::InvalidArgument; } @@ -546,7 +605,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( int32_t dtype; AOTITorchError dtype_err = aoti_torch_get_dtype(self, &dtype); if (dtype_err != Error::Ok) { - std::cout << "Error: failed to get dtype from input tensor" << std::endl; + ET_LOG(Error, "failed to get dtype from input tensor"); return dtype_err; } @@ -554,8 +613,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( AOTITorchError device_type_err = aoti_torch_get_device_type(self, &device_type); if (device_type_err != Error::Ok) { - std::cout << "Error: failed to get device_type from input tensor" - << std::endl; + ET_LOG(Error, "failed to get device_type from input tensor"); return device_type_err; } @@ -563,8 +621,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( AOTITorchError device_index_err = aoti_torch_get_device_index(self, &device_index); if (device_index_err != Error::Ok) { - std::cout << "Error: failed to get device_index from input tensor" - << std::endl; + ET_LOG(Error, "failed to get device_index from input tensor"); return device_index_err; } @@ -580,16 +637,14 @@ AOTITorchError aoti_torch__reinterpret_tensor( ret_new_tensor); if (create_err != Error::Ok) { - std::cout << "Error: failed to create new tensor with empty_strided" - << std::endl; + ET_LOG(Error, "failed to create new tensor with empty_strided"); return create_err; } // Copy data from source tensor to new tensor AOTITorchError copy_err = aoti_torch_copy_(*ret_new_tensor, self, 0); if (copy_err != Error::Ok) { - std::cout << "Error: failed to copy data from source tensor to new tensor" - << std::endl; + ET_LOG(Error, "failed to copy data from source tensor to new tensor"); // Clean up the created tensor on failure aoti_torch_delete_tensor_object(*ret_new_tensor); *ret_new_tensor = nullptr; @@ -603,7 +658,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( void cleanup_memory() { is_tensor_own_memory.clear(); if (!tensors.empty()) { - std::cout << "Warning: tensors not empty" << std::endl; + ET_LOG(Error, "Warning: tensors not empty during cleanup"); } } diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/runtime/shims/memory.h index 37c5a5796f5..8e8e2910b03 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/runtime/shims/memory.h @@ -66,7 +66,7 @@ AOTITorchError aoti_torch__reinterpret_tensor( AOTITensorHandle* ret_new_tensor); // Utility functions -void checkCudaError(cudaError_t err, const char* msg); +AOTITorchError 
checkCudaError(cudaError_t err, const char* msg);
 
 void cleanup_memory();
 
 } // extern "C"
diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp
index 441cd719fa9..d5399125b3b 100644
--- a/backends/aoti/runtime/shims/utils.cpp
+++ b/backends/aoti/runtime/shims/utils.cpp
@@ -7,6 +7,7 @@
  */
 
 #include "utils.h"
+#include
 #include
 #include
 #include
@@ -199,8 +200,10 @@ void cleanup_aoti_tensor_output() {
 AOTITorchError validate_dtype(int32_t dtype) {
   // Only float32 tensors are supported (dtype 6)
   if (dtype != 6) {
-    std::cout << "ERROR: Only float32 tensors are supported. Got dtype: "
-              << dtype << " (expected: 6 for float32)" << std::endl;
+    ET_LOG(
+        Error,
+        "Only float32 tensors are supported. Got dtype: %d (expected: 6 for float32)",
+        dtype);
     return Error::InvalidArgument;
   }
   return Error::Ok;
@@ -210,8 +213,10 @@ AOTITorchError validate_dtype(int32_t dtype) {
 AOTITorchError validate_storage_offset(int64_t storage_offset) {
   // Storage offset must always be 0
   if (storage_offset != 0) {
-    std::cout << "ERROR: Storage offset must be 0. Got storage_offset: "
-              << storage_offset << std::endl;
+    ET_LOG(
+        Error,
+        "Storage offset must be 0. Got storage_offset: %ld",
+        storage_offset);
     return Error::InvalidArgument;
   }
   return Error::Ok;

From 5d4c928249c8743a0bc125ba481ffac38c440464 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Tue, 16 Sep 2025 11:26:45 -0700
Subject: [PATCH 44/50] remove spaghetti code 1/n

---
 CMakeLists.txt                                |   1 -
 backends/aoti/aoti_backend.py                 |  47 ++--
 backends/aoti/runtime/aoti_backend.cpp        |   2 +
 backends/aoti/runtime/shims/memory.cpp        |  14 +-
 .../aoti/runtime/shims/tensor_attribute.cpp   |  43 +++-
 backends/aoti/runtime/shims/utils.cpp         | 234 +++++------------
 backends/aoti/runtime/shims/utils.h           |  27 +-
 exir/backend/backend_api.py                   |   1 -
 exir/emit/_emit_program.py                    |  10 -
 exir/program/_program.py                      |  10 -
 runtime/executor/method.cpp                   |   2 -
 11 files changed, 159 insertions(+), 232 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3debc9fcf5..ad3163a2297 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,7 +49,6 @@
 # https://github.com/google/XNNPACK/commit/c690daa67f883e1b627aadf7684c06797e9a0684
 cmake_minimum_required(VERSION 3.29)
 project(executorch)
-# project(executorch LANGUAGES CXX CUDA)
 
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index a64bb9c5cc5..a07f91eaee7 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -79,36 +79,34 @@ def preprocess(
     # print("here", edge_program.example_inputs)
     copy_edge_program = copy.deepcopy(edge_program)
 
-    # Move the edge_program from CPU to CUDA using move_to_device_pass
-    copy_edge_program = move_to_device_pass(copy_edge_program, "cuda")
-    # graph_module = copy_edge_program.graph_module
-    edge_program_module = copy_edge_program.module()
+    # Move the edge_program from CPU to CUDA for aoti compile
+    cuda_edge_program = move_to_device_pass(copy_edge_program, "cuda")
+
+    edge_program_module = cuda_edge_program.module()
     args, kwargs = copy_edge_program.example_inputs
 
-    # Deep copy args and move tensors to CUDA for aot_compile
-    def move_to_cuda(obj):
-        if isinstance(obj, torch.Tensor):
-            return obj.cuda()
-        elif isinstance(obj, (list, tuple)):
-            return type(obj)(move_to_cuda(item) for item in obj)
-        elif isinstance(obj, dict):
-            return {key: move_to_cuda(value) for key, value in obj.items()}
-        else:
-            return obj
+    # # Deep copy args and move tensors to CUDA for aot_compile
+    # def move_to_cuda(obj):
+    #     if isinstance(obj, torch.Tensor):
+    #         return obj.cuda()
+    #     elif isinstance(obj, (list, tuple)):
+    #         return type(obj)(move_to_cuda(item) for item in obj)
+    #     elif isinstance(obj, dict):
+    #         return {key: move_to_cuda(value) for key, value in obj.items()}
+    #     else:
+    #         return obj
 
-    args = move_to_cuda(copy.deepcopy(args))
-    kwargs =
+    # args = move_to_cuda(copy.deepcopy(args))
+    # kwargs = 
move_to_cuda(copy.deepcopy(kwargs)) - - # print("args, kwargs", args, kwargs) - print("len(args)", len(args)) - print("args[0].shape", args[0].shape) - print("len(kwargs)", len(kwargs)) + # # Deep copy args and move tensors to CUDA for aot_compile + # def move_to_cuda(obj): + # if isinstance(obj, torch.Tensor): + # return obj.cuda() + # elif isinstance(obj, (list, tuple)): + # return type(obj)(move_to_cuda(item) for item in obj) + # elif isinstance(obj, dict): + # return {key: move_to_cuda(value) for key, value in obj.items()} + # else: + # return obj + + # args = move_to_cuda(copy.deepcopy(args)) + # kwargs = move_to_cuda(copy.deepcopy(kwargs)) output_path = os.path.join(os.getcwd(), "aoti.so") options: dict[str, typing.Any] = { + "aot_inductor.embed_kernel_binary": True, + "aot_inductor.link_libtorch": False, "aot_inductor.package_constants_in_so": True, "aot_inductor.output_path": output_path, + "aot_inductor.debug_compile": True, "aot_inductor.force_mmap_weights": False, "max_autotune": True, "max_autotune_gemm_backends": "TRITON", @@ -126,11 +124,6 @@ def move_to_cuda(obj): assert so_path == output_path, f"Expected {output_path} but got {so_path}" - check_call( - f"patchelf --remove-needed libtorch.so --remove-needed libc10.so --remove-needed libtorch_cuda.so --remove-needed libc10_cuda.so --remove-needed libtorch_cpu.so --add-needed libcudart.so {output_path}", - shell=True, - ) - print("so_path", so_path) with open(so_path, "rb") as f: diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/runtime/aoti_backend.cpp index 6160670042b..242ee24e1d9 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/runtime/aoti_backend.cpp @@ -71,6 +71,8 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ) const override { const NamedDataMap* named_data_map = context.get_named_data_map(); + // std::string so_path = "/home/gasoonjia/executorch/aoti.so"; + std::string so_path = "/tmp/test.so"; std::string so_blob_key = "so_blob"; diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/runtime/shims/memory.cpp index ebc6a0012a0..99990f4aeab 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/runtime/shims/memory.cpp @@ -134,15 +134,12 @@ AOTITorchError aoti_torch_create_tensor_from_blob_v2( return Error::InvalidArgument; } - // Since storage_offset is guaranteed to be 0, use data pointer directly - void* adjusted_data = data; - // Create ExecutorTorch tensor that wraps the existing memory // Note: We're NOT copying the data, just wrapping it auto tensor = executorch::extension::make_tensor_ptr( sizes, // tensor dimensions - adjusted_data, // existing memory (don't copy!) - executorch::aten::ScalarType::Float // only supported dtype + data, // existing memory (don't copy!) 
+ dtype_to_scalar_type(dtype) // map int32_t dtype to ScalarType ); if (!tensor) { @@ -179,7 +176,12 @@ AOTITorchError aoti_torch_empty_strided( return dtype_error; } - int64_t nbytes = numel * 4; + size_t element_size = dtype_to_element_size(dtype); + if (element_size == 0) { + ET_LOG(Error, "Invalid element size for dtype: %d", dtype); + return Error::InvalidArgument; + } + int64_t nbytes = numel * element_size; if (device_type == 1) { // cuda cudaError_t err = cudaMalloc(&ptr, nbytes); diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/runtime/shims/tensor_attribute.cpp index 57a3805100f..8d26bbbbe30 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/runtime/shims/tensor_attribute.cpp @@ -138,9 +138,48 @@ aoti_torch_device_type_cuda() { return 1; } +// Dtype constants - these return the PyTorch dtype codes +// Currently only float32 is supported, but using robust enum-based approach __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() { - return 6; -} + return static_cast(SupportedDTypes::FLOAT32); +} + +// Future dtype support (commented out for now): +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bool() { +// return static_cast(SupportedDTypes::BOOL); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_uint8() { +// return static_cast(SupportedDTypes::UINT8); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int8() { +// return static_cast(SupportedDTypes::INT8); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int16() { +// return static_cast(SupportedDTypes::INT16); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int32() { +// return static_cast(SupportedDTypes::INT32); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int64() { +// return static_cast(SupportedDTypes::INT64); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float16() { +// return static_cast(SupportedDTypes::FLOAT16); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float64() { +// return static_cast(SupportedDTypes::FLOAT64); +// } +// +// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bfloat16() { +// return static_cast(SupportedDTypes::BFLOAT16); +// } void cleanup_tensor_metadata() { tensor_to_sizes.clear(); diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp index d5399125b3b..8b1734082bd 100644 --- a/backends/aoti/runtime/shims/utils.cpp +++ b/backends/aoti/runtime/shims/utils.cpp @@ -26,187 +26,79 @@ const char* const TENSOR_OUTPUT_FILENAME = extern "C" { -// void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg) { -// printf("Printing tensor handle: %p\n", self); - -// if (!self) { -// throw std::runtime_error("Tensor handle is null"); -// } - -// printf("Tensor handle is not null\n"); - -// // Get dtype and check if it's float32 (dtype 6 in PyTorch) -// int32_t dtype = 0; -// if (aoti_torch_get_dtype(self, &dtype) != AOTI_TORCH_SUCCESS) { -// throw std::runtime_error("Failed to get tensor dtype"); -// } - -// printf("Tensor dtype is: %d\n", dtype); - -// if (dtype != 6) { // 6 is the dtype code for float32 in PyTorch -// throw std::runtime_error( -// "Tensor dtype is not float32. 
Expected dtype 6, got: " + -// std::to_string(dtype)); -// } - -// printf("Tensor dtype is float32\n"); - -// // Get data pointer -// void* data_ptr = nullptr; -// if (aoti_torch_get_data_ptr(self, &data_ptr) != AOTI_TORCH_SUCCESS || -// !data_ptr) { -// throw std::runtime_error("Failed to get tensor data pointer"); -// } - -// printf("Tensor data pointer is %p not null\n", data_ptr); - -// // Get dimensions -// int64_t dim = 0; -// if (aoti_torch_get_dim(self, &dim) != AOTI_TORCH_SUCCESS) { -// throw std::runtime_error("Failed to get tensor dimensions"); -// } - -// printf("Tensor dimensions are: %ld\n", dim); - -// // Get sizes -// int64_t* sizes = nullptr; -// if (aoti_torch_get_sizes(self, &sizes) != AOTI_TORCH_SUCCESS || !sizes) { -// throw std::runtime_error("Failed to get tensor sizes"); -// } - -// printf("Tensor sizes are: %ld\n", sizes); - -// // Calculate total number of elements -// int64_t total_elements = 1; -// for (int i = 0; i < dim; i++) { -// total_elements *= sizes[i]; -// } - -// printf("Total elements in tensor: %ld\n", total_elements); - -// // Check device type to handle CUDA tensors properly -// int32_t device_type = 0; -// if (aoti_torch_get_device_type(self, &device_type) != AOTI_TORCH_SUCCESS) { -// throw std::runtime_error("Failed to get tensor device type"); -// } - -// printf("Tensor device type: %d\n", device_type); - -// AtenTensorHandle cpu_tensor = nullptr; -// const float* float_data = nullptr; -// bool need_cleanup = false; - -// // Check if tensor is on CUDA (device_type 1 is CUDA) -// if (device_type == 1) { -// printf("Tensor is on CUDA, copying to CPU...\n"); - -// // Get strides for creating CPU tensor -// int64_t* strides = nullptr; -// if (aoti_torch_get_strides(self, &strides) != AOTI_TORCH_SUCCESS || -// !strides) { -// throw std::runtime_error("Failed to get tensor strides"); -// } - -// // Create a CPU tensor with same shape and layout -// if (aoti_torch_empty_strided( -// dim, sizes, strides, dtype, 0, -1, &cpu_tensor) != -// AOTI_TORCH_SUCCESS) { -// throw std::runtime_error("Failed to create CPU tensor"); -// } - -// // Copy data from CUDA to CPU tensor -// if (aoti_torch_copy_(cpu_tensor, self, 0) != AOTI_TORCH_SUCCESS) { -// aoti_torch_delete_tensor_object(cpu_tensor); -// throw std::runtime_error("Failed to copy tensor from CUDA to CPU"); -// } - -// // Get CPU data pointer -// void* cpu_data_ptr = nullptr; -// if (aoti_torch_get_data_ptr(cpu_tensor, &cpu_data_ptr) != -// AOTI_TORCH_SUCCESS || -// !cpu_data_ptr) { -// aoti_torch_delete_tensor_object(cpu_tensor); -// throw std::runtime_error("Failed to get CPU tensor data pointer"); -// } - -// float_data = static_cast(cpu_data_ptr); -// need_cleanup = true; -// printf("Successfully copied CUDA tensor to CPU\n"); -// } else { -// // Tensor is already on CPU, use original data pointer -// printf("Tensor is on CPU, using original data pointer\n"); -// float_data = static_cast(data_ptr); -// } - -// // Open file for writing (append mode to not overwrite previous outputs) -// printf("Writing tensor to file: %s\n", internal::TENSOR_OUTPUT_FILENAME); - -// std::ofstream output_file( -// internal::TENSOR_OUTPUT_FILENAME, std::ios::out | std::ios::app); -// if (!output_file.is_open()) { -// if (need_cleanup) { -// aoti_torch_delete_tensor_object(cpu_tensor); -// } -// throw std::runtime_error( -// "Failed to open output file: " + -// std::string(internal::TENSOR_OUTPUT_FILENAME)); -// } - -// printf("Successfully opened file for writing\n"); +// Function to cleanup the tensor output file (to be 
called from +// aoti_backend.cpp) +void cleanup_aoti_tensor_output() { + // No cleanup needed since file is opened and closed on each call +} -// // Write message and tensor info to file -// output_file << "=== " << msg << " ===" << std::endl; -// output_file << "Device type: " << device_type << std::endl; -// output_file << "Dimensions: " << dim << std::endl; -// output_file << "Sizes: ["; -// for (int i = 0; i < dim; i++) { -// output_file << sizes[i]; -// if (i < dim - 1) -// output_file << ", "; -// } -// output_file << "]" << std::endl; -// output_file << "Total elements: " << total_elements << std::endl; -// output_file << "Data content:" << std::endl; +// Helper function to check if a dtype is supported +bool is_dtype_supported_in_et_cuda(int32_t dtype) { + switch (dtype) { + case static_cast(SupportedDTypes::FLOAT32): + return true; + // case static_cast(SupportedDTypes::BOOL): + // case static_cast(SupportedDTypes::UINT8): + // case static_cast(SupportedDTypes::INT8): + // case static_cast(SupportedDTypes::INT16): + // case static_cast(SupportedDTypes::INT32): + // case static_cast(SupportedDTypes::INT64): + // case static_cast(SupportedDTypes::FLOAT16): + // case static_cast(SupportedDTypes::FLOAT64): + // case static_cast(SupportedDTypes::BFLOAT16): + // return true; + default: + return false; + } +} -// // Write tensor data to file (now safe to access) -// for (int64_t i = 0; i < total_elements; i++) { -// output_file << float_data[i] << " "; -// if (i < total_elements - 1) { -// output_file << ", "; -// // Add newline every 10 elements for readability -// if ((i + 1) % 10 == 0) { -// output_file << std::endl; -// } -// } -// } -// output_file << std::endl << std::endl; +// Map int32_t dtype to number of bytes per element (reusing ExecutorTorch's +// elementSize function) +size_t dtype_to_element_size(int32_t dtype) { + // First convert int32_t dtype to ExecutorTorch ScalarType, then use existing + // elementSize function + executorch::aten::ScalarType scalar_type = dtype_to_scalar_type(dtype); + if (scalar_type == executorch::aten::ScalarType::Undefined) { + ET_LOG(Error, "Unsupported dtype: %d for element size calculation", dtype); + return 0; // Return 0 to indicate error + } -// // Clean up CPU tensor if we created one -// if (need_cleanup) { -// aoti_torch_delete_tensor_object(cpu_tensor); -// printf("Cleaned up temporary CPU tensor\n"); -// } + // Reuse ExecutorTorch's existing elementSize function from scalar_type_util.h + return executorch::runtime::elementSize(scalar_type); +} -// // File will be automatically closed when output_file goes out of scope -// } +// Map int32_t dtype to ExecutorTorch ScalarType (robust version of hardcoded +// ScalarType::Float) +executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) { + // First check if the dtype is supported + if (!is_dtype_supported_in_et_cuda(dtype)) { + ET_LOG(Error, "Unsupported dtype: %d for ScalarType conversion", dtype); + return executorch::aten::ScalarType::Undefined; + } -// Function to cleanup the tensor output file (to be called from -// aoti_backend.cpp) -void cleanup_aoti_tensor_output() { - // No cleanup needed since file is opened and closed on each call + // If supported, use switch to convert + switch (dtype) { + case static_cast(SupportedDTypes::FLOAT32): + return executorch::aten::ScalarType::Float; + default: + ET_LOG( + Error, "Unexpected error in dtype conversion for dtype: %d", dtype); + return executorch::aten::ScalarType::Undefined; + } } // Dtype validation utility function 
AOTITorchError validate_dtype(int32_t dtype) { - // Only float32 tensors are supported (dtype 6) - if (dtype != 6) { - ET_LOG( - Error, - "Only float32 tensors are supported. Got dtype: %d (expected: 6 for float32)", - dtype); - return Error::InvalidArgument; + if (is_dtype_supported_in_et_cuda(dtype)) { + return Error::Ok; } - return Error::Ok; + + ET_LOG( + Error, + "Unsupported dtype: %d. Supported dtypes: %d (float32)", + dtype, + static_cast(SupportedDTypes::FLOAT32)); + return Error::InvalidArgument; } // Storage offset validation utility function diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h index 630bfa3d74c..a2af9e95e56 100644 --- a/backends/aoti/runtime/shims/utils.h +++ b/backends/aoti/runtime/shims/utils.h @@ -16,10 +16,33 @@ namespace executorch { namespace backends { namespace aoti { +// Enum for supported data types in et-cuda backend +enum class SupportedDTypes : int32_t { + FLOAT32 = 6, // PyTorch's float32 dtype code + + // BOOL = 11, // PyTorch's bool dtype code + // UINT8 = 1, // PyTorch's uint8 dtype code + // INT8 = 2, // PyTorch's int8 dtype code + // INT16 = 3, // PyTorch's int16 dtype code + // INT32 = 4, // PyTorch's int32 dtype code + // INT64 = 5, // PyTorch's int64 dtype code + // FLOAT16 = 7, // PyTorch's float16 dtype code + // FLOAT64 = 8, // PyTorch's float64 dtype code + // BFLOAT16 = 15 // PyTorch's bfloat16 dtype code +}; + extern "C" { -// // Utility function for printing tensor information -// void aoti_torch_print_tensor_handle(AOTITensorHandle self, const char* msg); +// Helper function to check if a dtype is supported +bool is_dtype_supported_in_et_cuda(int32_t dtype); + +// Map int32_t dtype to number of bytes per element (reusing ExecutorTorch's +// elementSize function) +size_t dtype_to_element_size(int32_t dtype); + +// Map int32_t dtype to ExecutorTorch ScalarType (robust version of hardcoded +// ScalarType::Float) +executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype); // Cleanup function for tensor output file (called during backend destruction) void cleanup_aoti_tensor_output(); diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py index 95c7c9caa6d..c93c41e223c 100644 --- a/exir/backend/backend_api.py +++ b/exir/backend/backend_api.py @@ -720,7 +720,6 @@ def to_backend( fake_edge_program = copy.deepcopy(edge_program) partitioner_result = partitioner_instance(fake_edge_program) tagged_exported_program = partitioner_result.tagged_exported_program - # Make sure tagged_exported_program has the same example_inputs as edge_program tagged_exported_program.example_inputs = edge_program.example_inputs method_to_tagged_exported_program[method_name] = tagged_exported_program diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py index 61997e97687..3430ad7a920 100644 --- a/exir/emit/_emit_program.py +++ b/exir/emit/_emit_program.py @@ -156,13 +156,9 @@ def emit_program( instruction_id_to_num_outs_map = {} program_state = _ProgramState() - print( - "111111111111111111111111111111111111111111111111111111111111111111111111111111" - ) # emit each entry point in order according to name. 
for name, exported_program in sorted(methods.items()): - print(name) # create empty state emitter_state = _EmitterState( values=[], @@ -174,8 +170,6 @@ def emit_program( emit_mutable_buffer_names=emit_mutable_buffer_names, ) - print("222222222222222222222222222222222222222222222222222222222222222222222") - gm = _remove_non_user_outputs(exported_program) emitter = _TopLevelEmitter( @@ -184,8 +178,6 @@ def emit_program( emitter.run() - print("333333333333333333333333333333333333333333333333333333333333333333333") - plans.append(emitter.plan()) debug_handle_map[name] = emitter.debug_handle_map @@ -202,8 +194,6 @@ def emit_program( if prim_getters is not None: plans.extend(emitter._emit_prim_getters(prim_getters)) - print("333333333333333333333333333333333333333333333333333333333333333333333") - return EmitterOutput( debug_handle_map=debug_handle_map, method_to_delegate_debug_id_map=method_to_delegate_debug_id_map, diff --git a/exir/program/_program.py b/exir/program/_program.py index e3ada9301b7..c740bbcb7b3 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1698,16 +1698,8 @@ def to_executorch( # noqa (FLAKE8) C901 """ config = config if config else ExecutorchBackendConfig() - def exported_program_to_device(exported_program, device): - for _, param in exported_program.named_parameters(): - param.data = param.data.to(device) - for _, buffer in exported_program.named_buffers(): - buffer.data = buffer.data.to(device) - return exported_program - execution_programs: Dict[str, ExportedProgram] = {} for name, program in self._edge_programs.items(): - # program = exported_program_to_device(program, "cpu") if config.do_quant_fusion_and_const_prop: if program.graph_signature.backward_signature is not None: raise Exception( @@ -1834,7 +1826,6 @@ def __init__( backend_config = backend_config or ExecutorchBackendConfig() - print("start emitting..") # Emit methods self._emitter_output: EmitterOutput = emit_program( self._execution_programs, @@ -1842,7 +1833,6 @@ def __init__( self._config_methods, backend_config.emit_mutable_buffer_names, ) - print("done. start serializing..") # Serialize emitter output, ready to be written to a file. 
self._data_serializer = FlatTensorSerializer() diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp index 1c90f88df7c..65a47594c8d 100644 --- a/runtime/executor/method.cpp +++ b/runtime/executor/method.cpp @@ -1580,8 +1580,6 @@ Error Method::execute() { "chain %" ET_PRIsize_t " has no instructions field", step_state_.chain_idx); - ET_LOG(Debug, "Executing chain idx: %" ET_PRIsize_t, step_state_.chain_idx); - // Loop over instructions step_state_.instr_idx = 0; while (step_state_.instr_idx < chain.s_chain_->instructions()->size()) { From 3e2f2b7b72b14962fc56051eabe8f3332682462e Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Wed, 17 Sep 2025 11:31:19 -0700 Subject: [PATCH 45/50] remove unnecessary export code --- backends/aoti/aoti_backend.py | 26 ++---------- backends/aoti/aoti_partitioner.py | 70 +++++++------------------------ 2 files changed, 17 insertions(+), 79 deletions(-) diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py index a07f91eaee7..58d94d4042f 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/aoti_backend.py @@ -73,31 +73,15 @@ def preprocess( compile_specs: List[CompileSpec], ) -> PreprocessResult: - print("entering the lowerable parts in AotiBackend.preprocess....") named_data_store = NamedDataStore() - # print("here", edge_program.example_inputs) - copy_edge_program = copy.deepcopy(edge_program) + # copy_edge_program = copy.deepcopy(edge_program) # Move the edge_program from CPU to CUDA for aoti compile - cuda_edge_program = move_to_device_pass(copy_edge_program, "cuda") + cuda_edge_program = move_to_device_pass(edge_program, "cuda") edge_program_module = cuda_edge_program.module() - args, kwargs = copy_edge_program.example_inputs - - # # Deep copy args and move tensors to CUDA for aot_compile - # def move_to_cuda(obj): - # if isinstance(obj, torch.Tensor): - # return obj.cuda() - # elif isinstance(obj, (list, tuple)): - # return type(obj)(move_to_cuda(item) for item in obj) - # elif isinstance(obj, dict): - # return {key: move_to_cuda(value) for key, value in obj.items()} - # else: - # return obj - - # args = move_to_cuda(copy.deepcopy(args)) - # kwargs = move_to_cuda(copy.deepcopy(kwargs)) + args, kwargs = cuda_edge_program.example_inputs output_path = os.path.join(os.getcwd(), "aoti.so") @@ -122,10 +106,6 @@ def preprocess( "Please add them to the AOTI backend." 
            )
-
-        assert so_path == output_path, f"Expected {output_path} but got {so_path}"
-
-        print("so_path", so_path)
-
        with open(so_path, "rb") as f:
            so_data = f.read()

diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/aoti_partitioner.py
index 6aeb63f959d..6b9089e5915 100644
--- a/backends/aoti/aoti_partitioner.py
+++ b/backends/aoti/aoti_partitioner.py
@@ -6,8 +6,7 @@
 
 # pyre-unsafe
 
-import operator
-from typing import Callable, cast, Dict, final, List, Optional, Set, Tuple
+from typing import Callable, Dict, final, List, Optional, Tuple
 
 import torch
 from executorch.backends.aoti.aoti_backend import AotiBackend  # usort: skip
@@ -18,65 +17,26 @@
     PartitionResult,
 )
 from executorch.exir.backend.utils import tag_constant_data
-from executorch.exir.dialects._ops import ops as exir_ops
 
 from torch.export.exported_program import ExportedProgram
-from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
-
-from torch.fx.passes.operator_support import OperatorSupportBase
-
-
-class AOTISupportedOperators(OperatorSupportBase):
-    def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
-        # supported = node.op == "call_function" and (
-        #     node.target == operator.getitem
-        #     or str(node.target._op) not in inductor_fallback_ops
-        #     or str(node.target._op) in supported_fallback_operators
-        # )
-
-        supported = node.op == "call_function"
-
-        return supported
-
-    def is_node_supported_custom(self, node: torch.fx.Node) -> bool:
-        if node.target == exir_ops.edge.aten.mean.dim:
-            keep_dim = node.args[2] if len(node.args) > 2 else False
-            return cast(bool, keep_dim)
-        if node.target == exir_ops.edge.aten.var.correction:
-            keep_dim = node.kwargs.get("keepdim", False)
-            return cast(bool, keep_dim)
-        return True
 
 
 @final
 class AotiPartitioner(Partitioner):
     def __init__(self, compile_spec: List[CompileSpec]) -> None:
         self.delegation_spec = DelegationSpec(AotiBackend.__name__, compile_spec)
-        print(self.delegation_spec)
 
     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
-        # Run the CapabilityBasedPartitioner to return the largest possible
-        # subgraphs containing the nodes with the tags
-        # logger.info("AotiPartitioner::partition")
-        print("entering partitioner...")
-
-        partition_tags = {}
-
-        capability_partitioner = CapabilityBasedPartitioner(
-            exported_program.graph_module,
-            AOTISupportedOperators(),
-            allows_single_node_partition=True,
-        )
-        partition_list = capability_partitioner.propose_partitions()
-
-        assert len(partition_list) == 1, "Graph break is not supported yet"
-
-        print(f"graph breaks into {len(partition_list)} parts")
+        """
+        Fully delegate the graph to AOTInductor by tagging all nodes as a single partition.
+        """
 
-        for partition in partition_list:
-            for node in partition.nodes:
-                tag = f"tag{partition.id}"
-                node.meta["delegation_tag"] = tag
-                partition_tags[tag] = self.delegation_spec
+        partition_tags: Dict[str, DelegationSpec] = {}
+        for node in exported_program.graph.nodes:
+            if node.op != "call_function":
+                continue
+            tag = "tag0"
+            node.meta["delegation_tag"] = tag
+            partition_tags[tag] = self.delegation_spec
 
         tag_constant_data(exported_program)
 
@@ -89,15 +49,13 @@ def ops_to_not_decompose(
     ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
         """
         Return a list of operations that should not be decomposed and let the AOT compiler handle them.
+        Currently we skip decomposing all ops so the AOT compiler sees them unchanged.
         """
         do_not_decompose = set()
-        op_support = AOTISupportedOperators()
 
         for node in ep.graph.nodes:
-            if (
-                node.op == "call_function"
-                and isinstance(node.target, torch._ops.OpOverload)
-                and op_support.is_node_supported(None, node)
+            if node.op == "call_function" and isinstance(
+                node.target, torch._ops.OpOverload
             ):
                 do_not_decompose.add(node.target)
         return list(do_not_decompose), None

From 32c14b12f154614d69418c3a38a7d280365f3124 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Wed, 17 Sep 2025 14:20:22 -0700
Subject: [PATCH 46/50] set example input to only the very first partition

---
 backends/aoti/aoti_backend.py  |  2 --
 exir/backend/backend_api.py    | 10 ++++++++--
 exir/lowered_backend_module.py |  8 +++++++-
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/aoti_backend.py
index 58d94d4042f..21fcd5d86f0 100644
--- a/backends/aoti/aoti_backend.py
+++ b/backends/aoti/aoti_backend.py
@@ -75,8 +75,6 @@ def preprocess(
         named_data_store = NamedDataStore()
 
-        # copy_edge_program = copy.deepcopy(edge_program)
-
         # Move the edge_program from CPU to CUDA for aoti compile
         cuda_edge_program = move_to_device_pass(edge_program, "cuda")
 
         edge_program_module = cuda_edge_program.module()

diff --git a/exir/backend/backend_api.py b/exir/backend/backend_api.py
index c93c41e223c..d0225437c99 100644
--- a/exir/backend/backend_api.py
+++ b/exir/backend/backend_api.py
@@ -268,7 +268,9 @@ def _partition_and_lower_one_graph_module(
     """
     Partitioned and lowered the graph module based on the partition tag, this is to handle one graph module.
     """
-    for tag, delegation_spec in partition_result.partition_tags.items():
+    for idx, (tag, delegation_spec) in enumerate(
+        partition_result.partition_tags.items()
+    ):
         # Create partition with nodes containing this tag. There should only be
         # one contained submodule per tag
         node_list = _get_node_list_with_same_tag(
@@ -311,6 +313,7 @@ def _partition_and_lower_one_graph_module(
             tag,
             call_module_node,
             is_submodule,
+            idx == 0,
         )
 
         lowered_submodule = to_backend(
@@ -452,7 +455,9 @@ def _create_partitions_in_graph_module(
     is_submodule: bool,
 ) -> Dict[str, List[torch.fx.Node]]:
     backend_id_to_submodule_name = {}
-    for tag, delegation_spec in partition_result.partition_tags.items():
+    for idx, (tag, delegation_spec) in enumerate(
+        partition_result.partition_tags.items()
+    ):
         # Create partition with nodes containing this tag. There should only be
         # one contained submodule per tag
         node_list = _get_node_list_with_same_tag(
@@ -492,6 +497,7 @@ def _create_partitions_in_graph_module(
             tag,
             call_module_node,
             is_submodule,
+            idx == 0,
        )
         call_module_node.meta["backend_id"] = delegation_spec.backend_id
         call_module_node.meta["compile_spec"] = delegation_spec.compile_specs

diff --git a/exir/lowered_backend_module.py b/exir/lowered_backend_module.py
index 2e889c6d81d..3c5ee5d36b0 100644
--- a/exir/lowered_backend_module.py
+++ b/exir/lowered_backend_module.py
@@ -682,6 +682,7 @@ def create_exported_program_from_submodule(
     tag: str,
     call_module_node: torch.fx.Node,
     is_submodule: bool,
+    is_first_partition: bool = False,
 ) -> Tuple[ExportedProgram, Dict[str, InputSpec], Dict[str, OutputSpec]]:
     """
     Creates an ExportedProgram from the given submodule using the parameters and buffers
@@ -720,6 +721,11 @@ def create_exported_program_from_submodule(
     in_spec = pytree.tree_flatten((tuple(subgraph_signature.user_inputs), {}))[1]
     out_spec = pytree.tree_flatten(subgraph_signature.user_outputs)[1]
 
+    # Only the example inputs of the first partition equal the example inputs of the owning program.
+    submodule_example_inputs = (
+        owning_program.example_inputs if is_first_partition else None
+    )
+
     return (
         ExportedProgram(
             root=submodule,
@@ -735,7 +741,7 @@ def create_exported_program_from_submodule(
                 ),
             )
         ],
-        example_inputs=owning_program.example_inputs,
+        example_inputs=submodule_example_inputs,
         constants=subgraph_constants,
         verifiers=[owning_program.verifier],
     ),

From 3b028297f9746eec69beb60b223d36f8de2817d4 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Sep 2025 01:39:48 -0700
Subject: [PATCH 47/50] refactor aoti-driven backends

---
 CMakeLists.txt                                |   9 +-
 backends/aoti/CMakeLists.txt                  |  48 +++----
 backends/aoti/README.md                       |   2 -
 backends/aoti/{runtime => }/TARGETS           |   0
 .../{runtime => }/aoti_model_container.cpp    |   0
 .../aoti/{runtime => }/aoti_model_container.h |   2 +-
 .../tensor_attribute.cpp => common_shims.cpp} |  81 ++++--------
 .../tensor_attribute.h => common_shims.h}     |  24 +++-
 backends/aoti/cuda/CMakeLists.txt             |  70 ++++++++++
 backends/aoti/cuda/TARGETS                    |   3 +
 backends/aoti/cuda/__init__.py                |   5 +
 .../{aoti_backend.py => cuda/cuda_backend.py} |   2 +-
 .../cuda_partitioner.py}                      |   6 +-
 .../runtime/cuda_backend.cpp}                 |  35 +++--
 .../aoti/{ => cuda}/runtime/shims/memory.cpp  |   7 +-
 .../aoti/{ => cuda}/runtime/shims/memory.h    |   2 +-
 .../cuda/runtime/shims/tensor_attribute.cpp   |  37 ++++++
 .../runtime/shims/tensor_attribute.h}         |  13 +-
 backends/aoti/cuda/runtime/utils.cpp          |  71 ++++++++++
 backends/aoti/cuda/runtime/utils.h            |  36 ++++++
 backends/aoti/cuda/targets.bzl                |  28 ++++
 backends/aoti/runtime/shims/utils.cpp         | 121 ------------------
 backends/aoti/runtime/shims/utils.h           |  60 ---------
 backends/aoti/{runtime => }/targets.bzl       |  16 +--
 backends/aoti/utils.cpp                       |  83 ++++++++++++
 backends/aoti/utils.h                         |  43 +++++++
 exir/emit/_emit_program.py                    |   1 -
 export_and_run_aoti.sh                        |   4 +-
 export_aoti.py                                |   4 +-
 tools/cmake/preset/default.cmake              |   5 +-
 30 files changed, 485 insertions(+), 333 deletions(-)
 delete mode 100644 backends/aoti/README.md
 rename backends/aoti/{runtime => }/TARGETS (100%)
 rename backends/aoti/{runtime => }/aoti_model_container.cpp (100%)
 rename backends/aoti/{runtime => }/aoti_model_container.h (98%)
 rename backends/aoti/{runtime/shims/tensor_attribute.cpp => common_shims.cpp} (62%)
 rename backends/aoti/{runtime/shims/tensor_attribute.h => common_shims.h} (74%)
 create mode 100644 backends/aoti/cuda/CMakeLists.txt
 create mode 100644
backends/aoti/cuda/TARGETS create mode 100644 backends/aoti/cuda/__init__.py rename backends/aoti/{aoti_backend.py => cuda/cuda_backend.py} (99%) rename backends/aoti/{aoti_partitioner.py => cuda/cuda_partitioner.py} (91%) rename backends/aoti/{runtime/aoti_backend.cpp => cuda/runtime/cuda_backend.cpp} (93%) rename backends/aoti/{ => cuda}/runtime/shims/memory.cpp (98%) rename backends/aoti/{ => cuda}/runtime/shims/memory.h (97%) create mode 100644 backends/aoti/cuda/runtime/shims/tensor_attribute.cpp rename backends/aoti/{runtime/shims/types.h => cuda/runtime/shims/tensor_attribute.h} (74%) create mode 100644 backends/aoti/cuda/runtime/utils.cpp create mode 100644 backends/aoti/cuda/runtime/utils.h create mode 100644 backends/aoti/cuda/targets.bzl delete mode 100644 backends/aoti/runtime/shims/utils.cpp delete mode 100644 backends/aoti/runtime/shims/utils.h rename backends/aoti/{runtime => }/targets.bzl (68%) create mode 100644 backends/aoti/utils.cpp create mode 100644 backends/aoti/utils.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ad3163a2297..21ec1ba8e7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,7 +50,6 @@ cmake_minimum_required(VERSION 3.29) project(executorch) - set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}) include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake) @@ -592,9 +591,13 @@ if(EXECUTORCH_BUILD_CORTEX_M) list(APPEND _executorch_backends coretex_m_backend) endif() -if(EXECUTORCH_BUILD_AOTI) +if(EXECUTORCH_BUILD_CUDA) + # Build common AOTI functionality (required for CUDA) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti) - list(APPEND _executorch_backends aoti_backend) + # Build CUDA-specific AOTI functionality + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti/cuda) + # Add aoti_cuda to backends - it already depends on aoti_common + list(APPEND _executorch_backends aoti_cuda) endif() if(EXECUTORCH_BUILD_EXTENSION_APPLE) diff --git a/backends/aoti/CMakeLists.txt b/backends/aoti/CMakeLists.txt index ca26f30d73e..ab3ac80e57a 100644 --- a/backends/aoti/CMakeLists.txt +++ b/backends/aoti/CMakeLists.txt @@ -21,48 +21,34 @@ if(NOT EXECUTORCH_ROOT) set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..) 
endif() -find_package(CUDAToolkit REQUIRED) - # Use ExecutorTorch's standard way to find PyTorch libraries for AOTI include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) find_package_torch() -set(_aoti_sources - runtime/aoti_backend.cpp - runtime/aoti_model_container.cpp - runtime/shims/memory.cpp - runtime/shims/tensor_attribute.cpp - runtime/shims/utils.cpp) -add_library(aoti_backend STATIC ${_aoti_sources}) +# Common AOTI functionality (non-CUDA) +set(_aoti_common_sources aoti_model_container.cpp common_shims.cpp utils.cpp) +add_library(aoti_common STATIC ${_aoti_common_sources}) target_include_directories( - aoti_backend - PUBLIC - ${CUDAToolkit_INCLUDE_DIRS} - $ - $ - # PyTorch AOTI headers from ExecutorTorch's torch detection - ${TORCH_INCLUDE_DIRS} + aoti_common + PUBLIC $ $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} ) -target_compile_options(aoti_backend PUBLIC -fexceptions -frtti -fPIC) +target_compile_options(aoti_common PUBLIC -fexceptions -frtti -fPIC) # Ensure symbols are exported properly -target_link_options(aoti_backend PUBLIC -Wl,--export-dynamic) +target_link_options(aoti_common PUBLIC -Wl,--export-dynamic) -# Link against CUDA::cudart, PyTorch libraries and standard libraries +# Link against PyTorch libraries and standard libraries target_link_libraries( - aoti_backend - PUBLIC - extension_tensor - CUDA::cudart - ${CMAKE_DL_LIBS} - # Link PyTorch libraries for AOTI CUDA functions - ${TORCH_LIBRARIES} + aoti_common + PUBLIC extension_tensor ${CMAKE_DL_LIBS} + # Link PyTorch libraries for AOTI functions + ${TORCH_LIBRARIES} ) -# If you need other CUDA libraries, link them similarly: -# target_link_libraries(aoti_backend PUBLIC CUDA::cublas CUDA::cufft ...) -# If you have a custom function, keep it -executorch_target_link_options_shared_lib(aoti_backend) +executorch_target_link_options_shared_lib(aoti_common) + install( - TARGETS aoti_backend + TARGETS aoti_common EXPORT ExecuTorchTargets DESTINATION lib ) diff --git a/backends/aoti/README.md b/backends/aoti/README.md deleted file mode 100644 index 9df05c99e07..00000000000 --- a/backends/aoti/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Experimental AOTI backend -Proceed with caution. This is an experimental backend that is not yet ready for production use. 
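
Note on the dtype plumbing in the shim diffs that follow: the AOTI shim ABI identifies dtypes by integer PyTorch ScalarType codes, and the common shims hardcode float32 as code 6. Below is a minimal Python-side mirror of that policy, assuming torch's standard ScalarType numbering; the dict and function names are illustrative sketches, not part of this patch.

import torch

# Hypothetical mirror of the shim's dtype policy. The integer codes follow
# torch's ScalarType numbering, which is what crosses the C shim boundary.
SCALAR_TYPE_CODES = {
    torch.uint8: 0, torch.int8: 1, torch.int16: 2, torch.int32: 3,
    torch.int64: 4, torch.float16: 5, torch.float32: 6, torch.float64: 7,
    torch.bool: 11, torch.bfloat16: 15,
}

def validate_dtype(dtype: torch.dtype) -> None:
    # Mirrors validate_dtype() in the runtime shims: only float32 (code 6)
    # is accepted by the et-cuda backend for now.
    if SCALAR_TYPE_CODES.get(dtype) != 6:
        raise ValueError(f"unsupported dtype {dtype}; only float32 (code 6)")

validate_dtype(torch.float32)  # passes; anything else raises
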
diff --git a/backends/aoti/runtime/TARGETS b/backends/aoti/TARGETS similarity index 100% rename from backends/aoti/runtime/TARGETS rename to backends/aoti/TARGETS diff --git a/backends/aoti/runtime/aoti_model_container.cpp b/backends/aoti/aoti_model_container.cpp similarity index 100% rename from backends/aoti/runtime/aoti_model_container.cpp rename to backends/aoti/aoti_model_container.cpp diff --git a/backends/aoti/runtime/aoti_model_container.h b/backends/aoti/aoti_model_container.h similarity index 98% rename from backends/aoti/runtime/aoti_model_container.h rename to backends/aoti/aoti_model_container.h index 39a8a35c14f..d5cae26cd05 100644 --- a/backends/aoti/runtime/aoti_model_container.h +++ b/backends/aoti/aoti_model_container.h @@ -10,7 +10,7 @@ #include #include -#include "shims/memory.h" +#include "cuda/runtime/shims/memory.h" namespace executorch { namespace backends { diff --git a/backends/aoti/runtime/shims/tensor_attribute.cpp b/backends/aoti/common_shims.cpp similarity index 62% rename from backends/aoti/runtime/shims/tensor_attribute.cpp rename to backends/aoti/common_shims.cpp index 8d26bbbbe30..fbc596ce8b0 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.cpp +++ b/backends/aoti/common_shims.cpp @@ -6,20 +6,31 @@ * LICENSE file in the root directory of this source tree. */ -#include "tensor_attribute.h" +#include "common_shims.h" +#include +#include +#include +#include #include -#include "utils.h" +#include namespace executorch { namespace backends { namespace aoti { +namespace internal { +// Constants for file operations +const char* const TENSOR_OUTPUT_FILENAME = + "/home/gasoonjia/executorch/aoti_intermediate_output.txt"; +} // namespace internal + // Global storage for tensor metadata std::unordered_map> tensor_to_sizes; std::unordered_map> tensor_to_strides; extern "C" { +// Autograd mode functions int32_t aoti_torch_grad_mode_is_enabled() { // No autograd ever return false; @@ -31,6 +42,7 @@ void aoti_torch_grad_mode_set_enabled(bool enabled) { } } +// Tensor attribute operations AOTITorchError aoti_torch_get_data_ptr( AOTITensorHandle tensor, void** ret_data_ptr) { @@ -69,12 +81,6 @@ AOTITorchError aoti_torch_get_dtype( int32_t* ret_dtype) { *ret_dtype = static_cast(tensor->scalar_type()); - // ASSERTION: Only float32 tensors are supported - AOTITorchError dtype_error = validate_dtype(*ret_dtype); - if (dtype_error != Error::Ok) { - return dtype_error; - } - return Error::Ok; } @@ -100,13 +106,6 @@ AOTITorchError aoti_torch_get_storage_size( throw std::runtime_error("Cannot get storage size on ETensor"); } -AOTITorchError aoti_torch_get_device_type( - AOTITensorHandle tensor, - int32_t* ret_device_type) { - // All tensors in aoti-cuda delegate are on CUDA - *ret_device_type = aoti_torch_device_type_cuda(); - return Error::Ok; -} AOTITorchError aoti_torch_get_device_index( AOTITensorHandle tensor, @@ -121,6 +120,7 @@ AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim) { return Error::Ok; } +// Device and layout utility functions int32_t aoti_torch_device_type_cpu() { // Let's say cpu is 0 for ET as well return 0; @@ -132,60 +132,23 @@ __attribute__((__visibility__("default"))) int32_t aoti_torch_layout_strided() { return 0; } -__attribute__((__visibility__("default"))) int32_t -aoti_torch_device_type_cuda() { - // Let's say cuda is 1 for ET as well - return 1; -} - // Dtype constants - these return the PyTorch dtype codes // Currently only float32 is supported, but using robust enum-based approach 
__attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float32() { - return static_cast(SupportedDTypes::FLOAT32); + return 6; // PyTorch's float32 dtype code } -// Future dtype support (commented out for now): -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bool() { -// return static_cast(SupportedDTypes::BOOL); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_uint8() { -// return static_cast(SupportedDTypes::UINT8); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int8() { -// return static_cast(SupportedDTypes::INT8); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int16() { -// return static_cast(SupportedDTypes::INT16); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int32() { -// return static_cast(SupportedDTypes::INT32); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_int64() { -// return static_cast(SupportedDTypes::INT64); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float16() { -// return static_cast(SupportedDTypes::FLOAT16); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_float64() { -// return static_cast(SupportedDTypes::FLOAT64); -// } -// -// __attribute__((__visibility__("default"))) int32_t aoti_torch_dtype_bfloat16() { -// return static_cast(SupportedDTypes::BFLOAT16); -// } - +// Cleanup functions void cleanup_tensor_metadata() { tensor_to_sizes.clear(); tensor_to_strides.clear(); } +void cleanup_aoti_tensor_output() { + // Clean up any tensor output related resources + // For now this is a no-op, but can be extended if needed +} + } // extern "C" } // namespace aoti diff --git a/backends/aoti/runtime/shims/tensor_attribute.h b/backends/aoti/common_shims.h similarity index 74% rename from backends/aoti/runtime/shims/tensor_attribute.h rename to backends/aoti/common_shims.h index 20ea3d487a0..260a7661c6b 100644 --- a/backends/aoti/runtime/shims/tensor_attribute.h +++ b/backends/aoti/common_shims.h @@ -9,16 +9,30 @@ #pragma once #include +#include +#include +#include +#include +#include #include #include -#include "types.h" namespace executorch { namespace backends { namespace aoti { +// Common using declarations for ExecutorTorch types +using executorch::runtime::Error; +using executorch::runtime::etensor::Tensor; + extern "C" { +// Common AOTI type aliases +// Note: AOTITensorHandle is aliased to Tensor* for ExecutorTorch compatibility +using AOTITensorHandle = Tensor*; +using AOTIRuntimeError = Error; +using AOTITorchError = Error; + // Global storage for tensor metadata extern std::unordered_map> tensor_to_sizes; extern std::unordered_map> tensor_to_strides; @@ -48,10 +62,6 @@ AOTITorchError aoti_torch_get_storage_size( AOTITensorHandle tensor, int64_t* ret_size); -AOTITorchError aoti_torch_get_device_type( - AOTITensorHandle tensor, - int32_t* ret_device_type); - AOTITorchError aoti_torch_get_device_index( AOTITensorHandle tensor, int32_t* ret_device_index); @@ -60,7 +70,6 @@ AOTITorchError aoti_torch_get_dim(AOTITensorHandle tensor, int64_t* ret_dim); // Utility functions for device and layout information int32_t aoti_torch_device_type_cpu(); -int32_t aoti_torch_device_type_cuda(); int32_t aoti_torch_layout_strided(); int32_t aoti_torch_dtype_float32(); @@ -68,8 +77,9 @@ int32_t aoti_torch_dtype_float32(); int32_t aoti_torch_grad_mode_is_enabled(); void aoti_torch_grad_mode_set_enabled(bool 
enabled); -// Cleanup function for clearing global state +// Cleanup functions for clearing global state void cleanup_tensor_metadata(); +void cleanup_aoti_tensor_output(); } // extern "C" diff --git a/backends/aoti/cuda/CMakeLists.txt b/backends/aoti/cuda/CMakeLists.txt new file mode 100644 index 00000000000..971d92bd044 --- /dev/null +++ b/backends/aoti/cuda/CMakeLists.txt @@ -0,0 +1,70 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Build AOTI CUDA backend for runtime. +# +# ### Editing this file ### +# +# This file should be formatted with +# ~~~ +# cmake-format -i CMakeLists.txt +# ~~~ +# It should also be cmake-lint clean. +# + +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Source root directory for executorch. +if(NOT EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..) +endif() + +find_package(CUDAToolkit REQUIRED) + +# Use ExecutorTorch's standard way to find PyTorch libraries for AOTI +include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) +find_package_torch() + +# CUDA-specific AOTI functionality +set(_aoti_cuda_sources + runtime/cuda_backend.cpp + runtime/shims/memory.cpp + runtime/shims/tensor_attribute.cpp + runtime/utils.cpp) +add_library(aoti_cuda STATIC ${_aoti_cuda_sources}) +target_include_directories( + aoti_cuda + PUBLIC + ${CUDAToolkit_INCLUDE_DIRS} + $ + $ + # PyTorch AOTI headers from ExecutorTorch's torch detection + ${TORCH_INCLUDE_DIRS} +) +target_compile_options(aoti_cuda PUBLIC -fexceptions -frtti -fPIC) +# Ensure symbols are exported properly +target_link_options(aoti_cuda PUBLIC -Wl,--export-dynamic) + +# Link against CUDA::cudart, common AOTI library, and PyTorch CUDA libraries +target_link_libraries( + aoti_cuda + PUBLIC + aoti_common + CUDA::cudart + ${CMAKE_DL_LIBS} + # Link PyTorch libraries for AOTI CUDA functions + ${TORCH_LIBRARIES} +) +# If you need other CUDA libraries, link them similarly: +# target_link_libraries(aoti_cuda PUBLIC CUDA::cublas CUDA::cufft ...) +executorch_target_link_options_shared_lib(aoti_cuda) + + +install( + TARGETS aoti_cuda + EXPORT ExecuTorchTargets + DESTINATION lib +) diff --git a/backends/aoti/cuda/TARGETS b/backends/aoti/cuda/TARGETS new file mode 100644 index 00000000000..77871de4469 --- /dev/null +++ b/backends/aoti/cuda/TARGETS @@ -0,0 +1,3 @@ +load("targets.bzl", "define_common_targets") + +define_common_targets() diff --git a/backends/aoti/cuda/__init__.py b/backends/aoti/cuda/__init__.py new file mode 100644 index 00000000000..2e41cd717f6 --- /dev/null +++ b/backends/aoti/cuda/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
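
Before the renames below, here is a minimal sketch of how the relocated CUDA classes would be driven end to end, assuming the usual export / to_edge / to_backend flow; the toy module and the empty compile-spec list are placeholders, not part of this patch.

import torch
from executorch.backends.aoti.cuda.cuda_partitioner import CudaPartitioner
from executorch.exir import to_edge

class Mul(torch.nn.Module):
    def forward(self, x, y):
        return x * y

ep = torch.export.export(Mul(), (torch.randn(4), torch.randn(4)))
edge = to_edge(ep)
# CudaPartitioner tags every call_function node with "tag0", so the whole
# graph lands in a single partition delegated to CudaBackend.
edge = edge.to_backend(CudaPartitioner([]))
et_program = edge.to_executorch()
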
diff --git a/backends/aoti/aoti_backend.py b/backends/aoti/cuda/cuda_backend.py similarity index 99% rename from backends/aoti/aoti_backend.py rename to backends/aoti/cuda/cuda_backend.py index 21fcd5d86f0..99599de6b6c 100644 --- a/backends/aoti/aoti_backend.py +++ b/backends/aoti/cuda/cuda_backend.py @@ -66,7 +66,7 @@ def generate_c_shim_extern_kernel_call_and_collect_unsupported_kernels( @final -class AotiBackend(BackendDetails): +class CudaBackend(BackendDetails): @staticmethod def preprocess( edge_program: ExportedProgram, diff --git a/backends/aoti/aoti_partitioner.py b/backends/aoti/cuda/cuda_partitioner.py similarity index 91% rename from backends/aoti/aoti_partitioner.py rename to backends/aoti/cuda/cuda_partitioner.py index 6b9089e5915..f48759afa80 100644 --- a/backends/aoti/aoti_partitioner.py +++ b/backends/aoti/cuda/cuda_partitioner.py @@ -9,7 +9,7 @@ from typing import Callable, Dict, final, List, Optional, Tuple import torch -from executorch.backends.aoti.aoti_backend import AotiBackend # usort: skip +from executorch.backends.aoti.cuda.cuda_backend import CudaBackend # usort: skip from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.backend.partitioner import ( DelegationSpec, @@ -21,9 +21,9 @@ @final -class AotiPartitioner(Partitioner): +class CudaPartitioner(Partitioner): def __init__(self, compile_spec: List[CompileSpec]) -> None: - self.delegation_spec = DelegationSpec(AotiBackend.__name__, compile_spec) + self.delegation_spec = DelegationSpec(CudaBackend.__name__, compile_spec) def partition(self, exported_program: ExportedProgram) -> PartitionResult: """ diff --git a/backends/aoti/runtime/aoti_backend.cpp b/backends/aoti/cuda/runtime/cuda_backend.cpp similarity index 93% rename from backends/aoti/runtime/aoti_backend.cpp rename to backends/aoti/cuda/runtime/cuda_backend.cpp index 242ee24e1d9..b6d9bb7d75d 100644 --- a/backends/aoti/runtime/aoti_backend.cpp +++ b/backends/aoti/cuda/runtime/cuda_backend.cpp @@ -25,10 +25,8 @@ #include // Include our shim layer headers -#include "aoti_model_container.h" -#include "shims/memory.h" -#include "shims/tensor_attribute.h" -#include "shims/utils.h" +#include "../../aoti_model_container.h" +#include "../../common_shims.h" namespace executorch { namespace backends { @@ -52,11 +50,11 @@ using executorch::runtime::Result; using executorch::runtime::Span; using executorch::runtime::etensor::Tensor; -class AOTIBackend final : public ::executorch::runtime::BackendInterface { +class CudaBackend final : public ::executorch::runtime::BackendInterface { public: // Once in program - AOTIBackend() { - ET_LOG(Info, "AOTIBackend ctor"); + CudaBackend() { + ET_LOG(Info, "CudaBackend ctor"); } bool is_available() const override { @@ -172,11 +170,11 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { BackendExecutionContext& context, DelegateHandle* handle_, Span args) const override { - ET_LOG(Debug, "AOTIBackend execute"); + ET_LOG(Debug, "CudaBackend execute"); AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_; - ET_LOG(Debug, "AOTIBackend Handle generated"); + ET_LOG(Debug, "CudaBackend Handle generated"); size_t n_inputs; AOTInductorModelContainerGetNumInputs(handle->container_handle, &n_inputs); @@ -185,7 +183,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { AOTInductorModelContainerGetNumOutputs( handle->container_handle, &n_outputs); - ET_LOG(Debug, "AOTIBackend n_outputs %zd generated", n_outputs); + ET_LOG(Debug, "CudaBackend n_outputs %zd 
generated", n_outputs); if (n_inputs + n_outputs != args.size()) { ET_LOG( @@ -211,7 +209,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { std::vector gpu_outputs( n_outputs); // GPU tensors for kernel output - ET_LOG(Debug, "AOTIBackend input/output vectors generated"); + ET_LOG(Debug, "CudaBackend input/output vectors generated"); // Process input tensors: ExecutorTorch provides CPU tensors, create GPU // copies @@ -255,7 +253,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "Successfully copied input %d from CPU to GPU", i); } - ET_LOG(Debug, "AOTIBackend GPU inputs generated"); + ET_LOG(Debug, "CudaBackend GPU inputs generated"); // Process output tensors: create GPU counterparts for ExecutorTorch CPU // tensors @@ -287,7 +285,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { ET_LOG(Debug, "Created GPU output tensor %d", i); } - ET_LOG(Debug, "AOTIBackend output generated"); + ET_LOG(Debug, "CudaBackend output generated"); // Run AOTI container with GPU tensors AOTIRuntimeError error = AOTInductorModelContainerRun( @@ -307,7 +305,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { return Error::Internal; } - ET_LOG(Debug, "AOTIBackend running done"); + ET_LOG(Debug, "CudaBackend running done"); // Copy GPU output results back to CPU output tensors for (int i = 0; i < n_outputs; i++) { @@ -332,7 +330,7 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { aoti_torch_delete_tensor_object(gpu_outputs[i]); } - ET_LOG(Debug, "AOTIBackend execution completed successfully"); + ET_LOG(Debug, "CudaBackend execution completed successfully"); return Error::Ok; } @@ -360,16 +358,15 @@ class AOTIBackend final : public ::executorch::runtime::BackendInterface { free(handle); cleanup_memory(); cleanup_tensor_metadata(); - cleanup_aoti_tensor_output(); - ET_LOG(Debug, "AOTIBackend handle %p destroy", handle_); + ET_LOG(Debug, "CudaBackend handle %p destroy", handle_); } }; } // namespace aoti namespace { -auto cls = aoti::AOTIBackend(); -executorch::runtime::Backend backend{"AotiBackend", &cls}; +auto cls = aoti::CudaBackend(); +executorch::runtime::Backend backend{"CudaBackend", &cls}; static executorch::runtime::Error success_with_compiler = register_backend(backend); } // namespace diff --git a/backends/aoti/runtime/shims/memory.cpp b/backends/aoti/cuda/runtime/shims/memory.cpp similarity index 98% rename from backends/aoti/runtime/shims/memory.cpp rename to backends/aoti/cuda/runtime/shims/memory.cpp index 99990f4aeab..7ca83973b8c 100644 --- a/backends/aoti/runtime/shims/memory.cpp +++ b/backends/aoti/cuda/runtime/shims/memory.cpp @@ -6,7 +6,10 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include "memory.h" +#include +#include +#include +#include #include #include #include @@ -18,8 +21,6 @@ #include #include #include -#include "tensor_attribute.h" -#include "utils.h" namespace executorch { namespace backends { diff --git a/backends/aoti/runtime/shims/memory.h b/backends/aoti/cuda/runtime/shims/memory.h similarity index 97% rename from backends/aoti/runtime/shims/memory.h rename to backends/aoti/cuda/runtime/shims/memory.h index 8e8e2910b03..41c03a1f552 100644 --- a/backends/aoti/runtime/shims/memory.h +++ b/backends/aoti/cuda/runtime/shims/memory.h @@ -9,12 +9,12 @@ #pragma once #include +#include #include #include #include #include #include -#include "types.h" namespace executorch { namespace backends { diff --git a/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp b/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp new file mode 100644 index 00000000000..cb564f10129 --- /dev/null +++ b/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +namespace executorch { +namespace backends { +namespace aoti { + +extern "C" { + +// Device type functions for tensor attributes +AOTITorchError aoti_torch_get_device_type( + AOTITensorHandle tensor, + int32_t* ret_device_type) { + // All tensors in aoti-cuda delegate are on CUDA + *ret_device_type = aoti_torch_device_type_cuda(); + return Error::Ok; +} + +// Device type constants +__attribute__((__visibility__("default"))) int32_t +aoti_torch_device_type_cuda() { + // Let's say cuda is 1 for ET as well + return 1; +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/runtime/shims/types.h b/backends/aoti/cuda/runtime/shims/tensor_attribute.h similarity index 74% rename from backends/aoti/runtime/shims/types.h rename to backends/aoti/cuda/runtime/shims/tensor_attribute.h index 1bcae2058ca..d8866c19f24 100644 --- a/backends/aoti/runtime/shims/types.h +++ b/backends/aoti/cuda/runtime/shims/tensor_attribute.h @@ -8,7 +8,6 @@ #pragma once -#include #include #include #include @@ -24,13 +23,19 @@ using executorch::runtime::etensor::Tensor; extern "C" { // Common AOTI type aliases -// Note: AOTITensorHandle is aliased to Tensor* for ExecutorTorch compatibility using AOTITensorHandle = Tensor*; -using AOTIRuntimeError = Error; using AOTITorchError = Error; +// Device type functions for tensor attributes +AOTITorchError aoti_torch_get_device_type( + AOTITensorHandle tensor, + int32_t* ret_device_type); + +// Device type constants +int32_t aoti_torch_device_type_cuda(); + } // extern "C" } // namespace aoti } // namespace backends -} // namespace executorch +} // namespace executorch \ No newline at end of file diff --git a/backends/aoti/cuda/runtime/utils.cpp b/backends/aoti/cuda/runtime/utils.cpp new file mode 100644 index 00000000000..aee585f3a2e --- /dev/null +++ b/backends/aoti/cuda/runtime/utils.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "utils.h" +#include + +namespace executorch { +namespace backends { +namespace aoti { + +// Enum for supported data types in et-cuda backend +enum class SupportedDTypes : int32_t { + FLOAT32 = 6, // PyTorch's float32 dtype code + + // BOOL = 11, // PyTorch's bool dtype code + // UINT8 = 1, // PyTorch's uint8 dtype code + // INT8 = 2, // PyTorch's int8 dtype code + // INT16 = 3, // PyTorch's int16 dtype code + // INT32 = 4, // PyTorch's int32 dtype code + // INT64 = 5, // PyTorch's int64 dtype code + // FLOAT16 = 7, // PyTorch's float16 dtype code + // FLOAT64 = 8, // PyTorch's float64 dtype code + // BFLOAT16 = 15 // PyTorch's bfloat16 dtype code +}; + +extern "C" { + +// Helper function to check if a dtype is supported in ET CUDA backend +bool is_dtype_supported_in_et_cuda(int32_t dtype) { + switch (dtype) { + case static_cast(SupportedDTypes::FLOAT32): + return true; + // case static_cast(SupportedDTypes::BOOL): + // case static_cast(SupportedDTypes::UINT8): + // case static_cast(SupportedDTypes::INT8): + // case static_cast(SupportedDTypes::INT16): + // case static_cast(SupportedDTypes::INT32): + // case static_cast(SupportedDTypes::INT64): + // case static_cast(SupportedDTypes::FLOAT16): + // case static_cast(SupportedDTypes::FLOAT64): + // case static_cast(SupportedDTypes::BFLOAT16): + // return true; + default: + return false; + } +} + +// Dtype validation utility function +AOTITorchError validate_dtype(int32_t dtype) { + if (is_dtype_supported_in_et_cuda(dtype)) { + return Error::Ok; + } + + ET_LOG( + Error, + "Unsupported dtype: %d. Supported dtypes: %d (float32)", + dtype, + static_cast(SupportedDTypes::FLOAT32)); + return Error::InvalidArgument; +} + +} // extern "C" + +} // namespace aoti +} // namespace backends +} // namespace executorch diff --git a/backends/aoti/cuda/runtime/utils.h b/backends/aoti/cuda/runtime/utils.h new file mode 100644 index 00000000000..c941917577c --- /dev/null +++ b/backends/aoti/cuda/runtime/utils.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */
+
+#pragma once
+
+#include
+#include
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// Common using declarations for ExecuTorch types
+using executorch::runtime::Error;
+
+extern "C" {
+
+// Common AOTI type aliases
+using AOTITorchError = Error;
+
+// Helper function to check if a dtype is supported in ET CUDA backend
+bool is_dtype_supported_in_et_cuda(int32_t dtype);
+
+// Dtype validation utility function
+AOTITorchError validate_dtype(int32_t dtype);
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
diff --git a/backends/aoti/cuda/targets.bzl b/backends/aoti/cuda/targets.bzl
new file mode 100644
index 00000000000..be692cbb5a2
--- /dev/null
+++ b/backends/aoti/cuda/targets.bzl
@@ -0,0 +1,28 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    # CUDA-specific AOTI functionality
+    runtime.cxx_library(
+        name = "aoti_cuda",
+        srcs = [
+            "runtime/cuda_backend.cpp",
+            "runtime/shims/memory.cpp",
+            "runtime/shims/tensor_attribute.cpp",
+            "runtime/utils.cpp",
+        ],
+        headers = [
+            "runtime/shims/memory.h",
+            "runtime/shims/tensor_attribute.h",
+            "runtime/utils.h",
+        ],
+        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
+        link_whole = True,
+        supports_python_dlopen = True,
+        # Constructor needed for backend registration.
+        compiler_flags = ["-Wno-global-constructors"],
+        visibility = ["@EXECUTORCH_CLIENTS"],
+        deps = [
+            "//executorch/backends/aoti:aoti_common",
+            "//caffe2/torch/csrc/inductor:aoti_torch_cuda",
+        ],
+    )
diff --git a/backends/aoti/runtime/shims/utils.cpp b/backends/aoti/runtime/shims/utils.cpp
deleted file mode 100644
index 8b1734082bd..00000000000
--- a/backends/aoti/runtime/shims/utils.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include "utils.h"
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace executorch {
-namespace backends {
-namespace aoti {
-
-namespace internal {
-// Constants for file operations
-const char* const TENSOR_OUTPUT_FILENAME =
-    "/home/gasoonjia/executorch/aoti_intermediate_output.txt";
-} // namespace internal
-
-extern "C" {
-
-// Function to cleanup the tensor output file (to be called from
-// aoti_backend.cpp)
-void cleanup_aoti_tensor_output() {
-  // No cleanup needed since file is opened and closed on each call
-}
-
-// Helper function to check if a dtype is supported
-bool is_dtype_supported_in_et_cuda(int32_t dtype) {
-  switch (dtype) {
-    case static_cast<int32_t>(SupportedDTypes::FLOAT32):
-      return true;
-    // case static_cast<int32_t>(SupportedDTypes::BOOL):
-    // case static_cast<int32_t>(SupportedDTypes::UINT8):
-    // case static_cast<int32_t>(SupportedDTypes::INT8):
-    // case static_cast<int32_t>(SupportedDTypes::INT16):
-    // case static_cast<int32_t>(SupportedDTypes::INT32):
-    // case static_cast<int32_t>(SupportedDTypes::INT64):
-    // case static_cast<int32_t>(SupportedDTypes::FLOAT16):
-    // case static_cast<int32_t>(SupportedDTypes::FLOAT64):
-    // case static_cast<int32_t>(SupportedDTypes::BFLOAT16):
-    //   return true;
-    default:
-      return false;
-  }
-}
-
-// Map int32_t dtype to number of bytes per element (reusing ExecutorTorch's
-// elementSize function)
-size_t dtype_to_element_size(int32_t dtype) {
-  // First convert int32_t dtype to ExecutorTorch ScalarType, then use existing
-  // elementSize function
-  executorch::aten::ScalarType scalar_type = dtype_to_scalar_type(dtype);
-  if (scalar_type == executorch::aten::ScalarType::Undefined) {
-    ET_LOG(Error, "Unsupported dtype: %d for element size calculation", dtype);
-    return 0; // Return 0 to indicate error
-  }
-
-  // Reuse ExecutorTorch's existing elementSize function from scalar_type_util.h
-  return executorch::runtime::elementSize(scalar_type);
-}
-
-// Map int32_t dtype to ExecutorTorch ScalarType (robust version of hardcoded
-// ScalarType::Float)
-executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
-  // First check if the dtype is supported
-  if (!is_dtype_supported_in_et_cuda(dtype)) {
-    ET_LOG(Error, "Unsupported dtype: %d for ScalarType conversion", dtype);
-    return executorch::aten::ScalarType::Undefined;
-  }
-
-  // If supported, use switch to convert
-  switch (dtype) {
-    case static_cast<int32_t>(SupportedDTypes::FLOAT32):
-      return executorch::aten::ScalarType::Float;
-    default:
-      ET_LOG(
-          Error, "Unexpected error in dtype conversion for dtype: %d", dtype);
-      return executorch::aten::ScalarType::Undefined;
-  }
-}
-
-// Dtype validation utility function
-AOTITorchError validate_dtype(int32_t dtype) {
-  if (is_dtype_supported_in_et_cuda(dtype)) {
-    return Error::Ok;
-  }
-
-  ET_LOG(
-      Error,
-      "Unsupported dtype: %d. Supported dtypes: %d (float32)",
-      dtype,
-      static_cast<int32_t>(SupportedDTypes::FLOAT32));
-  return Error::InvalidArgument;
-}
-
-// Storage offset validation utility function
-AOTITorchError validate_storage_offset(int64_t storage_offset) {
-  // Storage offset must always be 0
-  if (storage_offset != 0) {
-    ET_LOG(
-        Error,
-        "Storage offset must be 0. Got storage_offset: %ld",
-        storage_offset);
-    return Error::InvalidArgument;
-  }
-  return Error::Ok;
-}
-
-} // extern "C"
-
-} // namespace aoti
-} // namespace backends
-} // namespace executorch
diff --git a/backends/aoti/runtime/shims/utils.h b/backends/aoti/runtime/shims/utils.h
deleted file mode 100644
index a2af9e95e56..00000000000
--- a/backends/aoti/runtime/shims/utils.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include
-#include
-#include "types.h"
-
-namespace executorch {
-namespace backends {
-namespace aoti {
-
-// Enum for supported data types in et-cuda backend
-enum class SupportedDTypes : int32_t {
-  FLOAT32 = 6, // PyTorch's float32 dtype code
-
-  // BOOL = 11, // PyTorch's bool dtype code
-  // UINT8 = 1, // PyTorch's uint8 dtype code
-  // INT8 = 2, // PyTorch's int8 dtype code
-  // INT16 = 3, // PyTorch's int16 dtype code
-  // INT32 = 4, // PyTorch's int32 dtype code
-  // INT64 = 5, // PyTorch's int64 dtype code
-  // FLOAT16 = 7, // PyTorch's float16 dtype code
-  // FLOAT64 = 8, // PyTorch's float64 dtype code
-  // BFLOAT16 = 15 // PyTorch's bfloat16 dtype code
-};
-
-extern "C" {
-
-// Helper function to check if a dtype is supported
-bool is_dtype_supported_in_et_cuda(int32_t dtype);
-
-// Map int32_t dtype to number of bytes per element (reusing ExecutorTorch's
-// elementSize function)
-size_t dtype_to_element_size(int32_t dtype);
-
-// Map int32_t dtype to ExecutorTorch ScalarType (robust version of hardcoded
-// ScalarType::Float)
-executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype);
-
-// Cleanup function for tensor output file (called during backend destruction)
-void cleanup_aoti_tensor_output();
-
-// Dtype validation utility function
-AOTITorchError validate_dtype(int32_t dtype);
-
-// Storage offset validation utility function
-AOTITorchError validate_storage_offset(int64_t storage_offset);
-
-} // extern "C"
-
-} // namespace aoti
-} // namespace backends
-} // namespace executorch
diff --git a/backends/aoti/runtime/targets.bzl b/backends/aoti/targets.bzl
similarity index 68%
rename from backends/aoti/runtime/targets.bzl
rename to backends/aoti/targets.bzl
index d57a187366f..bd46550d81e 100644
--- a/backends/aoti/runtime/targets.bzl
+++ b/backends/aoti/targets.bzl
@@ -1,21 +1,18 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 def define_common_targets():
+    # Common AOTI functionality (non-CUDA)
     runtime.cxx_library(
-        name = "aoti_backend",
+        name = "aoti_common",
         srcs = [
-            "aoti_backend.cpp",
             "aoti_model_container.cpp",
-            "shims/memory.cpp",
-            "shims/tensor_attribute.cpp",
-            "shims/utils.cpp",
+            "common_shims.cpp",
+            "utils.cpp",
         ],
        headers = [
            "aoti_model_container.h",
-            "shims/memory.h",
-            "shims/tensor_attribute.h",
-            "shims/types.h",
-            "shims/utils.h",
+            "common_shims.h",
+            "utils.h",
        ],
        # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
        link_whole = True,
@@ -27,6 +24,5 @@
             "//executorch/runtime/backend:interface",
             "//executorch/runtime/core:core",
             "//caffe2/torch/csrc/inductor:aoti_torch",
-            "//caffe2/torch/csrc/inductor:aoti_torch_cuda",
         ],
     )
diff --git a/backends/aoti/utils.cpp b/backends/aoti/utils.cpp
new file mode 100644
index 00000000000..95b4f0c4b4f
--- /dev/null
+++ b/backends/aoti/utils.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "utils.h"
+#include
+#include
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+extern "C" {
+
+// Map int32_t dtype to number of bytes per element (reusing ExecuTorch's
+// elementSize function)
+size_t dtype_to_element_size(int32_t dtype) {
+  // First convert the int32_t dtype to an ExecuTorch ScalarType, then use the
+  // existing elementSize function
+  executorch::aten::ScalarType scalar_type = dtype_to_scalar_type(dtype);
+  if (scalar_type == executorch::aten::ScalarType::Undefined) {
+    ET_LOG(Error, "Unsupported dtype: %d for element size calculation", dtype);
+    return 0; // Return 0 to indicate error
+  }
+
+  // Reuse ExecuTorch's existing elementSize function from scalar_type_util.h
+  return executorch::runtime::elementSize(scalar_type);
+}
+
+// Map int32_t dtype to ExecuTorch ScalarType (robust version of hardcoded
+// ScalarType::Float)
+executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
+  // Convert based on known PyTorch dtype codes (without CUDA-specific dependency)
+  switch (dtype) {
+    case 6: // PyTorch's float32 dtype code
+      return executorch::aten::ScalarType::Float;
+    // Future support for additional dtypes can be added here
+    // case 11: // PyTorch's bool dtype code
+    //   return executorch::aten::ScalarType::Bool;
+    // case 1: // PyTorch's uint8 dtype code
+    //   return executorch::aten::ScalarType::Byte;
+    // case 2: // PyTorch's int8 dtype code
+    //   return executorch::aten::ScalarType::Char;
+    // case 3: // PyTorch's int16 dtype code
+    //   return executorch::aten::ScalarType::Short;
+    // case 4: // PyTorch's int32 dtype code
+    //   return executorch::aten::ScalarType::Int;
+    // case 5: // PyTorch's int64 dtype code
+    //   return executorch::aten::ScalarType::Long;
+    // case 7: // PyTorch's float16 dtype code
+    //   return executorch::aten::ScalarType::Half;
+    // case 8: // PyTorch's float64 dtype code
+    //   return executorch::aten::ScalarType::Double;
+    // case 15: // PyTorch's bfloat16 dtype code
+    //   return executorch::aten::ScalarType::BFloat16;
+    default:
+      ET_LOG(Error, "Unsupported dtype: %d for ScalarType conversion", dtype);
+      return executorch::aten::ScalarType::Undefined;
+  }
+}
+
+// Storage offset validation utility function
+AOTITorchError validate_storage_offset(int64_t storage_offset) {
+  // Storage offset must always be 0
+  if (storage_offset != 0) {
+    ET_LOG(
+        Error,
+        "Storage offset must be 0. Got storage_offset: %ld",
+        storage_offset);
+    return Error::InvalidArgument;
+  }
+  return Error::Ok;
+}
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
\ No newline at end of file
diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
new file mode 100644
index 00000000000..3fb710a24d8
--- /dev/null
+++ b/backends/aoti/utils.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace executorch {
+namespace backends {
+namespace aoti {
+
+// Common using declarations for ExecuTorch types
+using executorch::runtime::Error;
+
+extern "C" {
+
+// Common AOTI type aliases
+using AOTITorchError = Error;
+
+// Map int32_t dtype to number of bytes per element (reusing ExecuTorch's
+// elementSize function)
+size_t dtype_to_element_size(int32_t dtype);
+
+// Map int32_t dtype to ExecuTorch ScalarType (robust version of hardcoded
+// ScalarType::Float)
+executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype);
+
+// Storage offset validation utility function
+AOTITorchError validate_storage_offset(int64_t storage_offset);
+
+} // extern "C"
+
+} // namespace aoti
+} // namespace backends
+} // namespace executorch
\ No newline at end of file
diff --git a/exir/emit/_emit_program.py b/exir/emit/_emit_program.py
index 3430ad7a920..eb84d508c2c 100644
--- a/exir/emit/_emit_program.py
+++ b/exir/emit/_emit_program.py
@@ -156,7 +156,6 @@ def emit_program(
     instruction_id_to_num_outs_map = {}
     program_state = _ProgramState()
-
     # emit each entry point in order according to name.
     for name, exported_program in sorted(methods.items()):
         # create empty state
diff --git a/export_and_run_aoti.sh b/export_and_run_aoti.sh
index a971df35b13..dd4aeef1017 100644
--- a/export_and_run_aoti.sh
+++ b/export_and_run_aoti.sh
@@ -136,7 +136,7 @@ build_runtime() {
 
     if [[ "$DEBUG_MODE" == true ]]; then
         echo "Building with debug configuration..."
-        cmake -DEXECUTORCH_BUILD_AOTI=ON \
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
             -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
             -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
             -DEXECUTORCH_LOG_LEVEL=Debug \
@@ -146,7 +146,7 @@ build_runtime() {
             ..
     else
         echo "Building with release configuration..."
-        cmake -DEXECUTORCH_BUILD_AOTI=ON \
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
             -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
             -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
             -DEXECUTORCH_LOG_LEVEL=Info \
diff --git a/export_aoti.py b/export_aoti.py
index e644177568c..0fda74a04f7 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -23,7 +23,7 @@ from typing import Any, Dict, Tuple
 
 import torch
-from executorch.backends.aoti.aoti_partitioner import AotiPartitioner
+from executorch.backends.aoti.cuda.cuda_partitioner import CudaPartitioner
 
 # from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
 from executorch.exir import to_edge, to_edge_transform_and_lower
@@ -402,7 +402,7 @@ def export_model_to_et_aoti(
 
     # Q: maybe need to turn on fallback_random?
     edge_program = to_edge_transform_and_lower(
-        aten_dialect, partitioner=[AotiPartitioner([])]
+        aten_dialect, partitioner=[CudaPartitioner([])]
     )
 
     # edge_program = to_edge(aten_dialect)
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 6911aea3e9b..fb993f7d5f0 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -161,10 +161,9 @@ define_overridable_option(
 )
 
 define_overridable_option(
-  EXECUTORCH_BUILD_AOTI "Build the AOTI backend" BOOL OFF
+  EXECUTORCH_BUILD_CUDA "Build the AOTI CUDA backend" BOOL OFF
 )
 
-
 if(EXECUTORCH_BUILD_ARM_BAREMETAL)
   set(_default_executorch_build_pthreadpool OFF)
   set(_default_executorch_build_cpuinfo OFF)
@@ -323,7 +322,7 @@ check_required_options_on(
 )
 
 check_required_options_on(
-  IF_ON EXECUTORCH_BUILD_AOTI REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR
+  IF_ON EXECUTORCH_BUILD_CUDA REQUIRES EXECUTORCH_BUILD_EXTENSION_TENSOR
 )
 
 check_conflicting_options_on(

From 2fe871ce5e1810f5ad36062170bb083cd808e656 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Sep 2025 10:51:44 -0700
Subject: [PATCH 48/50] code refactor to backend/cuda and backend/aoti

---
 CMakeLists.txt | 2 +-
 backends/aoti/aoti_model_container.h | 2 +-
 backends/aoti/common_shims.cpp | 1 -
 backends/aoti/utils.cpp | 3 ++-
 backends/aoti/utils.h | 2 +-
 backends/{aoti => }/cuda/CMakeLists.txt | 2 +-
 backends/{aoti => }/cuda/TARGETS | 0
 backends/{aoti => }/cuda/__init__.py | 0
 backends/{aoti => }/cuda/cuda_backend.py | 0
 backends/{aoti => }/cuda/cuda_partitioner.py | 0
 backends/{aoti => }/cuda/runtime/cuda_backend.cpp | 4 ++--
 backends/{aoti => }/cuda/runtime/shims/memory.cpp | 6 +++---
 backends/{aoti => }/cuda/runtime/shims/memory.h | 0
 backends/{aoti => }/cuda/runtime/shims/tensor_attribute.cpp | 2 +-
 backends/{aoti => }/cuda/runtime/shims/tensor_attribute.h | 0
 backends/{aoti => }/cuda/runtime/utils.cpp | 0
 backends/{aoti => }/cuda/runtime/utils.h | 0
 backends/{aoti => }/cuda/targets.bzl | 0
 18 files changed, 12 insertions(+), 12 deletions(-)
 rename backends/{aoti => }/cuda/CMakeLists.txt (96%)
 rename backends/{aoti => }/cuda/TARGETS (100%)
 rename backends/{aoti => }/cuda/__init__.py (100%)
 rename backends/{aoti => }/cuda/cuda_backend.py (100%)
 rename backends/{aoti => }/cuda/cuda_partitioner.py (100%)
 rename backends/{aoti => }/cuda/runtime/cuda_backend.cpp (99%)
 rename backends/{aoti => }/cuda/runtime/shims/memory.cpp (99%)
 rename backends/{aoti => }/cuda/runtime/shims/memory.h (100%)
 rename backends/{aoti => }/cuda/runtime/shims/tensor_attribute.cpp (91%)
 rename backends/{aoti => }/cuda/runtime/shims/tensor_attribute.h (100%)
 rename backends/{aoti => }/cuda/runtime/utils.cpp (100%)
 rename backends/{aoti => }/cuda/runtime/utils.h (100%)
 rename backends/{aoti => }/cuda/targets.bzl (100%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 21ec1ba8e7e..586f1b1128f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -595,7 +595,7 @@ if(EXECUTORCH_BUILD_CUDA)
   # Build common AOTI functionality (required for CUDA)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti)
   # Build CUDA-specific AOTI functionality
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/aoti/cuda)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cuda)
   # Add aoti_cuda to backends - it already depends on aoti_common
   list(APPEND _executorch_backends aoti_cuda)
 endif()
diff --git a/backends/aoti/aoti_model_container.h b/backends/aoti/aoti_model_container.h
index d5cae26cd05..e8bc253d9c0 100644
--- a/backends/aoti/aoti_model_container.h
+++ b/backends/aoti/aoti_model_container.h
@@ -8,9 +8,9 @@
 
 #pragma once
 
+#include
 #include
 #include
-#include "cuda/runtime/shims/memory.h"
 
 namespace executorch {
 namespace backends {
diff --git a/backends/aoti/common_shims.cpp b/backends/aoti/common_shims.cpp
index fbc596ce8b0..97a0478ba52 100644
--- a/backends/aoti/common_shims.cpp
+++ b/backends/aoti/common_shims.cpp
@@ -106,7 +106,6 @@ AOTITorchError aoti_torch_get_storage_size(
   throw std::runtime_error("Cannot get storage size on ETensor");
 }
 
-
 AOTITorchError aoti_torch_get_device_index(
     AOTITensorHandle tensor,
     int32_t* ret_device_index) {
diff --git a/backends/aoti/utils.cpp b/backends/aoti/utils.cpp
index 95b4f0c4b4f..68c28eed265 100644
--- a/backends/aoti/utils.cpp
+++ b/backends/aoti/utils.cpp
@@ -34,7 +34,8 @@ size_t dtype_to_element_size(int32_t dtype) {
 // Map int32_t dtype to ExecuTorch ScalarType (robust version of hardcoded
 // ScalarType::Float)
 executorch::aten::ScalarType dtype_to_scalar_type(int32_t dtype) {
-  // Convert based on known PyTorch dtype codes (without CUDA-specific dependency)
+  // Convert based on known PyTorch dtype codes (without CUDA-specific
+  // dependency)
   switch (dtype) {
     case 6: // PyTorch's float32 dtype code
       return executorch::aten::ScalarType::Float;
diff --git a/backends/aoti/utils.h b/backends/aoti/utils.h
index 3fb710a24d8..828f15ee1a4 100644
--- a/backends/aoti/utils.h
+++ b/backends/aoti/utils.h
@@ -10,8 +10,8 @@
 
 #include
 #include
-#include
 #include
+#include
 
 namespace executorch {
 namespace backends {
diff --git a/backends/aoti/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
similarity index 96%
rename from backends/aoti/cuda/CMakeLists.txt
rename to backends/cuda/CMakeLists.txt
index 971d92bd044..ef6a4ddb8bd 100644
--- a/backends/aoti/cuda/CMakeLists.txt
+++ b/backends/cuda/CMakeLists.txt
@@ -19,7 +19,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 # Source root directory for executorch.
 if(NOT EXECUTORCH_ROOT)
-  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 endif()
 
 find_package(CUDAToolkit REQUIRED)
diff --git a/backends/aoti/cuda/TARGETS b/backends/cuda/TARGETS
similarity index 100%
rename from backends/aoti/cuda/TARGETS
rename to backends/cuda/TARGETS
diff --git a/backends/aoti/cuda/__init__.py b/backends/cuda/__init__.py
similarity index 100%
rename from backends/aoti/cuda/__init__.py
rename to backends/cuda/__init__.py
diff --git a/backends/aoti/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
similarity index 100%
rename from backends/aoti/cuda/cuda_backend.py
rename to backends/cuda/cuda_backend.py
diff --git a/backends/aoti/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
similarity index 100%
rename from backends/aoti/cuda/cuda_partitioner.py
rename to backends/cuda/cuda_partitioner.py
diff --git a/backends/aoti/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
similarity index 99%
rename from backends/aoti/cuda/runtime/cuda_backend.cpp
rename to backends/cuda/runtime/cuda_backend.cpp
index b6d9bb7d75d..6cd20537e80 100644
--- a/backends/aoti/cuda/runtime/cuda_backend.cpp
+++ b/backends/cuda/runtime/cuda_backend.cpp
@@ -25,8 +25,8 @@
 #include
 
 // Include our shim layer headers
-#include "../../aoti_model_container.h"
-#include "../../common_shims.h"
+#include
+#include
 
 namespace executorch {
 namespace backends {
diff --git a/backends/aoti/cuda/runtime/shims/memory.cpp b/backends/cuda/runtime/shims/memory.cpp
similarity index 99%
rename from backends/aoti/cuda/runtime/shims/memory.cpp
rename to backends/cuda/runtime/shims/memory.cpp
index 7ca83973b8c..4518b359646 100644
--- a/backends/aoti/cuda/runtime/shims/memory.cpp
+++ b/backends/cuda/runtime/shims/memory.cpp
@@ -6,9 +6,9 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include
-#include
-#include
+#include
+#include
+#include
 #include
 #include
 #include
diff --git a/backends/aoti/cuda/runtime/shims/memory.h b/backends/cuda/runtime/shims/memory.h
similarity index 100%
rename from backends/aoti/cuda/runtime/shims/memory.h
rename to backends/cuda/runtime/shims/memory.h
diff --git a/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp b/backends/cuda/runtime/shims/tensor_attribute.cpp
similarity index 91%
rename from backends/aoti/cuda/runtime/shims/tensor_attribute.cpp
rename to backends/cuda/runtime/shims/tensor_attribute.cpp
index cb564f10129..789c16d7555 100644
--- a/backends/aoti/cuda/runtime/shims/tensor_attribute.cpp
+++ b/backends/cuda/runtime/shims/tensor_attribute.cpp
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include
+#include
 
 namespace executorch {
 namespace backends {
diff --git a/backends/aoti/cuda/runtime/shims/tensor_attribute.h b/backends/cuda/runtime/shims/tensor_attribute.h
similarity index 100%
rename from backends/aoti/cuda/runtime/shims/tensor_attribute.h
rename to backends/cuda/runtime/shims/tensor_attribute.h
diff --git a/backends/aoti/cuda/runtime/utils.cpp b/backends/cuda/runtime/utils.cpp
similarity index 100%
rename from backends/aoti/cuda/runtime/utils.cpp
rename to backends/cuda/runtime/utils.cpp
diff --git a/backends/aoti/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h
similarity index 100%
rename from backends/aoti/cuda/runtime/utils.h
rename to backends/cuda/runtime/utils.h
diff --git a/backends/aoti/cuda/targets.bzl b/backends/cuda/targets.bzl
similarity index 100%
rename from backends/aoti/cuda/targets.bzl
rename to backends/cuda/targets.bzl

From 7542caec63bf7ada9d21933e611069ca45de6323 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Sep 2025 12:15:01 -0700
Subject: [PATCH 49/50] solve cuda backend dependency issue

---
 backends/cuda/cuda_partitioner.py | 2 +-
 export_aoti.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/backends/cuda/cuda_partitioner.py b/backends/cuda/cuda_partitioner.py
index f48759afa80..227d13ba093 100644
--- a/backends/cuda/cuda_partitioner.py
+++ b/backends/cuda/cuda_partitioner.py
@@ -9,7 +9,7 @@ from typing import Callable, Dict, final, List, Optional, Tuple
 
 import torch
-from executorch.backends.aoti.cuda.cuda_backend import CudaBackend  # usort: skip
+from executorch.backends.cuda.cuda_backend import CudaBackend  # usort: skip
 
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import (
     DelegationSpec,
diff --git a/export_aoti.py b/export_aoti.py
index 0fda74a04f7..d0bf916f387 100644
--- a/export_aoti.py
+++ b/export_aoti.py
@@ -23,7 +23,7 @@ from typing import Any, Dict, Tuple
 
 import torch
-from executorch.backends.aoti.cuda.cuda_partitioner import CudaPartitioner
+from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
 
 # from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
 from executorch.exir import to_edge, to_edge_transform_and_lower

From f93d194d52dc2ae443e1f3a586304c0e19fc4d31 Mon Sep 17 00:00:00 2001
From: gasoonjia
Date: Thu, 18 Sep 2025 15:51:21 -0700
Subject: [PATCH 50/50] add cuda export ci

---
 .ci/scripts/test-cuda-export-aoti.sh | 105 +++++++++++
 .ci/scripts/test_cuda_export_aoti.py | 228 ++++++++++++++++++++++++
 .github/workflows/test-backend-cuda.yml | 68 +++++++
 3 files changed, 401 insertions(+)
 create mode 100755 .ci/scripts/test-cuda-export-aoti.sh
 create mode 100755 .ci/scripts/test_cuda_export_aoti.py
 create mode 100644 .github/workflows/test-backend-cuda.yml

diff --git a/.ci/scripts/test-cuda-export-aoti.sh b/.ci/scripts/test-cuda-export-aoti.sh
new file mode 100755
index 00000000000..6ea701b8f4b
--- /dev/null
+++ b/.ci/scripts/test-cuda-export-aoti.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -exu
+
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+CUDA_VERSION=${1:-"12.6"}
+
+echo "=== Testing ExecuTorch CUDA AOTI Export ${CUDA_VERSION} ==="
+
+# Function to test CUDA AOTI export functionality
+test_cuda_aoti_export() {
+    local cuda_version=$1
+
+    echo "Testing CUDA AOTI export with CUDA ${cuda_version} support..."
+
+    # Check available resources before starting
+    echo "=== System Information ==="
+    echo "Available memory: $(free -h | grep Mem | awk '{print $2}')"
+    echo "Available disk space: $(df -h . | tail -1 | awk '{print $4}')"
+    echo "CPU cores: $(nproc)"
+    echo "CUDA version check:"
+    nvcc --version || echo "nvcc not found"
+    nvidia-smi || echo "nvidia-smi not found"
+
+    # Set up environment for CUDA builds
+    export CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
+
+    echo "=== Installing ExecuTorch with CUDA support ==="
+    # Install ExecuTorch with CUDA support, with timeout and error handling
+    timeout 5400 ./install_executorch.sh || {
+        local exit_code=$?
+        echo "ERROR: install_executorch.sh failed with exit code: $exit_code"
+        if [ $exit_code -eq 124 ]; then
+            echo "ERROR: Installation timed out after 90 minutes"
+        fi
+        exit $exit_code
+    }
+
+    echo "SUCCESS: ExecuTorch CUDA installation completed"
+
+    # Verify the installation
+    echo "=== Verifying ExecuTorch CUDA Installation ==="
+
+    # Test that ExecuTorch was built successfully
+    python -c "
+import executorch
+print('SUCCESS: ExecuTorch imported successfully')
+"
+
+    # Test CUDA availability and show details
+    python -c "
+try:
+    import torch
+    print('INFO: PyTorch version:', torch.__version__)
+    print('INFO: CUDA available:', torch.cuda.is_available())
+
+    if torch.cuda.is_available():
+        print('SUCCESS: CUDA is available for ExecuTorch')
+        print('INFO: CUDA version:', torch.version.cuda)
+        print('INFO: GPU device count:', torch.cuda.device_count())
+        print('INFO: Current GPU device:', torch.cuda.current_device())
+        print('INFO: GPU device name:', torch.cuda.get_device_name())
+
+        # Test basic CUDA tensor operation
+        device = torch.device('cuda')
+        x = torch.randn(10, 10).to(device)
+        y = torch.randn(10, 10).to(device)
+        z = torch.mm(x, y)
+        print('SUCCESS: CUDA tensor operation completed on device:', z.device)
+        print('INFO: Result tensor shape:', z.shape)
+
+        print('SUCCESS: ExecuTorch CUDA integration verified')
+    else:
+        print('WARNING: CUDA not detected, but ExecuTorch built successfully')
+        exit(1)
+except Exception as e:
+    print('ERROR: ExecuTorch CUDA test failed:', e)
+    exit(1)
+"
+
+    echo "=== Running CUDA AOTI Export Tests ==="
+    # Run the CUDA AOTI export tests using the Python script
+    python .ci/scripts/test_cuda_export_aoti.py \
+        --models linear conv2d add resnet18 \
+        --export-mode export_aoti_only \
+        --timeout 600 \
+        --cleanup
+
+    echo "SUCCESS: ExecuTorch CUDA AOTI export ${cuda_version} tests completed successfully"
+}
+
+# Main execution
+echo "Current working directory: $(pwd)"
+echo "Directory contents:"
+ls -la
+
+# Run the CUDA AOTI export test
+test_cuda_aoti_export "${CUDA_VERSION}"
diff --git a/.ci/scripts/test_cuda_export_aoti.py b/.ci/scripts/test_cuda_export_aoti.py
new file mode 100755
index 00000000000..3748dc5fe33
--- /dev/null
+++ b/.ci/scripts/test_cuda_export_aoti.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Test script for CUDA AOTI export functionality.
+This script tests basic CUDA export functionality for a subset of models:
+linear, conv2d, add, and resnet18.
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+from typing import List, Optional
+
+
+def run_command(
+    cmd: List[str], cwd: Optional[str] = None, timeout: int = 300
+) -> subprocess.CompletedProcess:
+    """Run a command with proper error handling and timeout."""
+    print(f"Running command: {' '.join(cmd)}")
+    if cwd:
+        print(f"Working directory: {cwd}")
+
+    try:
+        result = subprocess.run(
+            cmd,
+            cwd=cwd,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+            check=False,  # We'll handle the return code ourselves
+        )
+
+        if result.stdout:
+            print("STDOUT:")
+            print(result.stdout)
+        if result.stderr:
+            print("STDERR:")
+            print(result.stderr)
+
+        return result
+    except subprocess.TimeoutExpired as e:
+        print(f"ERROR: Command timed out after {timeout} seconds")
+        raise e
+    except Exception as e:
+        print(f"ERROR: Failed to run command: {e}")
+        raise e
+
+
+def test_cuda_export(
+    model_name: str, export_mode: str = "export_aoti_only", timeout: int = 300
+) -> bool:
+    """Test CUDA export for a specific model."""
+    print(f"\n{'='*60}")
+    print(f"Testing CUDA export for model: {model_name}")
+    print(f"Export mode: {export_mode}")
+    print(f"{'='*60}")
+
+    try:
+        # Run the export using export_aoti.py
+        cmd = ["python", "export_aoti.py", model_name]
+        if export_mode == "export_aoti_only":
+            cmd.append("--aoti_only")
+
+        result = run_command(cmd, timeout=timeout)
+
+        if result.returncode == 0:
+            print(f"SUCCESS: {model_name} export completed successfully")
+            return True
+        else:
+            print(
+                f"ERROR: {model_name} export failed with return code {result.returncode}"
+            )
+            return False
+
+    except subprocess.TimeoutExpired:
+        print(f"ERROR: {model_name} export timed out after {timeout} seconds")
+        return False
+    except Exception as e:
+        print(f"ERROR: {model_name} export failed with exception: {e}")
+        return False
+
+
+def cleanup_temp_files():
+    """Clean up temporary files generated during export."""
+    print("Cleaning up temporary files...")
+
+    # List of file patterns to clean up
+    cleanup_patterns = [
+        "*.cubin",
+        "*.pte",
+        "*.so",
+        "*kernel_metadata.json",
+        "*kernel.cpp",
+        "*wrapper_metadata.json",
+        "*wrapper.cpp",
+        "*wrapper.json",
+        "aoti_intermediate_output.txt",
+    ]
+
+    # Remove files matching patterns
+    for pattern in cleanup_patterns:
+        try:
+            import glob
+
+            files = glob.glob(pattern)
+            for file in files:
+                if os.path.isfile(file):
+                    os.remove(file)
+                    print(f"Removed file: {file}")
+        except Exception as e:
+            print(f"Warning: Failed to remove {pattern}: {e}")
+
+    # Remove temporary directories created by wrappers
+    try:
+        import glob
+
+        for wrapper_file in glob.glob("*wrapper.cpp"):
+            basename = wrapper_file.replace("wrapper.cpp", "")
+            if os.path.isdir(basename):
+                import shutil
+
+                shutil.rmtree(basename)
+                print(f"Removed directory: {basename}")
+    except Exception as e:
+        print(f"Warning: Failed to remove wrapper directories: {e}")
+
+    print("Cleanup completed.")
+
+
+def main():
+    """Main function to test CUDA export for specified models."""
+    parser = argparse.ArgumentParser(
+        description="Test CUDA AOTI export functionality",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        default=["linear", "conv2d", "add", "resnet18"],
+        help="List of models to test (default: linear, conv2d, add, resnet18)",
+    )
+
+    parser.add_argument(
+        "--export-mode",
+        choices=["export_aoti_only", "full"],
+        default="export_aoti_only",
+        help="Export mode: export_aoti_only (AOTI only) or full (full pipeline)",
+    )
+
+    parser.add_argument(
+        "--timeout",
+        type=int,
+        default=300,
+        help="Timeout for each model export in seconds (default: 300)",
+    )
+
+    parser.add_argument(
+        "--cleanup",
+        action="store_true",
+        default=True,
+        help="Clean up temporary files after testing (default: True)",
+    )
+
+    args = parser.parse_args()
+
+    print("CUDA AOTI Export Test")
+    print("=" * 60)
+    print(f"Models to test: {args.models}")
+    print(f"Export mode: {args.export_mode}")
+    print(f"Timeout per model: {args.timeout} seconds")
+    print(f"Cleanup enabled: {args.cleanup}")
+    print("=" * 60)
+
+    # Check if we're in the correct directory (should have export_aoti.py)
+    if not os.path.exists("export_aoti.py"):
+        print("ERROR: export_aoti.py not found in current directory")
+        print("Please run this script from the executorch root directory")
+        sys.exit(1)
+
+    # Test each model
+    successful_models = []
+    failed_models = []
+
+    for model in args.models:
+        # Clean up before each test
+        if args.cleanup:
+            cleanup_temp_files()
+
+        success = test_cuda_export(model, args.export_mode, args.timeout)
+
+        if success:
+            successful_models.append(model)
+        else:
+            failed_models.append(model)
+
+    # Final cleanup
+    if args.cleanup:
+        cleanup_temp_files()
+
+    # Print summary
+    print("\n" + "=" * 60)
+    print("CUDA AOTI Export Test Summary")
+    print("=" * 60)
+    print(f"Total models tested: {len(args.models)}")
+    print(f"Successful exports: {len(successful_models)}")
+    print(f"Failed exports: {len(failed_models)}")
+
+    if successful_models:
+        print(f"Successful models: {', '.join(successful_models)}")
+
+    if failed_models:
+        print(f"Failed models: {', '.join(failed_models)}")
+        print("\nERROR: One or more model exports failed!")
+        sys.exit(1)
+    else:
+        print("\nSUCCESS: All model exports completed successfully!")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/test-backend-cuda.yml b/.github/workflows/test-backend-cuda.yml
new file mode 100644
index 00000000000..bc8063b5b73
--- /dev/null
+++ b/.github/workflows/test-backend-cuda.yml
@@ -0,0 +1,68 @@
+# Test ExecuTorch CUDA AOTI export functionality.
+# This workflow tests whether ExecuTorch can successfully export models using CUDA AOTI
+# across different CUDA versions (12.6, 12.8, 12.9) for a subset of models:
+# linear, conv2d, add, and resnet18.
+#
+# The test focuses on export-only functionality and verifies that no errors are raised
+# during the AOTI export process.
+
+name: Test CUDA AOTI Export
+
+on:
+  pull_request:
+  push:
+    branches:
+      - main
+      - release/*
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: false
+
+jobs:
+  test-cuda-aoti-export:
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda-version: ["12.6", "12.8", "12.9"]
+
+    name: test-executorch-cuda-aoti-export-${{ matrix.cuda-version }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      timeout: 120
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda-version }}
+      submodules: recursive
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        if [ -n "$CONDA_ENV" ]; then
+          conda activate "${CONDA_ENV}"
+        fi
+
+        # Test ExecuTorch CUDA AOTI export - ExecuTorch will automatically detect the CUDA
+        # version and install the appropriate PyTorch wheel when CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test-cuda-export-aoti.sh "${{ matrix.cuda-version }}"
+
+  # This job will fail if any of the CUDA AOTI export tests fail
+  check-all-cuda-aoti-exports:
+    needs: test-cuda-aoti-export
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Check if all CUDA AOTI export tests succeeded
+        run: |
+          if [[ "${{ needs.test-cuda-aoti-export.result }}" != "success" ]]; then
+            echo "ERROR: One or more ExecuTorch CUDA AOTI export tests failed!"
+            echo "CUDA AOTI export test results: ${{ needs.test-cuda-aoti-export.result }}"
+            exit 1
+          else
+            echo "SUCCESS: All ExecuTorch CUDA AOTI export tests (12.6, 12.8, 12.9) completed successfully!"
+          fi
\ No newline at end of file
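
For reference, the end-to-end lowering flow that this series enables mirrors export_aoti.py above. A minimal sketch follows; the TinyLinear module, input shapes, and output filename are illustrative placeholders rather than part of this series, and it assumes an ExecuTorch build installed with CMAKE_ARGS="-DEXECUTORCH_BUILD_CUDA=ON" on a CUDA-capable machine:

import torch
from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
from executorch.exir import to_edge_transform_and_lower


class TinyLinear(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = torch.nn.Linear(16, 8)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc(x)


model = TinyLinear().eval()
example_inputs = (torch.randn(2, 16),)

# torch.export captures the ATen-dialect graph; the partitioner then tags
# CUDA-supported subgraphs for delegation to the CudaBackend added above.
aten_dialect = torch.export.export(model, example_inputs)
edge_program = to_edge_transform_and_lower(
    aten_dialect, partitioner=[CudaPartitioner([])]
)
executorch_program = edge_program.to_executorch()

# Serialize the lowered program to a .pte file for the ExecuTorch runtime.
with open("tiny_linear_cuda.pte", "wb") as f:
    f.write(executorch_program.buffer)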
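The CI step in test-backend-cuda.yml can also be reproduced locally. A hypothetical sketch, assuming the executorch repo root as the working directory and an already-built CUDA install (the model list and timeout here are arbitrary; CMAKE_ARGS only matters if the script re-runs the install step):

import os
import subprocess

# Mirror the environment that test-cuda-export-aoti.sh sets for CUDA builds.
env = dict(os.environ)
env["CMAKE_ARGS"] = "-DEXECUTORCH_BUILD_CUDA=ON"

# Invoke the export-only test runner added in this series with a reduced
# model set; it exits non-zero if any model fails to export.
subprocess.run(
    [
        "python",
        ".ci/scripts/test_cuda_export_aoti.py",
        "--models", "linear", "add",
        "--export-mode", "export_aoti_only",
        "--timeout", "600",
    ],
    env=env,
    check=True,
)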