Commit 5c7ad5b

yzh119 and yongwww authored
ci/cd: bringup flashinfer-jit-cache package (#1726)
## πŸ“Œ Description

Move aot modules to a standalone package `flashinfer-jit-cache`.

## πŸ” Related Issues

## πŸš€ Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### βœ… Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## πŸ§ͺ Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---

Co-authored-by: Yong Wu <[email protected]>
1 parent: f3ea938 Β· commit: 5c7ad5b


70 files changed (+2496 / βˆ’1576 lines)
New GitHub Actions workflow

Lines changed: 124 additions & 0 deletions

```yaml
name: Build and Release flashinfer-jit-cache wheels

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Tag (e.g., v1.2.3) to build wheels for'
        required: true
        type: string

jobs:
  validate-tag:
    runs-on: ubuntu-latest
    steps:
      - name: Validate tag format
        run: |
          if [[ ! "${{ inputs.tag }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+([a-z0-9]+)?$ ]]; then
            echo "Error: Tag '${{ inputs.tag }}' does not match the expected format (e.g., v1.2.3 or v1.2.3.post1 or v1.2.3rc1)"
            exit 1
          fi
          echo "βœ“ Tag format is valid: ${{ inputs.tag }}"

      - name: Check out tag
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}
          submodules: true

      - name: Verify tag matches version.txt
        run: |
          # Extract version from tag (remove 'v' prefix)
          TAG_VERSION="${{ inputs.tag }}"
          TAG_VERSION="${TAG_VERSION#v}"

          # Check version.txt - this is the source of truth
          if [ ! -f "version.txt" ]; then
            echo "Error: version.txt file not found!"
            exit 1
          fi

          VERSION_TXT=$(cat version.txt | tr -d '[:space:]')

          if [ "$TAG_VERSION" != "$VERSION_TXT" ]; then
            echo "❌ CRITICAL ERROR: version.txt does not match tag!"
            echo "  Tag version: $TAG_VERSION"
            echo "  version.txt: $VERSION_TXT"
            echo ""
            echo "Please update version.txt to match the release version before creating a release."
            echo "The tag should be 'v$VERSION_TXT' (e.g., if version.txt contains '1.2.3', tag should be 'v1.2.3')"
            exit 1
          fi

          echo "βœ“ version.txt matches tag version: $VERSION_TXT"

  build-wheel:
    needs: validate-tag
    strategy:
      fail-fast: false
      matrix:
        cuda: ["12.8", "12.9", "13.0"]
        arch: ['x86_64', 'aarch64']

    # Use self-hosted runners with specific labels based on architecture
    runs-on: [self-hosted, "${{ matrix.arch == 'aarch64' && 'arm64' || matrix.arch }}"]

    steps:
      - name: Display Machine Information
        run: |
          echo "CPU: $(nproc) cores, $(lscpu | grep 'Model name' | cut -d':' -f2 | xargs)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $7 " available out of " $2}')"
          echo "Disk: $(df -h / | awk 'NR==2 {print $4 " available out of " $2}')"
          echo "Architecture: $(uname -m)"
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}
          submodules: true

      - name: Build wheel in container
        env:
          DOCKER_IMAGE: ${{ matrix.arch == 'aarch64' && format('pytorch/manylinuxaarch64-builder:cuda{0}', matrix.cuda) || format('pytorch/manylinux2_28-builder:cuda{0}', matrix.cuda) }}
          FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda == '12.8' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0a' }}
        run: |
          # Extract CUDA major and minor versions
          CUDA_MAJOR=$(echo "${{ matrix.cuda }}" | cut -d'.' -f1)
          CUDA_MINOR=$(echo "${{ matrix.cuda }}" | cut -d'.' -f2)
          export CUDA_MAJOR
          export CUDA_MINOR
          export CUDA_VERSION_SUFFIX="cu${CUDA_MAJOR}${CUDA_MINOR}"

          chown -R $(id -u):$(id -g) ${{ github.workspace }}
          mkdir -p ${{ github.workspace }}/ci-cache
          chown -R $(id -u):$(id -g) ${{ github.workspace }}/ci-cache

          # Run the build script inside the container with proper mounts
          docker run --rm \
            -v ${{ github.workspace }}:/workspace \
            -v ${{ github.workspace }}/ci-cache:/ci-cache \
            -e FLASHINFER_CI_CACHE=/ci-cache \
            -e CUDA_VERSION="${{ matrix.cuda }}" \
            -e CUDA_MAJOR="$CUDA_MAJOR" \
            -e CUDA_MINOR="$CUDA_MINOR" \
            -e CUDA_VERSION_SUFFIX="$CUDA_VERSION_SUFFIX" \
            -e ARCH="${{ matrix.arch }}" \
            -e FLASHINFER_CUDA_ARCH_LIST="${FLASHINFER_CUDA_ARCH_LIST}" \
            --user $(id -u):$(id -g) \
            -w /workspace \
            ${{ env.DOCKER_IMAGE }} \
            bash /workspace/scripts/build_flashinfer_jit_cache_whl.sh
        timeout-minutes: 180

      - name: Display wheel size
        run: du -h flashinfer-jit-cache/dist/*

      - name: Create artifact name
        id: artifact-name
        run: |
          CUDA_NO_DOT=$(echo "${{ matrix.cuda }}" | tr -d '.')
          echo "name=wheel-cu${CUDA_NO_DOT}-${{ matrix.arch }}" >> $GITHUB_OUTPUT

      - uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact-name.outputs.name }}
          retention-days: 7
          path: flashinfer-jit-cache/dist/*
```
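The workflow is manual-only (`workflow_dispatch`), so a wheel build has to be started by a maintainer for an existing tag. As a rough sketch (not part of this commit), dispatching it from the GitHub CLI could look like the following; the tag value is an example and must already exist and match `version.txt`:

```bash
# Illustrative only: dispatch the workflow above by its name and pass the
# required 'tag' input. The tag shown is an example, not one from this commit.
gh workflow run "Build and Release flashinfer-jit-cache wheels" \
  --repo flashinfer-ai/flashinfer \
  -f tag=v1.2.3   # the validate-tag job rejects it unless it matches version.txt
```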

.gitignore

Lines changed: 0 additions & 1 deletion

```diff
@@ -12,7 +12,6 @@ docs/generated/
 flashinfer/_build_meta.py
 flashinfer/data/
 flashinfer/jit/aot_config.py
-aot-ops/
 csrc/aot_default_additional_params.h
 
 # DS_Store files
```

Jenkinsfile

Lines changed: 14 additions & 14 deletions

```diff
@@ -178,8 +178,8 @@ def run_with_spot_retry(spot_node_type, on_demand_node_type, test_name, test_clo
 // }
 // }
 
-def run_unittest_CPU_AOT_COMPILE(node_type, cuda_version) {
-  echo "Running CPU AOT Compile Unittest with CUDA ${cuda_version}"
+def run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, cuda_version) {
+  echo "Running CPU JIT Cache Package Build and Import Unittest with CUDA ${cuda_version}"
 
   def docker_run = ""
   if (cuda_version == "cu126") {
@@ -210,11 +210,11 @@ def run_unittest_CPU_AOT_COMPILE(node_type, cuda_version) {
       // If we reach here, node allocation was successful
       // Now run the tests without any timeout
       node(node_type) {
-        ws(per_exec_ws('flashinfer-aot')) {
+        ws(per_exec_ws('flashinfer-jit-cache')) {
           init_git(true)
           sh(script: "ls -alh", label: 'Show work directory')
           sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
-          sh(script: "${docker_run} --no-gpu ./scripts/task_test_aot_build_import.sh", label: 'Test AOT Build and Import')
+          sh(script: "${docker_run} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh", label: 'Test JIT Cache Package Build and Import')
         }
       }
     } catch (Exception e) {
@@ -226,11 +226,11 @@ def run_unittest_CPU_AOT_COMPILE(node_type, cuda_version) {
     } else {
       // No timeout for non-spot instances
      node(node_type) {
-        ws(per_exec_ws('flashinfer-aot')) {
+        ws(per_exec_ws('flashinfer-jit-cache')) {
           init_git(true)
           sh(script: "ls -alh", label: 'Show work directory')
           sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
-          sh(script: "${docker_run} --no-gpu ./scripts/task_test_aot_build_import.sh", label: 'Test AOT Build and Import')
+          sh(script: "${docker_run} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh", label: 'Test JIT Cache Package Build and Import')
         }
       }
     }
@@ -305,38 +305,38 @@ stage('Unittest') {
   // CUDA 12.6 AOT Tests
   'AOT-Build-Import-x86-64-cu126': {
     run_with_spot_retry('CPU-LARGE-SPOT', 'CPU-LARGE', 'AOT-Build-Import-x86-64-cu126',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu126') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu126') })
   },
   'AOT-Build-Import-aarch64-cu126': {
     run_with_spot_retry('ARM-LARGE-SPOT', 'ARM-LARGE', 'AOT-Build-Import-aarch64-cu126',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu126') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu126') })
   },
   // CUDA 12.8 AOT Tests
   'AOT-Build-Import-x86-64-cu128': {
     run_with_spot_retry('CPU-LARGE-SPOT', 'CPU-LARGE', 'AOT-Build-Import-x86-64-cu128',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu128') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu128') })
   },
   'AOT-Build-Import-aarch64-cu128': {
     run_with_spot_retry('ARM-LARGE-SPOT', 'ARM-LARGE', 'AOT-Build-Import-aarch64-cu128',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu128') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu128') })
   },
   // CUDA 12.9 AOT Tests
   'AOT-Build-Import-x86-64-cu129': {
     run_with_spot_retry('CPU-LARGE-SPOT', 'CPU-LARGE', 'AOT-Build-Import-x86-64-cu129',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu129') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu129') })
   },
   'AOT-Build-Import-aarch64-cu129': {
     run_with_spot_retry('ARM-LARGE-SPOT', 'ARM-LARGE', 'AOT-Build-Import-aarch64-cu129',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu129') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu129') })
   },
   // CUDA 13.0 AOT Tests
   'AOT-Build-Import-x86-64-cu130': {
     run_with_spot_retry('CPU-LARGE-SPOT', 'CPU-LARGE', 'AOT-Build-Import-x86-64-cu130',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu130') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu130') })
   },
   'AOT-Build-Import-aarch64-cu130': {
     run_with_spot_retry('ARM-LARGE-SPOT', 'ARM-LARGE', 'AOT-Build-Import-aarch64-cu130',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu130') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu130') })
   },
   // JIT unittest only for cu129
   'JIT-Unittest-1-cu129': {
```
README.md

Lines changed: 17 additions & 12 deletions

````diff
@@ -63,17 +63,26 @@ python -m pip install -v .
 python -m pip install --no-build-isolation -e . -v
 ```
 
-To pre-compile essential kernels ahead-of-time (AOT), run the following command:
+`flashinfer-python` is a source-only package and by default it will JIT compile/download kernels on-the-fly.
+For fully offline deployment, we also provide two additional packages `flashinfer-jit-cache` and `flashinfer-cubin`, to pre-compile and download cubins ahead-of-time.
 
+#### flashinfer-cubin
+
+To build `flashinfer-cubin` package from source:
+```bash
+cd flashinfer-cubin
+python -m build --no-isolation --wheel
+python -m pip install dist/*.whl
+```
+
+#### flashinfer-jit-cache
+
+To build `flashinfer-jit-cache` package from source:
 ```bash
-# Set target CUDA architectures
-export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a"
-# Build AOT kernels. Will produce AOT kernels in aot-ops/
-python -m flashinfer.aot
-# Build AOT wheel
+export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 12.0a" # user can shrink the list to specific architectures
+cd flashinfer-jit-cache
 python -m build --no-isolation --wheel
-# Install AOT wheel
-python -m pip install dist/flashinfer_*.whl
+python -m pip install dist/*.whl
 ```
 
 For more details, refer to the [Install from Source documentation](https://docs.flashinfer.ai/installation.html#install-from-source).
@@ -119,10 +128,6 @@ Check out [documentation](https://docs.flashinfer.ai/) for usage of batch decode
 
 Starting from FlashInfer v0.2, users can customize their own attention variants with additional parameters. For more details, refer to our [JIT examples](https://github.com/flashinfer-ai/flashinfer/blob/main/tests/utils/test_jit_example.py).
 
-## C++ API and TVM Bindings
-
-FlashInfer also provides C++ API and TVM bindings, please refer to [documentation](https://docs.flashinfer.ai/) for more details.
-
 ## GPU Support
 
 FlashInfer currently provides support for NVIDIA SM architectures 75 and higher and beta support for 103, 110, 120, and 121.
````
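Putting the README changes together, an offline-oriented install builds and installs all three pieces from a source checkout. This is a sketch based only on the commands above; wheel filenames and the architecture list depend on your environment:

```bash
# Sketch: flashinfer-python plus the two optional offline packages, built from
# a source checkout of the repository (commands taken from the README above).
python -m pip install -v .                                        # flashinfer-python
(cd flashinfer-cubin && python -m build --no-isolation --wheel \
  && python -m pip install dist/*.whl)                            # pre-downloaded cubins
export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 12.0a"  # shrink as needed
(cd flashinfer-jit-cache && python -m build --no-isolation --wheel \
  && python -m pip install dist/*.whl)                            # pre-compiled JIT cache
```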

csrc/cudnn_sdpa_kernel_launcher.cu

Lines changed: 6 additions & 6 deletions

```diff
@@ -86,16 +86,16 @@ enum PrefillType {
 
 void init_cudnn_cubin(std::map<KernelType, std::string>& cubin_map) {
   cubin_map[PREFILL] =
-      getCubin(cudnn_sdpa_cubin_path + "cudnn_sm100_fprop_sdpa_prefill_d128_bf16",
-               "ff14e8dcfc04d9b3a912dd44056be37d9aa8a85976e0070494ca0cce0524f2a1.cubin");
+      getCubin(cudnn_sdpa_cubin_path + "/" + "cudnn_sm100_fprop_sdpa_prefill_d128_bf16.cubin",
+               "ff14e8dcfc04d9b3a912dd44056be37d9aa8a85976e0070494ca0cce0524f2a1");
 
   cubin_map[DECODE] =
-      getCubin(cudnn_sdpa_cubin_path + "cudnn_sm100_fprop_sdpa_decode_d128_bf16",
-               "e7ce0408b4c3a36c42616498228534ee64cab785ef570af5741deaf9dd1b475c.cubin");
+      getCubin(cudnn_sdpa_cubin_path + "/" + "cudnn_sm100_fprop_sdpa_decode_d128_bf16.cubin",
+               "e7ce0408b4c3a36c42616498228534ee64cab785ef570af5741deaf9dd1b475c");
 
   cubin_map[PREFILL_DEEPSEEK] =
-      getCubin(cudnn_sdpa_cubin_path + "cudnn_sm100_fprop_sdpa_prefill_d192_bf16",
-               "2190967b8733e193cdcecc054eeb7c2907080a158a33fe7ba2004523a4aff6f9.cubin");
+      getCubin(cudnn_sdpa_cubin_path + "/" + "cudnn_sm100_fprop_sdpa_prefill_d192_bf16.cubin",
+               "2190967b8733e193cdcecc054eeb7c2907080a158a33fe7ba2004523a4aff6f9");
 }
 
 auto get_cudnn_cubin(KernelType kernel_type) -> std::string {
```
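The change moves the `.cubin` suffix onto the path argument (with an explicit `/` separator) and leaves the second argument as a bare 64-character hex string. Assuming that string is the artifact's SHA-256 digest (an assumption, not stated in the diff), a downloaded cubin can be sanity-checked locally like this:

```bash
# Hypothetical check: verify a downloaded cubin against the hex string that the
# code above passes alongside its path (assumed here to be a SHA-256 digest).
echo "ff14e8dcfc04d9b3a912dd44056be37d9aa8a85976e0070494ca0cce0524f2a1  cudnn_sm100_fprop_sdpa_prefill_d128_bf16.cubin" \
  | sha256sum --check -
```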

custom_backend.py

Lines changed: 0 additions & 32 deletions

```diff
@@ -5,17 +5,6 @@
 
 _root = Path(__file__).parent.resolve()
 _data_dir = _root / "flashinfer" / "data"
-_aot_ops_dir = _root / "aot-ops"
-_aot_ops_package_dir = _root / "build" / "aot-ops-package-dir"
-
-_requires_for_aot = ["torch", "ninja", "numpy", "requests", "apache-tvm-ffi"]
-
-
-def _rm_aot_ops_package_dir():
-    if _aot_ops_package_dir.is_symlink():
-        _aot_ops_package_dir.unlink()
-    elif _aot_ops_package_dir.exists():
-        shutil.rmtree(_aot_ops_package_dir)
 
 
 def _create_data_dir():
@@ -42,39 +31,19 @@ def _prepare_for_wheel():
     if _data_dir.exists():
         shutil.rmtree(_data_dir)
 
-    # Link AOT ops directory to "aot-ops"
-    _rm_aot_ops_package_dir()
-    if not _aot_ops_dir.exists():
-        _aot_ops_dir.mkdir()
-    num_ops = len(list(_aot_ops_dir.glob("*/*.so")))
-    print(f"{num_ops} AOT ops found in {_aot_ops_dir}")
-    _aot_ops_package_dir.parent.mkdir(parents=True, exist_ok=True)
-    _aot_ops_package_dir.symlink_to(_aot_ops_dir)
-
 
 def _prepare_for_editable():
     _create_data_dir()
 
-    _rm_aot_ops_package_dir()
-    _aot_ops_dir.mkdir(parents=True, exist_ok=True)
-    _aot_ops_package_dir.parent.mkdir(parents=True, exist_ok=True)
-    _aot_ops_package_dir.symlink_to(_aot_ops_dir)
-
 
 def _prepare_for_sdist():
     # Remove data directory
     if _data_dir.exists():
         shutil.rmtree(_data_dir)
 
-    # Create an empty directory for AOT ops
-    _rm_aot_ops_package_dir()
-    _aot_ops_package_dir.parent.mkdir(parents=True, exist_ok=True)
-    _aot_ops_package_dir.mkdir(parents=True)
-
 
 def get_requires_for_build_wheel(config_settings=None):
     _prepare_for_wheel()
-    return _requires_for_aot
 
 
 def get_requires_for_build_sdist(config_settings=None):
@@ -84,7 +53,6 @@ def get_requires_for_build_sdist(config_settings=None):
 
 def get_requires_for_build_editable(config_settings=None):
     _prepare_for_editable()
-    return _requires_for_aot
 
 
 def prepare_metadata_for_build_wheel(metadata_directory, config_settings=None):
```
