Commit 5c7ad5b

yzh119 and yongwww authored
ci/cd: bringup flashinfer-jit-cache package (#1726)
## πŸ“Œ Description

Move aot modules to a standalone package `flashinfer-jit-cache`.

## πŸ” Related Issues

## πŸš€ Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### βœ… Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## πŸ§ͺ Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---

Co-authored-by: Yong Wu <[email protected]>
1 parent: f3ea938 Β· commit: 5c7ad5b


70 files changed (+2496 / βˆ’1576 lines)
New GitHub Actions workflow

Lines changed: 124 additions & 0 deletions

```yaml
name: Build and Release flashinfer-jit-cache wheels

on:
  workflow_dispatch:
    inputs:
      tag:
        description: 'Tag (e.g., v1.2.3) to build wheels for'
        required: true
        type: string

jobs:
  validate-tag:
    runs-on: ubuntu-latest
    steps:
      - name: Validate tag format
        run: |
          if [[ ! "${{ inputs.tag }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+([a-z0-9]+)?$ ]]; then
            echo "Error: Tag '${{ inputs.tag }}' does not match the expected format (e.g., v1.2.3 or v1.2.3.post1 or v1.2.3rc1)"
            exit 1
          fi
          echo "βœ“ Tag format is valid: ${{ inputs.tag }}"

      - name: Check out tag
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}
          submodules: true

      - name: Verify tag matches version.txt
        run: |
          # Extract version from tag (remove 'v' prefix)
          TAG_VERSION="${{ inputs.tag }}"
          TAG_VERSION="${TAG_VERSION#v}"

          # Check version.txt - this is the source of truth
          if [ ! -f "version.txt" ]; then
            echo "Error: version.txt file not found!"
            exit 1
          fi

          VERSION_TXT=$(cat version.txt | tr -d '[:space:]')

          if [ "$TAG_VERSION" != "$VERSION_TXT" ]; then
            echo "❌ CRITICAL ERROR: version.txt does not match tag!"
            echo "  Tag version: $TAG_VERSION"
            echo "  version.txt: $VERSION_TXT"
            echo ""
            echo "Please update version.txt to match the release version before creating a release."
            echo "The tag should be 'v$VERSION_TXT' (e.g., if version.txt contains '1.2.3', tag should be 'v1.2.3')"
            exit 1
          fi

          echo "βœ“ version.txt matches tag version: $VERSION_TXT"

  build-wheel:
    needs: validate-tag
    strategy:
      fail-fast: false
      matrix:
        cuda: ["12.8", "12.9", "13.0"]
        arch: ['x86_64', 'aarch64']

    # Use self-hosted runners with specific labels based on architecture
    runs-on: [self-hosted, "${{ matrix.arch == 'aarch64' && 'arm64' || matrix.arch }}"]

    steps:
      - name: Display Machine Information
        run: |
          echo "CPU: $(nproc) cores, $(lscpu | grep 'Model name' | cut -d':' -f2 | xargs)"
          echo "RAM: $(free -h | awk '/^Mem:/ {print $7 " available out of " $2}')"
          echo "Disk: $(df -h / | awk 'NR==2 {print $4 " available out of " $2}')"
          echo "Architecture: $(uname -m)"
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.tag }}
          submodules: true

      - name: Build wheel in container
        env:
          DOCKER_IMAGE: ${{ matrix.arch == 'aarch64' && format('pytorch/manylinuxaarch64-builder:cuda{0}', matrix.cuda) || format('pytorch/manylinux2_28-builder:cuda{0}', matrix.cuda) }}
          FLASHINFER_CUDA_ARCH_LIST: ${{ matrix.cuda == '12.8' && '7.5 8.0 8.9 9.0a 10.0a 12.0a' || '7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0a' }}
        run: |
          # Extract CUDA major and minor versions
          CUDA_MAJOR=$(echo "${{ matrix.cuda }}" | cut -d'.' -f1)
          CUDA_MINOR=$(echo "${{ matrix.cuda }}" | cut -d'.' -f2)
          export CUDA_MAJOR
          export CUDA_MINOR
          export CUDA_VERSION_SUFFIX="cu${CUDA_MAJOR}${CUDA_MINOR}"

          chown -R $(id -u):$(id -g) ${{ github.workspace }}
          mkdir -p ${{ github.workspace }}/ci-cache
          chown -R $(id -u):$(id -g) ${{ github.workspace }}/ci-cache

          # Run the build script inside the container with proper mounts
          docker run --rm \
            -v ${{ github.workspace }}:/workspace \
            -v ${{ github.workspace }}/ci-cache:/ci-cache \
            -e FLASHINFER_CI_CACHE=/ci-cache \
            -e CUDA_VERSION="${{ matrix.cuda }}" \
            -e CUDA_MAJOR="$CUDA_MAJOR" \
            -e CUDA_MINOR="$CUDA_MINOR" \
            -e CUDA_VERSION_SUFFIX="$CUDA_VERSION_SUFFIX" \
            -e ARCH="${{ matrix.arch }}" \
            -e FLASHINFER_CUDA_ARCH_LIST="${FLASHINFER_CUDA_ARCH_LIST}" \
            --user $(id -u):$(id -g) \
            -w /workspace \
            ${{ env.DOCKER_IMAGE }} \
            bash /workspace/scripts/build_flashinfer_jit_cache_whl.sh
        timeout-minutes: 180

      - name: Display wheel size
        run: du -h flashinfer-jit-cache/dist/*

      - name: Create artifact name
        id: artifact-name
        run: |
          CUDA_NO_DOT=$(echo "${{ matrix.cuda }}" | tr -d '.')
          echo "name=wheel-cu${CUDA_NO_DOT}-${{ matrix.arch }}" >> $GITHUB_OUTPUT

      - uses: actions/upload-artifact@v4
        with:
          name: ${{ steps.artifact-name.outputs.name }}
          retention-days: 7
          path: flashinfer-jit-cache/dist/*
```
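The workflow is manual-only (`workflow_dispatch`), so a wheel build has to be started by a maintainer for an existing tag. As a rough sketch (not part of this commit), dispatching it from the GitHub CLI could look like the following; the tag value is an example and must already exist and match `version.txt`:

```bash
# Illustrative only: dispatch the workflow above by its name and pass the
# required 'tag' input. The tag shown is an example, not one from this commit.
gh workflow run "Build and Release flashinfer-jit-cache wheels" \
  --repo flashinfer-ai/flashinfer \
  -f tag=v1.2.3   # the validate-tag job rejects it unless it matches version.txt
```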

.gitignore

Lines changed: 0 additions & 1 deletion

```diff
@@ -12,7 +12,6 @@ docs/generated/
 flashinfer/_build_meta.py
 flashinfer/data/
 flashinfer/jit/aot_config.py
-aot-ops/
 csrc/aot_default_additional_params.h
 
 # DS_Store files
```

Jenkinsfile

Lines changed: 14 additions & 14 deletions

```diff
@@ -178,8 +178,8 @@ def run_with_spot_retry(spot_node_type, on_demand_node_type, test_name, test_clo
 // }
 // }
 
-def run_unittest_CPU_AOT_COMPILE(node_type, cuda_version) {
-  echo "Running CPU AOT Compile Unittest with CUDA ${cuda_version}"
+def run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, cuda_version) {
+  echo "Running CPU JIT Cache Package Build and Import Unittest with CUDA ${cuda_version}"
 
   def docker_run = ""
   if (cuda_version == "cu126") {
@@ -210,11 +210,11 @@ def run_unittest_CPU_AOT_COMPILE(node_type, cuda_version) {
       // If we reach here, node allocation was successful
       // Now run the tests without any timeout
       node(node_type) {
-        ws(per_exec_ws('flashinfer-aot')) {
+        ws(per_exec_ws('flashinfer-jit-cache')) {
           init_git(true)
           sh(script: "ls -alh", label: 'Show work directory')
           sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
-          sh(script: "${docker_run} --no-gpu ./scripts/task_test_aot_build_import.sh", label: 'Test AOT Build and Import')
+          sh(script: "${docker_run} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh", label: 'Test JIT Cache Package Build and Import')
         }
       }
     } catch (Exception e) {
@@ -226,11 +226,11 @@ def run_unittest_CPU_AOT_COMPILE(node_type, cuda_version) {
     } else {
       // No timeout for non-spot instances
      node(node_type) {
-        ws(per_exec_ws('flashinfer-aot')) {
+        ws(per_exec_ws('flashinfer-jit-cache')) {
           init_git(true)
           sh(script: "ls -alh", label: 'Show work directory')
           sh(script: "./scripts/task_show_node_info.sh", label: 'Show node info')
-          sh(script: "${docker_run} --no-gpu ./scripts/task_test_aot_build_import.sh", label: 'Test AOT Build and Import')
+          sh(script: "${docker_run} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh", label: 'Test JIT Cache Package Build and Import')
         }
       }
     }
@@ -305,38 +305,38 @@ stage('Unittest') {
   // CUDA 12.6 AOT Tests
   'AOT-Build-Import-x86-64-cu126': {
     run_with_spot_retry('CPU-LARGE-SPOT', 'CPU-LARGE', 'AOT-Build-Import-x86-64-cu126',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu126') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu126') })
   },
   'AOT-Build-Import-aarch64-cu126': {
     run_with_spot_retry('ARM-LARGE-SPOT', 'ARM-LARGE', 'AOT-Build-Import-aarch64-cu126',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu126') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu126') })
   },
   // CUDA 12.8 AOT Tests
   'AOT-Build-Import-x86-64-cu128': {
     run_with_spot_retry('CPU-LARGE-SPOT', 'CPU-LARGE', 'AOT-Build-Import-x86-64-cu128',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu128') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu128') })
   },
   'AOT-Build-Import-aarch64-cu128': {
     run_with_spot_retry('ARM-LARGE-SPOT', 'ARM-LARGE', 'AOT-Build-Import-aarch64-cu128',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu128') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu128') })
   },
   // CUDA 12.9 AOT Tests
   'AOT-Build-Import-x86-64-cu129': {
     run_with_spot_retry('CPU-LARGE-SPOT', 'CPU-LARGE', 'AOT-Build-Import-x86-64-cu129',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu129') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu129') })
   },
   'AOT-Build-Import-aarch64-cu129': {
     run_with_spot_retry('ARM-LARGE-SPOT', 'ARM-LARGE', 'AOT-Build-Import-aarch64-cu129',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu129') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu129') })
   },
   // CUDA 13.0 AOT Tests
   'AOT-Build-Import-x86-64-cu130': {
     run_with_spot_retry('CPU-LARGE-SPOT', 'CPU-LARGE', 'AOT-Build-Import-x86-64-cu130',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu130') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu130') })
   },
   'AOT-Build-Import-aarch64-cu130': {
     run_with_spot_retry('ARM-LARGE-SPOT', 'ARM-LARGE', 'AOT-Build-Import-aarch64-cu130',
-      { node_type -> run_unittest_CPU_AOT_COMPILE(node_type, 'cu130') })
+      { node_type -> run_unittest_CPU_JIT_CACHE_PACKAGE_BUILD_IMPORT(node_type, 'cu130') })
   },
   // JIT unittest only for cu129
   'JIT-Unittest-1-cu129': {
```
README.md

Lines changed: 17 additions & 12 deletions

````diff
@@ -63,17 +63,26 @@ python -m pip install -v .
 python -m pip install --no-build-isolation -e . -v
 ```
 
-To pre-compile essential kernels ahead-of-time (AOT), run the following command:
+`flashinfer-python` is a source-only package and by default it will JIT compile/download kernels on-the-fly.
+For fully offline deployment, we also provide two additional packages `flashinfer-jit-cache` and `flashinfer-cubin`, to pre-compile and download cubins ahead-of-time.
 
+#### flashinfer-cubin
+
+To build `flashinfer-cubin` package from source:
+```bash
+cd flashinfer-cubin
+python -m build --no-isolation --wheel
+python -m pip install dist/*.whl
+```
+
+#### flashinfer-jit-cache
+
+To build `flashinfer-jit-cache` package from source:
 ```bash
-# Set target CUDA architectures
-export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a"
-# Build AOT kernels. Will produce AOT kernels in aot-ops/
-python -m flashinfer.aot
-# Build AOT wheel
+export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 12.0a" # user can shrink the list to specific architectures
+cd flashinfer-jit-cache
 python -m build --no-isolation --wheel
-# Install AOT wheel
-python -m pip install dist/flashinfer_*.whl
+python -m pip install dist/*.whl
 ```
 
 For more details, refer to the [Install from Source documentation](https://docs.flashinfer.ai/installation.html#install-from-source).
@@ -119,10 +128,6 @@ Check out [documentation](https://docs.flashinfer.ai/) for usage of batch decode
 
 Starting from FlashInfer v0.2, users can customize their own attention variants with additional parameters. For more details, refer to our [JIT examples](https://github.com/flashinfer-ai/flashinfer/blob/main/tests/utils/test_jit_example.py).
 
-## C++ API and TVM Bindings
-
-FlashInfer also provides C++ API and TVM bindings, please refer to [documentation](https://docs.flashinfer.ai/) for more details.
-
 ## GPU Support
 
 FlashInfer currently provides support for NVIDIA SM architectures 75 and higher and beta support for 103, 110, 120, and 121.
````
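Putting the README changes together, an offline-oriented install builds and installs all three pieces from a source checkout. This is a sketch based only on the commands above; wheel filenames and the architecture list depend on your environment:

```bash
# Sketch: flashinfer-python plus the two optional offline packages, built from
# a source checkout of the repository (commands taken from the README above).
python -m pip install -v .                                        # flashinfer-python
(cd flashinfer-cubin && python -m build --no-isolation --wheel \
  && python -m pip install dist/*.whl)                            # pre-downloaded cubins
export FLASHINFER_CUDA_ARCH_LIST="7.5 8.0 8.9 10.0a 10.3a 12.0a"  # shrink as needed
(cd flashinfer-jit-cache && python -m build --no-isolation --wheel \
  && python -m pip install dist/*.whl)                            # pre-compiled JIT cache
```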

csrc/cudnn_sdpa_kernel_launcher.cu

Lines changed: 6 additions & 6 deletions

```diff
@@ -86,16 +86,16 @@ enum PrefillType {
 
 void init_cudnn_cubin(std::map<KernelType, std::string>& cubin_map) {
   cubin_map[PREFILL] =
-      getCubin(cudnn_sdpa_cubin_path + "cudnn_sm100_fprop_sdpa_prefill_d128_bf16",
-               "ff14e8dcfc04d9b3a912dd44056be37d9aa8a85976e0070494ca0cce0524f2a1.cubin");
+      getCubin(cudnn_sdpa_cubin_path + "/" + "cudnn_sm100_fprop_sdpa_prefill_d128_bf16.cubin",
+               "ff14e8dcfc04d9b3a912dd44056be37d9aa8a85976e0070494ca0cce0524f2a1");
 
   cubin_map[DECODE] =
-      getCubin(cudnn_sdpa_cubin_path + "cudnn_sm100_fprop_sdpa_decode_d128_bf16",
-               "e7ce0408b4c3a36c42616498228534ee64cab785ef570af5741deaf9dd1b475c.cubin");
+      getCubin(cudnn_sdpa_cubin_path + "/" + "cudnn_sm100_fprop_sdpa_decode_d128_bf16.cubin",
+               "e7ce0408b4c3a36c42616498228534ee64cab785ef570af5741deaf9dd1b475c");
 
   cubin_map[PREFILL_DEEPSEEK] =
-      getCubin(cudnn_sdpa_cubin_path + "cudnn_sm100_fprop_sdpa_prefill_d192_bf16",
-               "2190967b8733e193cdcecc054eeb7c2907080a158a33fe7ba2004523a4aff6f9.cubin");
+      getCubin(cudnn_sdpa_cubin_path + "/" + "cudnn_sm100_fprop_sdpa_prefill_d192_bf16.cubin",
+               "2190967b8733e193cdcecc054eeb7c2907080a158a33fe7ba2004523a4aff6f9");
 }
 
 auto get_cudnn_cubin(KernelType kernel_type) -> std::string {
```
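The change moves the `.cubin` suffix onto the path argument (with an explicit `/` separator) and leaves the second argument as a bare 64-character hex string. Assuming that string is the artifact's SHA-256 digest (an assumption, not stated in the diff), a downloaded cubin can be sanity-checked locally like this:

```bash
# Hypothetical check: verify a downloaded cubin against the hex string that the
# code above passes alongside its path (assumed here to be a SHA-256 digest).
echo "ff14e8dcfc04d9b3a912dd44056be37d9aa8a85976e0070494ca0cce0524f2a1  cudnn_sm100_fprop_sdpa_prefill_d128_bf16.cubin" \
  | sha256sum --check -
```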

custom_backend.py

Lines changed: 0 additions & 32 deletions

```diff
@@ -5,17 +5,6 @@
 
 _root = Path(__file__).parent.resolve()
 _data_dir = _root / "flashinfer" / "data"
-_aot_ops_dir = _root / "aot-ops"
-_aot_ops_package_dir = _root / "build" / "aot-ops-package-dir"
-
-_requires_for_aot = ["torch", "ninja", "numpy", "requests", "apache-tvm-ffi"]
-
-
-def _rm_aot_ops_package_dir():
-    if _aot_ops_package_dir.is_symlink():
-        _aot_ops_package_dir.unlink()
-    elif _aot_ops_package_dir.exists():
-        shutil.rmtree(_aot_ops_package_dir)
 
 
 def _create_data_dir():
@@ -42,39 +31,19 @@ def _prepare_for_wheel():
     if _data_dir.exists():
         shutil.rmtree(_data_dir)
 
-    # Link AOT ops directory to "aot-ops"
-    _rm_aot_ops_package_dir()
-    if not _aot_ops_dir.exists():
-        _aot_ops_dir.mkdir()
-    num_ops = len(list(_aot_ops_dir.glob("*/*.so")))
-    print(f"{num_ops} AOT ops found in {_aot_ops_dir}")
-    _aot_ops_package_dir.parent.mkdir(parents=True, exist_ok=True)
-    _aot_ops_package_dir.symlink_to(_aot_ops_dir)
-
 
 def _prepare_for_editable():
     _create_data_dir()
 
-    _rm_aot_ops_package_dir()
-    _aot_ops_dir.mkdir(parents=True, exist_ok=True)
-    _aot_ops_package_dir.parent.mkdir(parents=True, exist_ok=True)
-    _aot_ops_package_dir.symlink_to(_aot_ops_dir)
-
 
 def _prepare_for_sdist():
     # Remove data directory
     if _data_dir.exists():
         shutil.rmtree(_data_dir)
 
-    # Create an empty directory for AOT ops
-    _rm_aot_ops_package_dir()
-    _aot_ops_package_dir.parent.mkdir(parents=True, exist_ok=True)
-    _aot_ops_package_dir.mkdir(parents=True)
-
 
 def get_requires_for_build_wheel(config_settings=None):
     _prepare_for_wheel()
-    return _requires_for_aot
 
 
 def get_requires_for_build_sdist(config_settings=None):
@@ -84,7 +53,6 @@ def get_requires_for_build_sdist(config_settings=None):
 
 def get_requires_for_build_editable(config_settings=None):
     _prepare_for_editable()
-    return _requires_for_aot
 
 
 def prepare_metadata_for_build_wheel(metadata_directory, config_settings=None):
```
