Closed
Changes from all commits
69 commits
00edc17
added sglang arguments (#317)
FrankLeeeee Nov 23, 2025
72337ef
unified benchmark scripts (#319)
FrankLeeeee Nov 23, 2025
95cb2ae
fixed data regeneration script (#321)
FrankLeeeee Nov 24, 2025
f6ec513
fix ckpt dir check (#320)
justadogistaken Nov 24, 2025
d582d7d
support gen hidden states use fp8 (#318)
jiapingW Nov 24, 2025
34b5883
Add subset options for opc (#312)
jhinpan Nov 24, 2025
d960896
Fixed the installation command
FrankLeeeee Nov 25, 2025
1e3fb6e
organized unit tests (#324)
FrankLeeeee Nov 25, 2025
44409f6
fixed non-runnable examples (#322)
FrankLeeeee Nov 25, 2025
341abf5
merged data generation scripts (#323)
FrankLeeeee Nov 25, 2025
ed30525
Fix args type (#328)
mmdbhs Nov 25, 2025
04a6bcf
added autoflakes pre-commit hook (#327)
FrankLeeeee Nov 25, 2025
70f5187
fixed specforge imports (#332)
FrankLeeeee Nov 27, 2025
3df5b27
added tests for scripts (#331)
FrankLeeeee Nov 27, 2025
8dff2b7
bump to v0.1.1 (#330)
FrankLeeeee Nov 27, 2025
9b05770
Support more sampling params in data generation (#333)
yubofredwang Nov 27, 2025
b77e6f7
Add qwen3-coder-30B-A3B-Instruct Eagle3 Training Script (#329)
jhinpan Nov 28, 2025
e7b3716
Remove full hidden states capturing in custom backend (#337)
yubofredwang Nov 30, 2025
5c43694
fix mmstart benchmrk (#334)
jiapingW Nov 30, 2025
3e0cda0
updated benchmark docs (#340)
FrankLeeeee Dec 1, 2025
44d5c62
grouped args for better reference (#343)
FrankLeeeee Dec 2, 2025
3bca52c
added profiling (#344)
FrankLeeeee Dec 2, 2025
94de9f8
Feature/online train use hf backend optimize GPU usage (#346)
jiapingW Dec 2, 2025
5c355b8
added model-download-dir (#347)
FrankLeeeee Dec 2, 2025
a77b9de
add missing layers_to_output_hidden_states in qwen3 moe (#351)
yubofredwang Dec 5, 2025
c65a358
fix: is_running to get_run (#353)
Zeyi-Lin Dec 7, 2025
dc44caf
add default build_dataset_num_proc value (#354)
sleepcoo Dec 8, 2025
9639a52
fixed kv head replication in qwen3 moe (#357)
FrankLeeeee Dec 8, 2025
e0625b0
[Docs] add benchmark refer (#358)
jiapingW Dec 9, 2025
e012016
optimized sglang backend memory usage (#359)
FrankLeeeee Dec 10, 2025
381476b
update sglang && support qwen3 next (#355)
sleepcoo Dec 12, 2025
020a856
Add --is-preformatted flag to prepare_hidden_states.py (#350)
Ofir408 Dec 12, 2025
86c1749
remove unuse code (#367)
sleepcoo Dec 16, 2025
ef165ac
added more benchmarks (#369)
FrankLeeeee Dec 16, 2025
901c868
added deepwiki badge (#370)
FrankLeeeee Dec 16, 2025
19e84eb
fixed benchmarks (#372)
FrankLeeeee Dec 16, 2025
f656ae7
feat: add support for Qwen3-Coder-480B-A35B-Instruct-FP8 training (#371)
xiaomin-D Dec 18, 2025
1c17635
added specbundle doc (#383)
FrankLeeeee Dec 23, 2025
157745d
fixed doc build (#384)
FrankLeeeee Dec 23, 2025
d9952d1
Add a SpecBundle dashboard (#382)
sleepcoo Dec 23, 2025
106874d
added link to specbundle (#385)
FrankLeeeee Dec 23, 2025
73e6f80
bump version to v0.2.0 (#386)
FrankLeeeee Dec 23, 2025
e30518a
added dashboard link (#387)
FrankLeeeee Dec 23, 2025
4a1101c
feat: add training support for DeepSeek-V3 EAGLE-3 speculative decodi…
yefei12 Dec 25, 2025
280fab9
Support Qwen3,Qwen3-Next,Kimi-K2,Deepseek models template (#381)
jiapingW Dec 25, 2025
ee22b87
[feature] add Sequence Parallelism support for offline training (#366)
uygnef Dec 25, 2025
5660635
fixed templates (#389)
FrankLeeeee Dec 25, 2025
4ac6bb7
corrected llama3 examples (#391)
FrankLeeeee Dec 26, 2025
69b679d
[Feat] Make num_workers configurable and fix 0-worker crash (#376)
yeshihai Dec 26, 2025
a686e3d
added regenerated datasets (#395)
FrankLeeeee Dec 27, 2025
866ca44
fixed benchmark process termination (#394)
FrankLeeeee Dec 27, 2025
b7febe8
added regenerated data processing for llama series (#396)
FrankLeeeee Dec 28, 2025
886ab9c
added specbundle to readme (#397)
FrankLeeeee Dec 28, 2025
10004e7
Merge branch 'main' into modal-labs/flash_attn
yubofredwang Dec 29, 2025
6742725
fix deps
yubofredwang Dec 29, 2025
5f18a47
lint
yubofredwang Jan 1, 2026
d75ba86
bump flash-attn
yubofredwang Jan 4, 2026
080bd28
update ci image
sleepcoo Jan 13, 2026
b849a2a
test fa3
sleepcoo Jan 13, 2026
1d6bbe5
fix bug
sleepcoo Jan 13, 2026
569f375
fix bug
sleepcoo Jan 13, 2026
498994f
fix bug
sleepcoo Jan 13, 2026
3e6e827
Update Docker image version in test workflow
FrankLeeeee Jan 14, 2026
42fef31
Update pip install command in test workflow
FrankLeeeee Jan 14, 2026
79d9411
Update pyproject.toml
FrankLeeeee Jan 14, 2026
18918fc
Add setuptools installation to workflow
FrankLeeeee Jan 14, 2026
45cad19
Update test.yaml
FrankLeeeee Jan 14, 2026
3a95e87
Update test.yaml
FrankLeeeee Jan 14, 2026
021d8f2
Refactor test workflow to eliminate redundancy
FrankLeeeee Jan 14, 2026
29 changes: 27 additions & 2 deletions .github/workflows/publish_docs.yaml
@@ -16,7 +16,7 @@ concurrency:
jobs:
deploy-github-pages:
runs-on: ubuntu-latest
if: github.repository == 'sgl-project/specforge'
if: github.repository == 'sgl-project/specforge' || github.repository == 'sleepcoo/SpecForge'
permissions:
contents: write
steps:
@@ -28,17 +28,42 @@ jobs:
with:
python-version: '3.13'

- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: docs/spec_bundle/package-lock.json

- name: Install dependencies
run: |
apt-get update && apt-get install -y pandoc parallel retry
sudo apt-get update && sudo apt-get install -y pandoc parallel retry
pip install -r docs/requirements.txt

- name: Build spec bundle dashboard
run: |
# Copy logos to public directory
cp assets/logo.png docs/spec_bundle/public/logo.png
cp docs/_static/imgs/specbundle-logo.png docs/spec_bundle/public/specbundle-logo.png
cd docs/spec_bundle
npm ci
npm run build
# Clean up node_modules to prevent Sphinx from processing them
rm -rf node_modules
cd ..

- name: Build documentation
run: |
cd docs
make compile
make html
# Copy SpecBundle to root of output directory
mkdir -p _build/html/SpecBundle
cp -r spec_bundle/dist/* _build/html/SpecBundle/

- name: Add .nojekyll file
run: |
touch ./docs/_build/html/.nojekyll

- name: Deploy
uses: peaceiris/actions-gh-pages@v4
13 changes: 9 additions & 4 deletions .github/workflows/test.yaml
@@ -26,8 +26,12 @@ jobs:

- name: Restore cache
run: |
if [ -d /github/home/cache ] && [ ! -z "$(ls -A /github/home/cache/)" ]; then
cp -p -r /github/home/cache ./
fi

if [ -d /github/home/sf ] && [ ! -z "$(ls -A /github/home/sf/)" ]; then
cp -p -r /github/home/sf/* ./
cp -p -r /github/home/sf ./
fi

- name: Remove flashinfer # needed to prevent flashinfer JIT compilation from hanging the program
@@ -42,16 +46,17 @@
uv venv sf -p 3.11
fi
source sf/bin/activate
uv pip install -v . --prerelease=allow
uv pip install setuptools
uv pip install -v . --prerelease=allow --no-build-isolation

- name: Run test
timeout-minutes: 30
shell: bash
run: |
source sf/bin/activate
export PYTHONPATH=$PWD
python -m unittest discover -s ./tests -p "test_*.py" -v
python tests/test_utils/test_flash_attention.py

- name: Save cache
run: |
cp -p -r sf /github/home/
cp -p -r cache /github/home/
5 changes: 5 additions & 0 deletions .pre-commit-config.yaml
@@ -1,6 +1,11 @@
default_stages: [pre-commit, pre-push, manual]

repos:
- repo: https://github.com/PyCQA/autoflake
rev: v2.3.1
hooks:
- id: autoflake
args: [--remove-all-unused-imports, --in-place]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
19 changes: 18 additions & 1 deletion README.md
@@ -2,9 +2,11 @@
<img src="./assets/logo.png" alt="logo" width="400" margin="10px"></img>

[![documentation](https://img.shields.io/badge/📖-Documentation-red.svg?style=flat)](https://docs.sglang.ai/SpecForge/)
[![SpecBundle](https://img.shields.io/badge/🤗%20SpecBundle-yellow.svg?style=flat)](https://huggingface.co/collections/lmsys/specbundle)
[![DeepWiki](https://img.shields.io/badge/DeepWiki-SpecForge-blue.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACwAAAAyCAYAAAAnWDnqAAAAAXNSR0IArs4c6QAAA05JREFUaEPtmUtyEzEQhtWTQyQLHNak2AB7ZnyXZMEjXMGeK/AIi+QuHrMnbChYY7MIh8g01fJoopFb0uhhEqqcbWTp06/uv1saEDv4O3n3dV60RfP947Mm9/SQc0ICFQgzfc4CYZoTPAswgSJCCUJUnAAoRHOAUOcATwbmVLWdGoH//PB8mnKqScAhsD0kYP3j/Yt5LPQe2KvcXmGvRHcDnpxfL2zOYJ1mFwrryWTz0advv1Ut4CJgf5uhDuDj5eUcAUoahrdY/56ebRWeraTjMt/00Sh3UDtjgHtQNHwcRGOC98BJEAEymycmYcWwOprTgcB6VZ5JK5TAJ+fXGLBm3FDAmn6oPPjR4rKCAoJCal2eAiQp2x0vxTPB3ALO2CRkwmDy5WohzBDwSEFKRwPbknEggCPB/imwrycgxX2NzoMCHhPkDwqYMr9tRcP5qNrMZHkVnOjRMWwLCcr8ohBVb1OMjxLwGCvjTikrsBOiA6fNyCrm8V1rP93iVPpwaE+gO0SsWmPiXB+jikdf6SizrT5qKasx5j8ABbHpFTx+vFXp9EnYQmLx02h1QTTrl6eDqxLnGjporxl3NL3agEvXdT0WmEost648sQOYAeJS9Q7bfUVoMGnjo4AZdUMQku50McDcMWcBPvr0SzbTAFDfvJqwLzgxwATnCgnp4wDl6Aa+Ax283gghmj+vj7feE2KBBRMW3FzOpLOADl0Isb5587h/U4gGvkt5v60Z1VLG8BhYjbzRwyQZemwAd6cCR5/XFWLYZRIMpX39AR0tjaGGiGzLVyhse5C9RKC6ai42ppWPKiBagOvaYk8lO7DajerabOZP46Lby5wKjw1HCRx7p9sVMOWGzb/vA1hwiWc6jm3MvQDTogQkiqIhJV0nBQBTU+3okKCFDy9WwferkHjtxib7t3xIUQtHxnIwtx4mpg26/HfwVNVDb4oI9RHmx5WGelRVlrtiw43zboCLaxv46AZeB3IlTkwouebTr1y2NjSpHz68WNFjHvupy3q8TFn3Hos2IAk4Ju5dCo8B3wP7VPr/FGaKiG+T+v+TQqIrOqMTL1VdWV1DdmcbO8KXBz6esmYWYKPwDL5b5FA1a0hwapHiom0r/cKaoqr+27/XcrS5UwSMbQAAAABJRU5ErkJggg==)](https://deepwiki.com/sgl-project/SpecForge)

[![github badge](https://img.shields.io/badge/📃%20LMSYS-Blog-black.svg?style=flat)](https://lmsys.org/blog/2025-07-25-spec-forge/)
[![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&amp)](https://sgl-fru7574.slack.com/archives/C09784E3EN6)
[![SGLang Eagle3](https://img.shields.io/badge/🤗%20Hugging%20Face-SGLang%20Eagle3-yellow.svg?style=flat)](https://huggingface.co/collections/lmsys/eagle-3-6886b2329f3998a8bc23f8ed)
[![license](https://img.shields.io/badge/License-MIT%202.0-blue)](./LICENSE)

</div>
@@ -21,8 +23,23 @@ We have seen many open-source projects for speculative decoding, but most of the

Check out [**our documentation**](https://docs.sglang.ai/SpecForge/) to get started.


## 🚀 Accelerate with SpecBundle

SpecBundle is a collection of production-grade speculative decoding models released by the SpecForge team and our industry partners. They deliver a higher acceptance rate than existing open-source checkpoints across a wide range of domains, and together with SGLang they can provide up to a 4x inference speedup. Check out our resources below:


| Item | Link |
| --- | --- |
| 📝 Documentation | [Link](https://docs.sglang.io/SpecForge/community_resources/specbundle.html) |
| 📊 Performance Dashboard | [Link](https://docs.sglang.io/SpecForge/SpecBundle/index.html) |
| 🤗 Hugging Face Collection | [Link](https://huggingface.co/collections/lmsys/specbundle) |
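
For a quick start, serving a draft checkpoint follows the same pattern as the benchmark commands in `benchmarks/README.md`. Below is a minimal sketch with illustrative model paths; the draft checkpoint shown is the Eagle3 model used in the benchmark examples, not necessarily a SpecBundle release, so substitute the pair that matches your target model.

```shell
# Sketch: serve a target model with an EAGLE3 draft model via SGLang.
# Replace both paths with your target model and its matching draft checkpoint.
python3 -m sglang.launch_server \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --speculative-algorithm EAGLE3 \
    --speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
    --speculative-num-steps 3 \
    --speculative-eagle-topk 1 \
    --speculative-num-draft-tokens 4 \
    --tp 1 \
    --trust-remote-code \
    --host 0.0.0.0 \
    --port 30000
```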


## 🎉 News

- [2025-12] 🎉 Released SpecBundle (phase 1) and SpecForge v0.2. Check out our blog at [LMSYS.org](https://lmsys.org/blog/2025-12-23-spec-bundle-phase-1/)
- [2025-12] 🔔 Released the roadmap for 2026 Q1.
- [2025-08] 🔔 SpecForge is listed as a [flagship project](https://lmsys.org/about/) in LMSYS. Congratulations to the SpecForge team!
- [2025-08] 🔥 SpecForge powered the Eagle3 draft model for GPT-OSS. Check out the blog at [LMSYS.org](https://lmsys.org/blog/2025-08-27-gpt-oss/)
- [2025-07] 🔥 SpecForge is released together with Llama4-Eagle3 checkpoints. Check out our blog at [LMSYS.org](https://lmsys.org/blog/2025-07-25-spec-forge/)
1 change: 1 addition & 0 deletions benchmarks/.gitignore
@@ -1 +1,2 @@
*.jsonl
results/
83 changes: 39 additions & 44 deletions benchmarks/README.md
@@ -1,72 +1,67 @@
# Benchmarking for Speculative Decoding

## Setup
## Overview

You can create a new environment and install SGLang with the following command:
We provide a unified script to benchmark the performance of speculative decoding with the EAGLE3 algorithm on multiple datasets. Follow the steps below to run the benchmarks.

```bash
# create virtual env
uv venv sglang -p 3.11
source sglang/bin/activate
## Run Benchmarks

# install sglang
uv pip install "sglang[all]>=0.4.9.post2"
```
### Launch SGLang and Benchmarker Concurrently

You can serve your trained model with SGLang with the following command by replacing the `<target-model-path>` and `<draft-model-path>` with the actual path to the target model and draft model.
`bench_eagle3.py` launches an SGLang server process and a benchmarking process concurrently, so you don't have to start the SGLang server yourself; the script handles the server launch for each speculative decoding configuration. Some important arguments are:
- `--model-path`: the path to the target model.
- `--speculative-draft-model-path`: the path to the draft model.
- `--port`: the port to launch the SGLang server.
- `--trust-remote-code`: trust remote code when loading the model.
- `--mem-fraction-static`: the fraction of GPU memory reserved for static allocation (model weights and KV cache).
- `--tp-size`: the tensor parallelism size.
- `--attention-backend`: the attention backend.
- `--config-list`: the list of speculative decoding configurations to test; each entry has the format `<batch-size>,<num-steps>,<topk>,<num-draft-tokens>`.
- `--benchmark-list`: the list of benchmarks to run; each entry has the format `<benchmark-name>:<num-prompts>:<subset>`.
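
For example, in the command below, `--config-list 1,0,0,0 1,3,1,4` requests two runs: one we read as a plain-decoding baseline (`1,0,0,0`) and one EAGLE3 run with batch size 1, 3 draft steps, top-k 1, and 4 draft tokens; `--benchmark-list mtbench gsm8k:5 ceval:5:accountant` runs MT-Bench with its default prompt count, 5 GSM8K prompts, and 5 prompts from the `accountant` subset of C-Eval. These readings follow the formats above and are not spelled out explicitly in this section.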

```bash
python3 -m sglang.launch_server \
--model <target-model-path> \
--speculative-algorithm EAGLE3 \
--speculative-draft-model-path <draft-model-path> \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.75 \
--cuda-graph-max-bs 2 \
--tp 1 \
--context-length 8192 \
--trust-remote-code \
--host 0.0.0.0 \
```shell
python3 bench_eagle3.py \
--model-path meta-llama/Llama-3.1-8B-Instruct \
--speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
--port 30000 \
--trust-remote-code \
--mem-fraction-static 0.8 \
--tp-size 1 \
--attention-backend fa3 \
--config-list 1,0,0,0 1,3,1,4 \
--benchmark-list mtbench gsm8k:5 ceval:5:accountant \
--dtype bfloat16
```

## Run Benchmarks
### Launch Benchmarker Independently

You first need to start the SGLang server:
If you prefer to launch the SGLang server yourself, start it with the following command and then run the benchmarker against it.

```bash
```shell
# you can launch a server
python3 -m sglang.launch_server \
--model <target-model-path> \
--model meta-llama/Llama-3.1-8B-Instruct \
--speculative-algorithm EAGLE3 \
--speculative-draft-model-path <draft-model-path> \
--speculative-draft-model-path lmsys/sglang-EAGLE3-LLaMA3.1-Instruct-8B \
--speculative-num-steps 3 \
--speculative-eagle-topk 1 \
--speculative-num-draft-tokens 4 \
--mem-fraction-static 0.75 \
--cuda-graph-max-bs 2 \
--tp 8 \
--context-length 8192 \
--cuda-graph-max-bs 1 \
--tp 1 \
--trust-remote-code \
--host 0.0.0.0 \
--port 30000 \
--dtype bfloat16
```

Then you can run the benchmarks:
Then you can start benchmarking. Use the same host and port as the running SGLang server, and pass `--skip-launch-server` so the script does not launch another server.

```bash
# GSM8K
python run_gsm8k.py

# MATH-500
python run_math500.py

# MTBench
python run_mtbench.py

# HumanEval
python run_humaneval.py
python bench_eagle3.py \
--model-path meta-llama/Llama-3.1-8B-Instruct \
--port 30000 \
--config-list 1,3,1,4 \
--benchmark-list mtbench:5 ceval:5:accountant gsm8k:5 humaneval:5 math500:5 mtbench:5 aime:1 \
--skip-launch-server
```