|
1 | 1 | import math |
| 2 | +import warnings |
2 | 3 | from typing import List, Optional, Tuple |
3 | 4 |
|
4 | 5 | import torch |
|
23 | 24 | from ...distributed import get_sp_ring_group, get_sp_ulysses_group |
24 | 25 | from .base import Eagle3DraftModel |
25 | 26 |
|
| 27 | +try: |
| 28 | + from flash_attn import flash_attn_func |
| 29 | +except ImportError: |
| 30 | + warnings.warn( |
| 31 | + "flash_attn is not found, please install flash_attn if you want to use the flash attention backend" |
| 32 | + ) |
| 33 | + flash_attn_func = None |
| 34 | + |
26 | 35 |
|
27 | 36 | # Copied from transformers.models.bart.modeling_bart._make_causal_mask |
28 | 37 | def _make_causal_mask( |
@@ -94,12 +103,12 @@ def rotate_half(x): |
94 | 103 |
|
95 | 104 |
|
96 | 105 | @torch.compile(dynamic=True) |
97 | | -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): |
| 106 | +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): |
98 | 107 | # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. |
99 | 108 | cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] |
100 | 109 | sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] |
101 | | - cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] |
102 | | - sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] |
| 110 | + cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] or [bs, seq_len, 1, dim] |
| 111 | + sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] or [bs, seq_len, 1, dim] |
103 | 112 | q_embed = (q * cos) + (rotate_half(q) * sin) |
104 | 113 | k_embed = (k * cos) + (rotate_half(k) * sin) |
105 | 114 | return q_embed, k_embed |
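
Context for the `unsqueeze_dim` change above: `flash_attn_func` consumes tensors in `(bsz, seq_len, num_heads, head_dim)` layout, so the cos/sin tables have to be unsqueezed at dim 2 instead of dim 1 to broadcast correctly. A minimal shape check, using made-up sizes (the names below are illustrative only, not from this file):

```python
import torch

# Hypothetical sizes chosen only to illustrate broadcasting; not taken from this diff.
bsz, seq_len, num_heads, head_dim = 2, 8, 4, 16
position_ids = torch.arange(seq_len).unsqueeze(0).expand(bsz, -1)

cos = torch.randn(1, 1, seq_len, head_dim)  # rotary table as produced by the embedding
cos_flat = cos.squeeze(1).squeeze(0)        # [seq_len, head_dim]

# sdpa layout: q is [bsz, num_heads, seq_len, head_dim] -> unsqueeze at dim 1
cos_sdpa = cos_flat[position_ids].unsqueeze(1)  # [bsz, 1, seq_len, head_dim]
# flash-attn layout: q is [bsz, seq_len, num_heads, head_dim] -> unsqueeze at dim 2
cos_fa = cos_flat[position_ids].unsqueeze(2)    # [bsz, seq_len, 1, head_dim]

q_sdpa = torch.randn(bsz, num_heads, seq_len, head_dim)
q_fa = q_sdpa.transpose(1, 2)
assert (q_sdpa * cos_sdpa).shape == q_sdpa.shape
assert (q_fa * cos_fa).shape == q_fa.shape
```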
@@ -1170,6 +1179,120 @@ def forward( |
1170 | 1179 | return attn_output |
1171 | 1180 |
|
1172 | 1181 |
|
| 1182 | +class LlamaFlashAttention(LlamaAttention): |
| 1183 | + """ |
| 1184 | + Attention layer implemented with flash attention. The signature is kept consistent with LlamaAttention, |
| 1185 | + but only the following arguments are used: |
| 1186 | + - hidden_states: input hidden states |
| 1187 | + - position_ids: position ids |
| 1188 | + - cache_hidden: manual cache used for storing past key and value states |
| 1189 | + """ |
| 1190 | + |
| 1191 | + def forward( |
| 1192 | + self, |
| 1193 | + hidden_states: torch.Tensor, |
| 1194 | + cache_hidden: Optional[List[torch.Tensor]] = None, |
| 1195 | + attention_mask: Optional[torch.Tensor] = None, |
| 1196 | + position_ids: Optional[torch.LongTensor] = None, |
| 1197 | + past_key_values: Optional[Cache] = None, |
| 1198 | + output_attentions: bool = False, |
| 1199 | + use_cache: bool = False, |
| 1200 | + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: |
| 1201 | + bsz, q_len, _ = hidden_states.size() |
| 1202 | + |
| 1203 | + query_states = self.q_proj(hidden_states) |
| 1204 | + key_states = self.k_proj(hidden_states) |
| 1205 | + value_states = self.v_proj(hidden_states) |
| 1206 | + |
| 1207 | + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) |
| 1208 | + key_states = key_states.view( |
| 1209 | + bsz, q_len, self.num_key_value_heads, self.head_dim |
| 1210 | + ) |
| 1211 | + value_states = value_states.view( |
| 1212 | + bsz, q_len, self.num_key_value_heads, self.head_dim |
| 1213 | + ) |
| 1214 | + |
| 1215 | + lck = 0 if cache_hidden is None else len(cache_hidden[0]) |
| 1216 | + if isinstance(self.rotary_emb, LlamaMutiRotaryEmbedding): |
| 1217 | + cos, sin = self.rotary_emb(query_states, position_ids + lck) |
| 1218 | + cos, sin = cos.to(query_states.device), sin.to(query_states.device) |
| 1219 | + query_states, key_states = apply_multimodal_rotary_pos_emb( |
| 1220 | + query_states, |
| 1221 | + key_states, |
| 1222 | + cos, |
| 1223 | + sin, |
| 1224 | + self.config.rope_scaling["mrope_section"], |
| 1225 | + unsqueeze_dim=2, |
| 1226 | + ) |
| 1227 | + else: |
| 1228 | + cos, sin = self.rotary_emb(query_states, seq_len=q_len + lck) |
| 1229 | + cos, sin = cos.to(query_states.device), sin.to(query_states.device) |
| 1230 | + query_states, key_states = apply_rotary_pos_emb( |
| 1231 | + query_states, key_states, cos, sin, position_ids + lck, unsqueeze_dim=2 |
| 1232 | + ) |
| 1233 | + |
| 1234 | + if cache_hidden is not None: |
| 1235 | + cache_hidden[0] = cache_hidden[0] + [key_states] |
| 1236 | + cache_hidden[1] = cache_hidden[1] + [value_states] |
| 1237 | + |
| 1238 | + cache_k = cache_hidden[0] |
| 1239 | + cache_v = cache_hidden[1] |
| 1240 | + else: |
| 1241 | + cache_k = [key_states] |
| 1242 | + cache_v = [value_states] |
| 1243 | + |
| 1244 | + k0 = cache_k[0] |
| 1245 | + v0 = cache_v[0] |
| 1246 | + |
| 1247 | + assert ( |
| 1248 | + flash_attn_func is not None |
| 1249 | + ), "flash_attn is not installed, please install flash_attn if you want to use the flash attention backend" |
| 1250 | + attn_output, lse, _ = flash_attn_func( |
| 1251 | + query_states, |
| 1252 | + k0, |
| 1253 | + v0, |
| 1254 | + dropout_p=0.0, |
| 1255 | + softmax_scale=1.0 / math.sqrt(self.head_dim), |
| 1256 | + causal=True, |
| 1257 | + return_attn_probs=True, |
| 1258 | + ) |
| 1259 | + lse = lse.transpose(1, 2) # [bsz, num_heads, q_len] -> [bsz, q_len, num_heads] |
| 1260 | + |
| 1261 | + lck = len(cache_k) |
| 1262 | + if lck > 1: # merge the flash_attn output with attention over the extra cached key/value states |
| 1263 | + q_shape_expanded = ( |
| 1264 | + bsz, |
| 1265 | + q_len, |
| 1266 | + self.num_key_value_heads, |
| 1267 | + self.num_key_value_groups, |
| 1268 | + self.head_dim, |
| 1269 | + ) |
| 1270 | + attn_outputs = [attn_output.view(q_shape_expanded)] |
| 1271 | + lses = [lse.view(q_shape_expanded[:-1])] |
| 1272 | + |
| 1273 | + for i in range(1, lck): |
| 1274 | + ki = cache_k[i].unsqueeze(-2) |
| 1275 | + qi = query_states.view(q_shape_expanded) |
| 1276 | + vi = cache_v[i].unsqueeze(-2) |
| 1277 | + |
| 1278 | + attn_outputs.append(vi) # one key/value per query position, so the partial output is just v_i |
| 1279 | + lses.append((qi * ki).sum(-1) / math.sqrt(self.head_dim)) # and its log-sum-exp is the single scaled logit |
| 1280 | + |
| 1281 | + lse = torch.logsumexp(torch.stack(lses, dim=-1), dim=-1) |
| 1282 | + attn_output = sum( |
| 1283 | + attn_outputi * torch.exp(lsei - lse).unsqueeze(-1) |
| 1284 | + for attn_outputi, lsei in zip(attn_outputs, lses) |
| 1285 | + ) |
| 1286 | + # lse is fp32, downcast attn_output back |
| 1287 | + attn_output = attn_output.to(self.o_proj.weight.dtype) |
| 1288 | + |
| 1289 | + attn_output = attn_output.reshape(bsz, q_len, self.head_dim * self.num_heads) |
| 1290 | + |
| 1291 | + attn_output = self.o_proj(attn_output) |
| 1292 | + |
| 1293 | + return attn_output |
| 1294 | + |
| 1295 | + |
1173 | 1296 | class LlamaMLP(nn.Module): |
1174 | 1297 | def __init__(self, config): |
1175 | 1298 | super().__init__() |
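
The multi-step merge in `LlamaFlashAttention` uses the standard log-sum-exp trick for combining partial softmax-attention results: given each block's partial output `o_i` and log-sum-exp `l_i`, the exact result is `sum_i o_i * exp(l_i - logsumexp(l))`. A small self-contained sketch (single query, two key blocks, toy sizes; none of the names come from this file) that checks the merge against full attention:

```python
import math
import torch

torch.manual_seed(0)
head_dim = 8
q = torch.randn(1, head_dim)
k = torch.randn(6, head_dim)
v = torch.randn(6, head_dim)

def partial_attn(q, k, v):
    # Partial softmax attention over one key block, returning the output and its
    # log-sum-exp, which is what flash_attn_func exposes via return_attn_probs=True.
    logits = q @ k.T / math.sqrt(head_dim)   # [1, n_keys]
    lse = torch.logsumexp(logits, dim=-1)    # [1]
    out = torch.softmax(logits, dim=-1) @ v  # [1, head_dim]
    return out, lse

# Attend to two key blocks separately, then merge with log-sum-exp weights.
o1, l1 = partial_attn(q, k[:4], v[:4])
o2, l2 = partial_attn(q, k[4:], v[4:])
lse = torch.logsumexp(torch.stack([l1, l2], dim=-1), dim=-1)
merged = o1 * torch.exp(l1 - lse).unsqueeze(-1) + o2 * torch.exp(l2 - lse).unsqueeze(-1)

# The merged result matches attention over all keys at once.
full, _ = partial_attn(q, k, v)
assert torch.allclose(merged, full, atol=1e-6)
```

This is the same rule the class applies: the flash_attn call supplies the (output, lse) pair for the dense causal prefix, while each extra cache entry supplies a one-key partial result.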
@@ -1245,6 +1368,8 @@ def __init__(self, config, attention_backend: str = "sdpa"): |
1245 | 1368 | elif attention_backend == "flex_attention": |
1246 | 1369 | print_with_rank("Using flex attention on draft model training!") |
1247 | 1370 | self.self_attn = LlamaFlexAttention(config=config) |
| 1371 | + elif attention_backend == "fa": |
| 1372 | + self.self_attn = LlamaFlashAttention(config=config) |
1248 | 1373 | else: |
1249 | 1374 | raise ValueError(f"Unknown attention backend {attention_backend}") |
1250 | 1375 |
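
Selecting the new backend is just a matter of passing "fa" to the layer constructor. A minimal sketch of guarding that choice on flash_attn availability; the decoder-layer class name is not shown in this hunk, so the constructor call is only indicated in a comment:

```python
# Minimal sketch: pick the attention backend based on flash_attn availability.
# The decoder-layer class name below is hypothetical; only its __init__ signature
# (config, attention_backend: str = "sdpa") appears in this diff.
try:
    from flash_attn import flash_attn_func  # noqa: F401
    attention_backend = "fa"
except ImportError:
    attention_backend = "sdpa"  # fall back to the default SDPA backend

# layer = DecoderLayer(config, attention_backend=attention_backend)
```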
|
|