
Commit 64957e8

delete gqa
1 parent b0f3cd6 commit 64957e8

File tree

3 files changed (+68, -138 lines)

docs/index.rst

Lines changed: 21 additions & 5 deletions
@@ -8,17 +8,33 @@ Installation
 
 .. code-block:: bash
 
-    pip install vortex-torch
+    git clone https://github.com/Infini-AI-Lab/vortex_torch.git
+    cd vortex_torch
+    pip install -e .
 
 Quick Example
 -------------
-
 .. code-block:: python
 
-    import vortex_torch as vt
+
+
+.. code-block:: python
 
-    model = vt.Model(...)
-    out = model.forward(...)
+    llm = sgl.Engine(model_path="Qwen/Qwen3-0.6B",
+                     disable_cuda_graph=False,
+                     page_size=16,
+                     vortex_topk_val=30,
+                     disable_overlap_schedule=True,
+                     attention_backend="flashinfer",
+                     enable_vortex_sparsity=True,
+                     vortex_page_reserved_bos=1,
+                     vortex_page_reserved_eos=1,
+                     vortex_layers_skip=list(range(1)),
+                     vortex_module_path="path/to/custom_sparse_attention.py",
+                     vortex_module_name="custom_sparse_attention",
+                     vortex_max_seq_lens=8192,
+                     mem_fraction_static=0.6
+    )
 
 API Reference
 -------------
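
A minimal usage sketch to accompany the quick example above; it assumes SGLang's offline ``Engine.generate`` / ``shutdown`` API, and the prompt and sampling parameters are illustrative only.

.. code-block:: python

    # Assumes the ``llm`` engine constructed in the quick example above.
    prompts = ["Explain paged attention in one paragraph."]
    sampling_params = {"temperature": 0.0, "max_new_tokens": 128}

    outputs = llm.generate(prompts, sampling_params)
    for out in outputs:
        print(out["text"])

    llm.shutdown()  # release GPU memory held by the engine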

vortex_torch/flow/algorithms.py

Lines changed: 31 additions & 118 deletions
@@ -43,10 +43,14 @@ class BlockSparseAttention(vFlow):
 
     .. math::
 
-        o \in \mathbb{R}^{S_{\mathrm{sparse}} \times 1 \times 1},
+        o \in \mathbb{R}^{S \times 1 \times 1},
 
-    where :math:`S_{\mathrm{sparse}}` is a packed sparse page axis
-    as described in :class:`vFlow`.
+    Here :math:`S` is the leading page axis. Internally it is a packed
+    axis (often denoted :math:`S_{\mathrm{pack}}`), obtained by
+    concatenating the pages from all requests. As a user, you can simply
+    think of :math:`S` as "the number of pages for this request"; the
+    vFlow kernels and :class:`ContextBase` will take care of mapping
+    between per-request page counts and the packed layout automatically.
 
     Cache layout
     ------------
@@ -68,7 +72,7 @@ class BlockSparseAttention(vFlow):
     .. math::
 
        \text{cache["centroids"]} \sim
-       \mathbb{R}^{S_{\mathrm{pack}} \times 1 \times D},
+       \mathbb{R}^{S \times 1 \times D},
 
     - In :meth:`forward_cache` (batch-major view):
 
@@ -127,8 +131,7 @@ def forward_indexer(
            - ``cache["k"]`` and ``cache["v"]`` are page-packed key/value
              tensors,
            - ``cache["centroids"]`` is interpreted as
-             ``[S_pack, 1, D]`` (page-packed centroids), with
-             :math:`S_{\mathrm{pack}} = \sum_i S_i`.
+             ``[S, 1, D]`` (page-packed centroids).
 
        ctx : ContextBase
            Runtime context carrying page layout, top-k configuration
@@ -214,7 +217,7 @@ def create_cache(self, page_size: int, head_dim: int):
        - ``"centroids"`` with inner shape ``(1, head_dim)``, which
          becomes
 
-          - ``[S_pack, 1, head_dim]`` in :meth:`forward_indexer`,
+          - ``[S, 1, head_dim]`` in :meth:`forward_indexer`,
          - ``[B, 1, head_dim]`` in :meth:`forward_cache`.
        """
        return {
@@ -235,9 +238,15 @@ class GQABlockSparseAttention(vFlow):
    - Centroids cache ``cache["centroids"]`` has inner shape
      ``(1, head_dim)`` and is viewed as:
 
-      - ``[S_pack, 1, D]`` in :meth:`forward_indexer`,
+      - ``[S, 1, D]`` in :meth:`forward_indexer`,
      - ``[B, 1, D]`` in :meth:`forward_cache`.
-
+
+    Here :math:`S` is the leading page axis. Internally it is a packed
+    axis (often denoted :math:`S_{\mathrm{pack}}`), obtained by
+    concatenating the pages from all requests. As a user, you can simply
+    think of :math:`S` as "the number of pages for this request"; the
+    vFlow kernels and :class:`ContextBase` will take care of mapping
+    between per-request page counts and the packed layout automatically.
+
    For a design similar in spirit to grouped-query block sparsity, see
    the GQA sparse attention formulation in:
 
@@ -270,8 +279,8 @@ def forward_indexer(
        1. Apply :class:`GeMM` between queries and centroids:
 
           - ``q``: ``[B, H_q, D]``
-          - ``cache["centroids"]`` (indexer view): ``[S_pack, 1, D]``
-          - ``score``: ``[S_pack, H_q, 1]`` (logical ``[S, Ny, Nx]``)
+          - ``cache["centroids"]`` (indexer view): ``[S, 1, D]``
+          - ``score``: ``[S, H_q, 1]`` (logical ``[S, Ny, Nx]``)
 
        2. Apply in-place softmax over the leading (page) axis with a
           scaling factor ``scale``:
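
As a rough mental model of the two steps in the hunk above (outside the actual vFlow runtime), the centroid scoring can be sketched in plain PyTorch for a single request with dense, unpacked pages; the sizes, ``scale``, and top-k value below are illustrative only.

.. code-block:: python

    import torch

    B, H_q, D, num_pages, scale, topk = 1, 8, 128, 64, 0.09, 30

    q = torch.randn(B, H_q, D)                # queries, [B, H_q, D]
    centroids = torch.randn(num_pages, 1, D)  # per-page mean keys, [S, 1, D]

    # GeMM between queries and centroids -> score [S, H_q, 1]
    score = torch.einsum("bhd,sxd->shx", q, centroids)
    # softmax over the leading (page) axis with a scaling factor
    score = torch.softmax(score * scale, dim=0)
    # aggregate over query heads -> one scalar per page
    aggr = score.max(dim=1).values.squeeze(-1)        # [S]
    # top-k pages stand in for the sparse indices written to ``o``
    page_idx = aggr.topk(k=min(topk, num_pages)).indices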
@@ -352,12 +361,19 @@ class GQAQuestSparseAttention(vFlow):
    - ``cache["max"]`` and ``cache["min"]``: ``(1, head_dim)``
      → viewed as
 
-      - ``[S_pack, 1, D]`` in :meth:`forward_indexer`,
+      - ``[S, 1, D]`` in :meth:`forward_indexer`,
      - ``[B, 1, D]`` in :meth:`forward_cache`.
 
    - ``cache["k"]``: standard key cache with inner shape
      ``(page_size, head_dim)``.
 
+    Here :math:`S` is the leading page axis. Internally it is a packed
+    axis (often denoted :math:`S_{\mathrm{pack}}`), obtained by
+    concatenating the pages from all requests. As a user, you can simply
+    think of :math:`S` as "the number of pages for this request"; the
+    vFlow kernels and :class:`ContextBase` will take care of mapping
+    between per-request page counts and the packed layout automatically.
+
    Routing intuition
    -----------------
    For each query and page envelope:
@@ -401,15 +417,15 @@ def forward_indexer(
        Let:
 
        - ``q``: ``[B, H_q, D]``
-       - ``cache["max"]``: ``[S_pack, 1, D]``
-       - ``cache["min"]``: ``[S_pack, 1, D]``
+       - ``cache["max"]``: ``[S, 1, D]``
+       - ``cache["min"]``: ``[S, 1, D]``
 
        Steps:
 
        1. ``s_max = q * max_envelope``
        2. ``s_min = q * min_envelope``
        3. ``s = max(s_max, s_min)`` (elementwise)
-       4. ``score = sum(s, dim=D)`` → ``[S_pack, H_q, 1]``
+       4. ``score = sum(s, dim=D)`` → ``[S, H_q, 1]``
        5. ``aggr_score = max(score, dim=H_q)`` → per-page scalar
        6. :class:`topK` converts ``aggr_score`` into sparse page
           indices ``o`` of shape ``[S_sparse, 1, 1]``.
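
The Quest-style steps listed above can likewise be sketched in plain PyTorch (again outside the vFlow runtime) for one request with dense, unpacked pages; all sizes and the top-k value are illustrative.

.. code-block:: python

    import torch

    B, H_q, D, num_pages, topk = 1, 8, 128, 64, 30

    q = torch.randn(B, H_q, D)               # queries
    page_max = torch.randn(num_pages, 1, D)  # per-page, per-dim key maxima
    page_min = torch.randn(num_pages, 1, D)  # per-page, per-dim key minima

    # Steps 1-3: products with both envelopes, keep the larger bound.
    # With B == 1 we drop the batch axis so broadcasting gives [S, H_q, D].
    s_max = page_max * q[0]
    s_min = page_min * q[0]
    s = torch.maximum(s_max, s_min)

    # Step 4: sum over the feature dim -> per-page, per-head score [S, H_q]
    score = s.sum(dim=-1)
    # Step 5: aggregate over query heads -> one scalar per page [S]
    aggr_score = score.max(dim=-1).values
    # Step 6: top-k page indices stand in for the sparse output ``o``
    page_idx = aggr_score.topk(k=min(topk, num_pages)).indices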
@@ -468,106 +484,3 @@ def create_cache(self, page_size: int, head_dim: int):
             "max": (1, head_dim),
             "min": (1, head_dim),
         }
-
-
-
-# Generated by GPT5.1
-@register("gqa_dynamic_hybrid_sparse_attention")
-class GQADynamicHybridSparseAttention(vFlow):
-    """
-    Dynamic hybrid sparse attention:
-    - Maintains mean, max, and min statistics per block.
-    - Uses a block-sparse (centroid-based) score path.
-    - Uses a Quest-style (max/min) score path.
-    - Combines them via element-wise max as a dynamic gating signal.
-    """
-
-    def __init__(self):
-        super().__init__()
-
-        # ----- indexer ops -----
-        # Block-style scoring
-        self.gemm = GeMM()
-        self.softmax = Softmax(dim=0, scale=0.09)
-        self.max_over_heads = Max(dim=2)  # same as GQABlockSparseAttention
-
-        # Quest-style scoring
-        self.mul_max = Multiply()
-        self.mul_min = Multiply()
-        self.max_elementwise = Maximum()
-        self.sum_over_dim = Sum(dim=2)  # same as GQAQuestSparseAttention
-        self.max_over_queries = Max(dim=1)
-
-        # Combine block + quest scores
-        self.merge_scores = Maximum()  # element-wise max between the two scores
-
-        # Final selection
-        self.output_func = topK()
-
-        # ----- cache ops -----
-        self.reduction_mean = CMean(dim=1)  # centroids
-        self.reduction_max = CMax(dim=1)    # per-dim max
-        self.reduction_min = CMin(dim=1)    # per-dim min
-
-    def forward_indexer(self, q, o, cache: Dict[str, torch.Tensor], ctx: ContextBase):
-        """
-        q: query tensor (GQA-packed)
-        o: indexer output tensor (indices / scores buffer for topK)
-        cache: contains "centroids", "max", "min"
-        """
-
-        # ---- 1. Block-style centroid scoring ----
-        # score_block: [*, *, num_blocks] (same shape as in GQABlockSparseAttention)
-        score_block = self.gemm(q, cache["centroids"], ctx=ctx)
-        self.softmax(score_block, ctx=ctx)
-        # Aggregate over heads → [*, num_blocks]
-        aggr_block = self.max_over_heads(score_block, ctx=ctx)
-
-        # ---- 2. Quest-style max/min gating ----
-        # Element-wise products with cached max/min stats
-        s_max = self.mul_max(q, cache["max"], ctx=ctx)
-        s_min = self.mul_min(q, cache["min"], ctx=ctx)
-
-        # Take the element-wise max between the two projections
-        s = self.max_elementwise(s_max, s_min, ctx=ctx)
-
-        # Sum over feature dimension → [num_queries, num_heads, num_blocks]
-        score_quest = self.sum_over_dim(s, ctx=ctx)
-
-        # Aggregate over queries → [num_heads, num_blocks] or [*, num_blocks]
-        aggr_quest = self.max_over_queries(score_quest, ctx=ctx)
-
-        # ---- 3. Dynamic merge ----
-        # For each block, take whichever score (block vs quest) is stronger.
-        # This yields a per-block dynamic gate.
-        combined_score = self.merge_scores(aggr_block, aggr_quest, ctx=ctx)
-
-        # ---- 4. Top-K block selection ----
-        self.output_func(combined_score, o, ctx=ctx)
-
-    def forward_cache(self, cache: Dict[str, torch.Tensor], loc: torch.Tensor, ctx: ContextBase):
-        """
-        cache["k"]: full key buffer for the page
-        loc: index of the page / block being updated
-        """
-
-        # Update mean (centroids)
-        self.reduction_mean(cache["k"], cache["centroids"], loc=loc, ctx=ctx)
-
-        # Update per-dimension maxima and minima
-        self.reduction_max(cache["k"], cache["max"], loc=loc, ctx=ctx)
-        self.reduction_min(cache["k"], cache["min"], loc=loc, ctx=ctx)
-
-    def create_cache(self, page_size: int, head_dim: int):
-        """
-        For each block/page we maintain:
-        - centroids: mean key per dimension
-        - max: max key per dimension
-        - min: min key per dimension
-        """
-        return {
-            "centroids": (1, head_dim),
-            "max": (1, head_dim),
-            "min": (1, head_dim),
-        }
-

vortex_torch/flow/flow.py

Lines changed: 16 additions & 15 deletions
@@ -81,18 +81,20 @@ class vFlow(ABC):
       .. math::
 
          \text{cache[key]} \sim
-         \mathbb{R}^{S_{\text{pack}} \times r \times c},
+         \mathbb{R}^{S \times r \times c},
 
-      where
+
 
-      .. math::
-
-         S_{\text{pack}} = \sum_{i=0}^{B-1} S_i
-
-      is the total number of pages packed across all requests, and
       :math:`(r, c)` is the per-key inner shape declared via
       :meth:`create_cache` or implicitly for ``"k"``/``"v"``.
 
+      Here :math:`S` is the leading page axis. Internally it is a packed
+      axis (often denoted :math:`S_{\mathrm{pack}}`), obtained by
+      concatenating the pages from all requests. As a user, you can simply
+      think of :math:`S` as "the number of pages for this request"; the
+      vFlow kernels and :class:`ContextBase` will take care of mapping
+      between per-request page counts and the packed layout automatically.
+
    2. **Cache-update view (batch-major)** — used in :meth:`forward_cache`:
 
       .. math::
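
A small, self-contained illustration of the packed-page view described above (not the runtime's own code); the per-request page counts and inner shape are made up for the example.

.. code-block:: python

    import torch

    r, c = 1, 128                  # per-key inner shape from create_cache()
    pages_per_request = [3, 5, 2]  # S_0, S_1, S_2 for a batch of B = 3

    # Per-request cache slabs, each of shape [S_i, r, c]
    per_request = [torch.randn(s, r, c) for s in pages_per_request]

    # Packed indexer view: concatenate along the leading page axis,
    # so S (a.k.a. S_pack) = sum_i S_i
    packed = torch.cat(per_request, dim=0)
    assert packed.shape == (sum(pages_per_request), r, c)

    # Cumulative offsets map a packed page index back to (request, local page)
    offsets = torch.tensor([0] + pages_per_request).cumsum(0)  # [0, 3, 8, 10]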
@@ -140,7 +142,7 @@ class vFlow(ABC):
          {\text{page_size} \cdot \text{head_dim}}.
 
    This ignores the leading dimension (whether :math:`B` or
-   :math:`S_{\text{pack}}`) and compares only inner shapes to the
+   :math:`S`) and compares only inner shapes to the
    baseline ``(page_size, head_dim)``.
 
    Subclass responsibilities
@@ -149,7 +151,7 @@ class vFlow(ABC):
 
    - :meth:`forward_indexer(q, o, cache, ctx)`:
      compute sparse page indices (or routing scores) from queries,
-     using cache in the :math:`S_{\text{pack}}` view.
+     using cache in the :math:`S` view.
 
    - :meth:`forward_cache(cache, loc, ctx)`:
      update cache tensors using the :math:`B`-major view and positional
@@ -203,9 +205,8 @@ def forward_indexer(
        .. math::
 
           \text{cache[key]}
-          \sim \mathbb{R}^{S_{\text{pack}} \times r \times c},
+          \sim \mathbb{R}^{S \times r \times c},
 
-       where :math:`S_{\text{pack}} = \sum_i S_i` and
        :math:`(r, c)` are the per-key inner dimensions obtained from
        :meth:`get_cache_meta_info`.
 
@@ -219,7 +220,7 @@ def forward_indexer(
        --------
        Implementations should:
 
-       - interpret ``cache`` in the :math:`S_{\text{pack}}` view,
+       - interpret ``cache`` in the :math:`S` view,
        - use ``q`` and relevant cache tensors to score/select pages,
        - respect per-request bounds derived from ``ctx``,
        - write the resulting sparse indices (or routing representation)
@@ -291,7 +292,7 @@ def create_cache(
 
        This method **does not allocate** tensors. It only declares the
        per-key inner dimensions :math:`(r, c)`; the runtime will attach
-       the appropriate leading axis (:math:`B` or :math:`S_{\text{pack}}`)
+       the appropriate leading axis (:math:`B` or :math:`S`)
        depending on whether the cache is used in :meth:`forward_cache`
        or :meth:`forward_indexer`.
 
@@ -357,7 +358,7 @@ def get_cache_meta_info(
        Dict[str, Tuple[int, int]]
            Mapping from cache tensor names to inner shapes ``(r, c)``.
            The runtime will later prepend either a batch axis ``B`` or a
-           packed-page axis ``S_pack`` when materializing the tensors.
+           packed-page axis ``S`` when materializing the tensors.
 
        Raises
        ------
@@ -392,7 +393,7 @@ def get_token_ratio(self, page_size: int, head_dim: int) -> float:
           \frac{r_{\text{key}} \cdot c_{\text{key}}}
           {\text{page_size} \cdot \text{head_dim}}.
 
-       The leading dimension (:math:`B` or :math:`S_{\text{pack}}`) is
+       The leading dimension (:math:`B` or :math:`S`) is
        not included in this ratio on purpose; it is a per-page
        normalization.
 
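
A quick stand-alone check of the per-page token ratio above; ``page_size`` and ``head_dim`` are example values, not taken from this repository.

.. code-block:: python

    page_size, head_dim = 16, 128

    # A cache entry with inner shape (1, head_dim), such as the "centroids"
    # declared by BlockSparseAttention.create_cache, costs 1/page_size of a
    # full (page_size, head_dim) key page:
    r, c = 1, head_dim
    ratio = (r * c) / (page_size * head_dim)
    assert ratio == 1 / page_size  # 0.0625 for page_size == 16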
