 from typing import Dict
 
 from .flow import vFlow
-from ..indexer import topK, GeMV, Softmax, Max, GeMM
-from ..cache import Mean
+from ..indexer import topK, GeMV, Softmax, Max, Sum, GeMM, Maximum, Multiply
+from ..cache import Mean as CMean, Max as CMax, Min as CMin
 from ..abs import ContextBase
 from .registry import register
 
@@ -12,10 +12,12 @@ class BlockSparseAttention(vFlow):
 
     def __init__(self):
         super().__init__()
-
+        # indexer ops
         self.gemv = GeMV()
         self.output_func = topK()
-        self.reduction = Mean(dim=1)
+
+        # cache ops
+        self.reduction = CMean(dim=1)
 
     def forward_indexer(self, q, o, cache, ctx):
 
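For intuition, here is a minimal plain-PyTorch sketch of what this indexer computes. The function name, standalone-tensor framing, and shapes are assumptions for illustration, not the library's actual API:

import torch

def block_sparse_index(q, centroids, k):
    # hypothetical standalone version of forward_indexer:
    # q: (head_dim,) query; centroids: (num_pages, head_dim), the
    # per-page mean of cached keys maintained by CMean(dim=1)
    scores = centroids @ q                  # GeMV: one score per page
    return torch.topk(scores, k).indices    # indices of the k best pages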
@@ -40,12 +42,14 @@ class GQABlockSparseAttention(vFlow):
 
     def __init__(self):
         super().__init__()
-
+        # indexer ops
         self.gemm = GeMM()
         self.softmax = Softmax(dim=0, scale=0.09)
         self.max_op = Max(dim=2)
         self.output_func = topK()
-        self.reduction = Mean(dim=1)
+
+        # cache ops
+        self.reduction = CMean(dim=1)
 
     def forward_indexer(self, q, o, cache, ctx):
 
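The GQA variant scores pages with all query heads of a group at once. A rough sketch under the same caveats (the real tensors carry extra batch and page axes, which is why the flow above uses different `dim` arguments):

import torch

def gqa_block_sparse_index(q_group, centroids, k, scale=0.09):
    # q_group: (num_q_heads, head_dim), the heads sharing one KV head
    scores = centroids @ q_group.T                # GeMM: (num_pages, num_q_heads)
    probs = torch.softmax(scores * scale, dim=0)  # normalize over pages
    page_score = probs.max(dim=-1).values         # strongest head vote per page
    return torch.topk(page_score, k).indices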
@@ -64,4 +68,46 @@ def create_cache(self, page_size: int, head_dim: int):
         return {
             "centroids": (1, head_dim)
         }
-
+
+
+@register("gqa_quest_sparse_attention")
+class GQAQuestSparseAttention(vFlow):
+
+    def __init__(self):
+        super().__init__()
+
+        # indexer ops
+        self.mul_max = Multiply()
+        self.mul_min = Multiply()
+        self.maximum_op = Maximum()
+        self.sum = Sum(dim=2)
+        self.max_op = Max(dim=1)
+        self.output_func = topK()
+
+        # cache ops
+        self.reduction_max = CMax(dim=1)
+        self.reduction_min = CMin(dim=1)
+
+    def forward_indexer(self, q, o, cache, ctx):
+        # Quest-style scoring: bound q.k per page from the cached
+        # channel-wise key extrema, then keep the top-k pages
+        s_max = self.mul_max(q, cache["max"], ctx=ctx)
+        s_min = self.mul_min(q, cache["min"], ctx=ctx)
+        s = self.maximum_op(s_max, s_min, ctx=ctx)
+        score = self.sum(s, ctx=ctx)
+        aggr_score = self.max_op(score, ctx=ctx)
+        self.output_func(aggr_score, o, ctx=ctx)
+
+    def forward_cache(self, cache: Dict[str, torch.Tensor], loc: torch.Tensor, ctx: ContextBase):
+        # maintain per-page channel-wise max/min over the cached keys
+        self.reduction_max(cache["k"], cache["max"], loc=loc, ctx=ctx)
+        self.reduction_min(cache["k"], cache["min"], loc=loc, ctx=ctx)
+
+    def create_cache(self, page_size: int, head_dim: int):
+        return {
+            "max": (1, head_dim),
+            "min": (1, head_dim)
+        }
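GQAQuestSparseAttention follows the Quest recipe: for each page, cache the channel-wise max and min of its keys and use them to upper-bound q.k, since for every channel the larger of q*k_max and q*k_min bounds that channel's contribution. A minimal sketch of the score, again with assumed shapes and a hypothetical function name:

import torch

def quest_page_score(q, k_max, k_min):
    # q: (head_dim,); k_max/k_min: (num_pages, head_dim), the CMax/CMin
    # reductions over each page's keys
    s = torch.maximum(q * k_max, q * k_min)  # (num_pages, head_dim)
    return s.sum(dim=-1)                     # upper bound on q.k per page

In the flow above, Max(dim=1) then collapses this bound, presumably over the query heads of a GQA group, before topK picks the pages.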