vectorch-ai
diff --git a/src/kernels/attention/collective/sm120_collective_fmha_mainloop_ws.cuh b/src/kernels/attention/collective/sm120_collective_fmha_mainloop_ws.cuh
Lines changed: 27 additions & 4 deletions
diff --git a/src/kernels/attention/collective/sm120_collective_load_cpasync_ws.cuh b/src/kernels/attention/collective/sm120_collective_load_cpasync_ws.cuh
Lines changed: 2 additions & 10 deletions
diff --git a/src/kernels/attention/collective/sm120_collective_load_tma_ws.cuh b/src/kernels/attention/collective/sm120_collective_load_tma_ws.cuh
Lines changed: 0 additions & 12 deletions
@@ -179,16 +179,35 @@ struct Sm120CollectiveFMhaWs {
 
   // load Q/K/V from gmem to smem
   template <class Block>
-  CUTE_DEVICE void load(const Block& block,
+  CUTE_DEVICE void load(const Params& params,  // read here only for params.sliding_window
+                        const Block& block,
                         int tidx,
                         PipelineQ& q_pipeline,
                         typename PipelineQ::PipelineState& q_state,
                         PipelineKV& kv_pipeline,
                         typename PipelineKV::PipelineState& kv_state,
                         TensorStorage& ss) {
+    if (!block.is_valid()) {
+      // skip invalid block: return before anything is handed to the Load functor
+      return;
+    }
+    const auto [n_block_min, n_block_max] =
+        block.template get_kv_blocks<LOCAL>(params.sliding_window);  // KV block range for this block
+    if (n_block_min >= n_block_max) {
+      return;  // no kv blocks to process (n_block_min >= n_block_max)
+    }
+
     // forward to the load implementation
     Load load;
-    load(block, tidx, q_pipeline, q_state, kv_pipeline, kv_state, ss);
+    load(block,  // NOTE(review): Load::operator() must accept the KV range after tidx; confirm every Load implementation (cp.async and TMA) was updated
+         tidx,
+         n_block_min,  // KV block range computed above, now supplied by the caller
+         n_block_max,
+         q_pipeline,
+         q_state,
+         kv_pipeline,
+         kv_state,
+         ss);
   }
 
   template <class Block, class FrgTensor, class PipelineQ, class PipelineKV>
@@ -212,12 +231,16 @@ struct Sm120CollectiveFMhaWs {
       return;
     }
 
-    const auto [n_block_min, n_block_max] = block.get_kv_blocks();
+    const auto [n_block_min, n_block_max] =
+        block.template get_kv_blocks<LOCAL>(params.sliding_window);
     if (n_block_min >= n_block_max) {
       return;  // no kv blocks to process
     }
 
-    const auto [batch_idx, m_block_idx, kv_head_idx] = block.get_block_coord();
+    // (m_block_idx, ((kv_head_idx, _0), batch_idx))
+    const auto& block_coord = block.get_block_coord();
+    const int m_block_idx = get<0>(block_coord);
+    const int kv_head_idx = get<1, 0, 0>(block_coord);
 
     const auto q_packed_len = block.get_packed_len();
     const auto q_len = block.get_q_len();
 
@@ -39,23 +39,15 @@ struct Sm120CollectiveLoadCpAsyncWs {
   template <class Block>
   CUTE_DEVICE void operator()(const Block& block,
                               int tidx,
+                              int n_block_min,
+                              int n_block_max,
                               PipelineQ& q_pipeline,
                               typename PipelineQ::PipelineState& q_state,
                               PipelineKV& kv_pipeline,
                               typename PipelineKV::PipelineState& kv_state,
                               TensorStorage& ss) {
     static constexpr int kStages = size<2>(SmemLayoutK{});
 
-    if (!block.is_valid()) {
-      // skip invalid block
-      return;
-    }
-
-    const auto [n_block_min, n_block_max] = block.get_kv_blocks();
-    if (n_block_min >= n_block_max) {
-      return;  // no kv blocks to process
-    }
-
     // (M, N, K)
     const auto residue_mnk = block.get_residue_mnk();
 
 
@@ -35,18 +35,6 @@ struct Sm120CollectiveLoadTmaWs {
           Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL_ZFILL<cute::uint128_t>,
                     Element>{}));
 
-  // using StrideK = ...;
-
-  // using TMA_K = decltype(make_tma_copy(
-  //       GmemTiledCopy{}, // TMA_COPY
-  //       make_tensor(static_cast<InternalElementA const*>(nullptr),
-  //       repeat_like(StrideK{}, int32_t(0)), StrideK{}),
-  //       SmemLayoutK{}(_,_,_0{})));
-
-  // Tensor tensor_k = make_tensor(ptr_k, make_layout(make_shape(M,K,L),
-  // args.stride_k)); auto tma_load_k = make_tma_copy(SM90_TMA_LOAD{},
-  // gtensor_k, SmemLayoutK{}(_,_,_0{}));
-
   // load Q using cp_async and K/V using tma
   template <class Block>
   CUTE_DEVICE void operator()(const Block& block,