making forward optimization work take 3 (#4864)

henrylhtsang · facebook-github-bot · commit 0322d2e3f960 · 2025-09-12T09:51:25.000-07:00
Summary: X-link: facebookresearch/FBGEMM#1887 Pull Request resolved: #4864 Reviewed By: Aya-ZIbra Differential Revision: D81809218 fbshipit-source-id: a6c26d5a8d3c10d3b49f2bf1a4cf017f92860400
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/fmha_fusion.hpp b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/fmha_fusion.hpp
@@ -84,6 +84,26 @@ struct NoMask {
     return get_trip_count(blk_coord, tile_shape, problem_size);
   }
 
+  // Inclusive start of the KV-block range that needs no masking.
+  // Always 0 here; none of the arguments are consulted.
+  template <class BlkCoord, class TileShape, class ProblemSize>
+  CUTLASS_DEVICE int get_n_block_start_unmask(
+      BlkCoord const& blk_coord,
+      TileShape const& tile_shape,
+      ProblemSize const& problem_size) {
+    return 0;
+  }
+
+  // Exclusive end of the KV-block range that needs no masking:
+  // ceil_div of problem_size mode 1 by tile_shape mode 1. blk_coord is unused.
+  template <class BlkCoord, class TileShape, class ProblemSize>
+  CUTLASS_DEVICE int get_n_block_stop_unmask(
+      BlkCoord const& blk_coord,
+      TileShape const& tile_shape,
+      ProblemSize const& problem_size) {
+    return ceil_div(get<1>(problem_size), get<1>(tile_shape));
+  }
+
   template<class AccQK, class IndexQK, class ProblemSize>
   CUTLASS_DEVICE
   void apply_mask(
@@ -140,6 +160,26 @@ struct ResidualMask : NoMask {
     return get_trip_count(blk_coord, tile_shape, problem_size);
   }
 
+  // Inclusive start of the KV-block range that needs no masking.
+  // Constant 0 for this mask; the three arguments are ignored.
+  template <class BlkCoord, class TileShape, class ProblemSize>
+  CUTLASS_DEVICE int get_n_block_start_unmask(
+      BlkCoord const& blk_coord,
+      TileShape const& tile_shape,
+      ProblemSize const& problem_size) {
+    return 0;
+  }
+
+  // Exclusive end of the KV-block range that needs no masking.
+  // Forwards all arguments to get_unmasked_trip_count() and returns its result.
+  template <class BlkCoord, class TileShape, class ProblemSize>
+  CUTLASS_DEVICE int get_n_block_stop_unmask(
+      BlkCoord const& blk_coord,
+      TileShape const& tile_shape,
+      ProblemSize const& problem_size) {
+    return get_unmasked_trip_count(blk_coord, tile_shape, problem_size);
+  }
+
   template<class AccQK, class IndexQK, class ProblemSize>
   CUTLASS_DEVICE
   void apply_mask(
@@ -293,6 +333,26 @@ struct CausalMask : NoMask {
     return get_trip_count(blk_coord, tile_shape, problem_size) - get_masked_trip_count(blk_coord, tile_shape, problem_size);
   }
 
+  // Inclusive start of the KV-block range that needs no masking.
+  // Fixed at 0 for this mask; blk_coord/tile_shape/problem_size are unused.
+  template <class BlkCoord, class TileShape, class ProblemSize>
+  CUTLASS_DEVICE int get_n_block_start_unmask(
+      BlkCoord const& blk_coord,
+      TileShape const& tile_shape,
+      ProblemSize const& problem_size) {
+    return 0;
+  }
+
+  // Exclusive end of the KV-block range that needs no masking.
+  // Same value as get_unmasked_trip_count() called with identical arguments.
+  template <class BlkCoord, class TileShape, class ProblemSize>
+  CUTLASS_DEVICE int get_n_block_stop_unmask(
+      BlkCoord const& blk_coord,
+      TileShape const& tile_shape,
+      ProblemSize const& problem_size) {
+    return get_unmasked_trip_count(blk_coord, tile_shape, problem_size);
+  }
+
   template<class AccQK, class IndexQK, class ProblemSize>
   CUTLASS_DEVICE
   void apply_mask(
@@ -456,8 +516,55 @@ struct LocalMask : NoMask {
       TileShape const& tile_shape,
       ProblemSize const& problem_size) {
 
-    // TODO: follow CausalMask to improve this
-    return 0;
+    const int n_block_start_unmask = get_n_block_start_unmask(blk_coord, tile_shape, problem_size);
+    const int n_block_stop_unmask = get_n_block_stop_unmask(blk_coord, tile_shape, problem_size);
+
+    return n_block_stop_unmask - n_block_start_unmask;
+  }
+
+  // Inclusive start of the KV-block range that the forward mainloop runs
+  // without apply_mask for this Q tile. Not clamped from above, so it is
+  // not guaranteed to be <= get_n_block_stop_unmask().
+  template<class BlkCoord, class TileShape, class ProblemSize>
+  CUTLASS_DEVICE
+  int get_n_block_start_unmask(
+      BlkCoord const& blk_coord,
+      TileShape const& tile_shape,
+      ProblemSize const& problem_size) {
+    const int kBlockM = get<0>(tile_shape);
+    const int kBlockN = get<1>(tile_shape);
+    const int m_block = get<0>(blk_coord);
+    // Added to a Q row index before comparing with K columns; 0 when IsQBegin.
+    const int offset_q = IsQBegin ? 0 : get<1>(problem_size) - get<0>(problem_size);
+    // (m_block + 1) * kBlockM: exclusive row bound of tile m_block.
+    const int m_idx_max = (m_block + 1) * kBlockM;
+    // -1 makes the row bound inclusive; the result is clamped at 0 so the
+    // ceil_div below never receives a negative index. NOTE(review): presumably
+    // the last row's leftmost in-window column -- confirm against apply_mask.
+    const int n_idx_max_left = std::max(m_idx_max + offset_q - window_size_left - 1, 0);
+    return ceil_div(n_idx_max_left, kBlockN);
+  }
+
+  // Exclusive end of the KV-block range that the forward mainloop runs
+  // without apply_mask for this Q tile. Not clamped from below, so it may
+  // come out smaller than get_n_block_start_unmask().
+  template<class BlkCoord, class TileShape, class ProblemSize>
+  CUTLASS_DEVICE
+  int get_n_block_stop_unmask(
+      BlkCoord const& blk_coord,
+      TileShape const& tile_shape,
+      ProblemSize const& problem_size) {
+    const int tile_m = get<0>(tile_shape);
+    const int tile_n = get<1>(tile_shape);
+    const int kv_len = get<1>(problem_size);
+    const int q_tile = get<0>(blk_coord);
+    int q_to_k_shift = 0;
+    if (!IsQBegin) {
+      q_to_k_shift = get<1>(problem_size) - get<0>(problem_size);
+    }
+    // +1 turns the inclusive right bound into an exclusive one; capped at kv_len.
+    const int right_end = std::min(q_tile * tile_m + q_to_k_shift + window_size_right + 1, kv_len);
+    return right_end / tile_n;
   }
 
   template<class AccQK, class IndexQK, class ProblemSize>
diff --git a/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp b/fbgemm_gpu/experimental/gen_ai/src/attention/cuda/cutlass_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp
@@ -729,11 +729,12 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
       PipelineC& pipeline_c, typename PipelineC::PipelineState& pipeline_c_producer_state,
       OrderBarrierSoftmax& order_s) {
 
-    int mask_tile_count = Mask(params.window_size_left, params.window_size_right).get_unmasked_trip_count(blk_coord, TileShape{}, problem_shape);
-
-    auto min_max = Mask(params.window_size_left, params.window_size_right).get_n_block_min_max(blk_coord, TileShape{}, problem_shape);
+    Mask mask(params.window_size_left, params.window_size_right);
+    auto min_max = mask.get_n_block_min_max(blk_coord, TileShape{}, problem_shape);
     int n_block_min = get<0>(min_max);
-    // int n_block_max = get<1>(min_max);
+    const int n_block_max = get<1>(min_max);
+    const int n_block_start_unmask = mask.get_n_block_start_unmask(blk_coord, TileShape{}, problem_shape);
+    const int n_block_stop_unmask = mask.get_n_block_stop_unmask(blk_coord, TileShape{}, problem_shape);
 
     ElementQK row_max = -INFINITY;
     ElementQK row_sum = 0;
@@ -747,35 +748,73 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized {
 
     pipeline_c.producer_acquire(pipeline_c_producer_state);
 
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (; mask_tile_count > 0; mask_tile_count -= 1) {
-      softmax_step<false /* need_apply_mask */>(
-          row_max, row_sum, stage,
-          (mask_tile_count == 1) &&
-              (Mask(params.window_size_left, params.window_size_right).get_masked_trip_count(blk_coord, TileShape{}, problem_shape) == 0),
-          blk_coord, cS, params, problem_shape,
-          pipeline_s, pipeline_s_consumer_state,
-          pipeline_c, pipeline_c_producer_state,
-          order_s
-      );
-
-      cS.data() = cS.data() + E<1>{} * get<1>(ThreadShape{}) * get<1>(TileShapeQK{});
-    }
-
-    // Masked iterations
-    mask_tile_count = Mask(params.window_size_left, params.window_size_right).get_masked_trip_count(blk_coord, TileShape{}, problem_shape);
-
-    CUTLASS_PRAGMA_NO_UNROLL
-    for (; mask_tile_count > 0; mask_tile_count -= 1) {
-      softmax_step<true /* need_apply_mask */>(
-          row_max, row_sum, stage, mask_tile_count == 1,
-          blk_coord, cS, params, problem_shape,
-          pipeline_s, pipeline_s_consumer_state,
-          pipeline_c, pipeline_c_producer_state,
-          order_s
-      );
+    // from observation, dispatch is better for the mask -> unmask -> mask pattern and when the number of tiles is small
+    if constexpr (std::is_base_of_v<cutlass::fmha::collective::LocalMask<true>, Mask>
+        || std::is_base_of_v<cutlass::fmha::collective::LocalMask<false>, Mask>) {
+      auto dispatch_bool = [](bool b, auto fn) {
+        if (b) {
+          fn(cute::true_type{});
+        }
+        else {
+          fn(cute::false_type{});
+        }
+      };
+
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; n_block_min < n_block_max; n_block_min += 1) {
+        // Apply mask only for tiles outside the attention window
+        // for local mask, we don't guarantee n_block_start_unmask <= n_block_stop_unmask <= n_block_max
+        bool need_apply_mask = warp_uniform(n_block_min < n_block_start_unmask || n_block_min >= n_block_stop_unmask);
+
+        dispatch_bool(need_apply_mask, [&](auto is_masked_tile) {
+          if constexpr (decltype(is_masked_tile)::value) {
+            softmax_step<true /* need_apply_mask */>(
+                row_max, row_sum, stage, (n_block_min == n_block_max - 1),
+                blk_coord, cS, params, problem_shape,
+                pipeline_s, pipeline_s_consumer_state,
+                pipeline_c, pipeline_c_producer_state,
+                order_s
+            );
+          } else {
+            softmax_step<false /* need_apply_mask */>(
+                row_max, row_sum, stage, (n_block_min == n_block_max - 1),
+                blk_coord, cS, params, problem_shape,
+                pipeline_s, pipeline_s_consumer_state,
+                pipeline_c, pipeline_c_producer_state,
+                order_s
+            );
+          }
+        });
+
+        cS.data() = cS.data() + E<1>{} * get<1>(ThreadShape{}) * get<1>(TileShapeQK{});
+      }
+    } else {
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; n_block_min < n_block_stop_unmask; n_block_min += 1) {
+        softmax_step<false /* need_apply_mask */>(
+            row_max, row_sum, stage,
+            (n_block_min == n_block_max - 1),
+            blk_coord, cS, params, problem_shape,
+            pipeline_s, pipeline_s_consumer_state,
+            pipeline_c, pipeline_c_producer_state,
+            order_s
+        );
+
+        cS.data() = cS.data() + E<1>{} * get<1>(ThreadShape{}) * get<1>(TileShapeQK{});
+      }
 
-      cS.data() = cS.data() + E<1>{} * get<1>(ThreadShape{}) * get<1>(TileShapeQK{});
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; n_block_min < n_block_max; n_block_min += 1) {
+        softmax_step<true /* need_apply_mask */>(
+            row_max, row_sum, stage, n_block_min == n_block_max - 1,
+            blk_coord, cS, params, problem_shape,
+            pipeline_s, pipeline_s_consumer_state,
+            pipeline_c, pipeline_c_producer_state,
+            order_s
+        );
+
+        cS.data() = cS.data() + E<1>{} * get<1>(ThreadShape{}) * get<1>(TileShapeQK{});
+      }
     }
 
     pipeline_c.producer_commit(pipeline_c_producer_state);