Commit 477dc3b

Flash Attention: fix all-masked rows (#1070)

1 parent 1a885d6 commit 477dc3b

File tree: 12 files changed, +146 −48 lines changed

crates/cubecl-attention/src/components/global/simple/attention.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -171,7 +171,7 @@ impl<
                 mask.view(layout),
                 step,
                 seq_kv_shape,
-                config.mask_gmem_config.view_direction,
+                config.mask_gmem_config,
             )
         }
         CubeOption::None => MaskReader::new_logical(stage_q_offset + partition_q_offset, step),
```

crates/cubecl-attention/src/components/global/simple/reader/mask.rs

Lines changed: 29 additions & 16 deletions
```diff
@@ -2,8 +2,7 @@ use crate::components::tile::TileAttentionConfig;
 use crate::components::{AttentionTileSize, attention_types::*};
 use cubecl_core as cubecl;
 use cubecl_core::prelude::*;
-use cubecl_matmul::components::MatrixLayout;
-use cubecl_matmul::components::global::memory::{GlobalIterator, ViewDirection};
+use cubecl_matmul::components::global::memory::{GlobalIterator, GlobalMemoryConfig};
 use cubecl_matmul::components::tile::StridedTile;
 use cubecl_std::tensor::{View, layout::Coords2d};
 use cubecl_std::{Swizzle, tensor::layout::Coordinates};
@@ -44,6 +43,8 @@ pub struct MaterializedMaskReader<M: Numeric> {
     logical_iter: LogicalIterator,
     // TODO not sure if mandatory, but i need for the stride when reading in global memory
     seq_kv_shape: u32,
+    #[cube(comptime)]
+    gmem_config: GlobalMemoryConfig,
 }

 #[derive(CubeType)]
@@ -64,15 +65,16 @@ impl<AP: AttentionPrecision> MaskReader<AP> {
         mask: View<Line<MSK<AP>>, Coords2d>,
         step: u32,
         seq_kv_shape: u32,
-        #[comptime] view_direction: ViewDirection,
+        #[comptime] gmem_config: GlobalMemoryConfig,
     ) -> Self {
         let mask = mask.slice((stage_q_offset, 0), mask.shape());
-        let global_iter = GlobalIterator::new(mask, step, view_direction, false);
+        let global_iter = GlobalIterator::new(mask, step, gmem_config.view_direction, false);

         MaskReader::<AP>::new_Materialized(MaterializedMaskReader::new(
             global_iter,
             LogicalIterator::init(partition_q_offset, step),
             seq_kv_shape,
+            gmem_config,
         ))
     }

@@ -117,11 +119,13 @@ impl<M: Numeric> MaterializedMaskReader<M> {
         global_iter: GlobalIterator<Line<M>>,
         logical_iter: LogicalIterator,
         seq_kv_shape: u32,
+        #[comptime] gmem_config: GlobalMemoryConfig,
     ) -> Self {
         MaterializedMaskReader::<M> {
             global_iter,
             logical_iter,
             seq_kv_shape,
+            gmem_config,
         }
     }

@@ -135,20 +139,29 @@ impl<M: Numeric> MaterializedMaskReader<M> {

         let row = row_offset + P::seq_q_index() * elements_in_partition_seq_q;

+        let slice = self
+            .global_iter
+            .view()
+            .slice(
+                (row, col.runtime()),
+                (attention_tile_size.seq_q, attention_tile_size.seq_kv).runtime(),
+            )
+            .to_linear_slice();
+
+        let line_size = self.gmem_config.line_size;
+        let start = 0;
+        let length = attention_tile_size.seq_q * attention_tile_size.seq_kv / line_size;
+        let end = start + length;
+        let stride = self.seq_kv_shape / line_size;
+
         StridedTile::<M>::new_strided(
-            self.global_iter
-                .view()
-                .slice(
-                    (row, col.runtime()),
-                    (attention_tile_size.seq_q, attention_tile_size.seq_kv).runtime(),
-                )
-                .to_linear_slice(),
-            0,
-            attention_tile_size.seq_q * attention_tile_size.seq_kv,
-            self.seq_kv_shape,
+            slice,
+            start,
+            end,
+            stride,
             Swizzle::none(),
-            MatrixLayout::RowMajor,
-            1u32,
+            self.gmem_config.matrix_layout,
+            line_size,
         )
     }
```
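A quick sanity check on the new line-unit arithmetic (illustration only, not from the commit; the tile shape, `seq_kv_shape`, and `line_size = 4` below are made-up values). Before this change the reader hard-coded a line size of `1u32` and `MatrixLayout::RowMajor`; both now come from the mask's `GlobalMemoryConfig`, so `start`, `end`, and `stride` are expressed in units of lines rather than scalars:

```rust
// Hypothetical sizes, chosen only to make the arithmetic visible.
fn main() {
    let (seq_q, seq_kv) = (8u32, 8u32); // tile shape in scalars
    let seq_kv_shape = 32u32; // full mask row length in global memory
    let line_size = 4u32; // vector width of each `Line`

    // The tile body covers seq_q * seq_kv scalars, i.e. this many lines:
    let length = seq_q * seq_kv / line_size;
    let (start, end) = (0u32, start + length);

    // Consecutive tile rows sit seq_kv_shape scalars apart in the
    // linear slice, i.e. this many lines:
    let stride = seq_kv_shape / line_size;

    assert_eq!((start, end, stride), (0, 16, 8));
}
```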

crates/cubecl-attention/src/components/tile/accelerated/local_tile.rs

Lines changed: 11 additions & 3 deletions
```diff
@@ -4,8 +4,8 @@ use cubecl_matmul::components::tile::StridedTile;
 use cubecl_std::tensor::layout::Coords2d;

 use crate::components::tile::{
-    FragmentAccumulator, FragmentAccumulatorExpand, FragmentMask, FragmentMaskExpand, RowVal,
-    RowWise, RowwiseFormat, RowwiseFormatExpand,
+    FragmentAccumulator, FragmentAccumulatorExpand, FragmentMask, FragmentMaskExpand, LOGIT_MASKED,
+    RowVal, RowWise, RowwiseFormat, RowwiseFormatExpand,
 };

 use crate::components::tile::{FragmentLayout, FragmentLayoutExpand};
@@ -226,13 +226,21 @@ impl<E: Float> RowwiseFormat<E> for LocalTile<E> {
     }

     fn exp_diff(&mut self, val: &RowWise<E>) {
+        let threshold = E::new(LOGIT_MASKED);
+
         #[unroll]
         for r in 0..self.layout.unit_size.0 {
             let row_offset = r * self.layout.unit_size.1;
+
+            let val = val.index(r);
+
             #[unroll]
             for c in 0..self.layout.unit_size.1 {
                 let index = row_offset + c;
-                self.array[index] = Exp::exp(self.array[index] - val.index(r));
+
+                let safe_val = Max::max(val, threshold);
+                let not_masked = E::cast_from(val >= threshold);
+                self.array[index] = not_masked * Exp::exp(self.array[index] - safe_val);
             }
         }
     }
```
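The core of the all-masked-row fix is visible above: when every logit in a row is masked, the running row maximum is itself a masked value, and the old `exp(x - max)` could produce garbage (with literal `-inf` inputs, `exp(-inf - -inf)` is `NaN`). Clamping the max keeps the exponent finite and the `not_masked` factor zeroes the row. A scalar sketch of the same math in plain `f32` Rust (mine, not the commit's):

```rust
const LOGIT_MASKED: f32 = -1e5;

/// Scalar model of `exp_diff` for one element: `x` is a logit,
/// `row_max` the running row maximum from the online softmax.
fn exp_diff_scalar(x: f32, row_max: f32) -> f32 {
    let safe_max = row_max.max(LOGIT_MASKED); // keep the exponent finite
    let not_masked = (row_max >= LOGIT_MASKED) as u32 as f32; // 1.0 or 0.0
    not_masked * (x - safe_max).exp()
}

fn main() {
    // Normal row: behaves exactly like exp(x - max).
    assert_eq!(exp_diff_scalar(2.0, 2.0), 1.0);
    // Fully masked row: row_max is effectively -inf. Previously this
    // produced exp(NaN) = NaN; now the row is cleanly zeroed.
    assert_eq!(exp_diff_scalar(f32::NEG_INFINITY, f32::NEG_INFINITY), 0.0);
}
```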

crates/cubecl-attention/src/components/tile/base.rs

Lines changed: 7 additions & 0 deletions
```diff
@@ -14,6 +14,13 @@ use crate::components::{
 use std::fmt::Debug;
 use std::hash::Hash;

+/// Logits below this are considered masked (effectively -inf)
+pub(crate) const LOGIT_MASKED: f32 = -1e5;
+
+/// Any value smaller than this is considered numerically zero
+/// (used for fully-masked rows or tiny contributions)
+pub(crate) const FULLY_MASKED_ROW_THRESHOLD: f32 = 1e-7;
+
 #[cube]
 pub trait TileAttention<AP: AttentionPrecision>: Send + Sync + 'static {
     type Config: TileAttentionConfig;
```
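The two constants compose (my reasoning, not stated in the commit): anything at or below `LOGIT_MASKED` already underflows `exp` to exactly zero in `f32`, so a fully-masked row's softmax denominator is `0.0`, which in turn falls below `FULLY_MASKED_ROW_THRESHOLD` and is treated as numerically zero downstream:

```rust
const LOGIT_MASKED: f32 = -1e5;
const FULLY_MASKED_ROW_THRESHOLD: f32 = 1e-7;

fn main() {
    // exp(-1e5) is far below f32::MIN_POSITIVE (~1.2e-38): exact zero.
    assert_eq!(LOGIT_MASKED.exp(), 0.0);

    // So a fully-masked row sums to 0.0, which the "numerically zero"
    // threshold catches in recip_inplace (see rowwise.rs below).
    let row_sum: f32 = 0.0;
    assert!(row_sum < FULLY_MASKED_ROW_THRESHOLD);
}
```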

crates/cubecl-attention/src/components/tile/rowwise.rs

Lines changed: 13 additions & 2 deletions
```diff
@@ -1,6 +1,8 @@
 use cubecl_core as cubecl;
 use cubecl_core::prelude::*;

+use crate::components::tile::FULLY_MASKED_ROW_THRESHOLD;
+
 #[derive(CubeType)]
 /// Contains one value per row of a fragment for which the unit contributes
 ///
@@ -188,13 +190,22 @@ impl<E: Float> RowWise<E> {
         }
     }

-    /// Changes the value v at each row for 1/v
+    /// Replaces each value `v` (v >= 0) in a row with `1/v`.
+    ///
+    /// If `v = 0`, the result is set to `0` instead of `1/0`.
+    /// This occurs when the entire row is masked, meaning it should
+    /// contribute no information, and ensures numerical stability.
     pub fn recip_inplace(&mut self) {
         let mut i = comptime![0u32];
         #[unroll]
         for _ in 0..self.num_rows {
             let row_val = self.vals.index_mut(i);
-            row_val.val = Recip::recip(row_val.val);
+
+            let epsilon = E::new(FULLY_MASKED_ROW_THRESHOLD);
+            let not_masked = E::cast_from(row_val.val >= epsilon);
+            let safe_val = Max::max(row_val.val, epsilon);
+            let recip = Recip::recip(safe_val);
+            row_val.val = not_masked * recip;

             comptime![i += 1];
         }
```

crates/cubecl-attention/src/components/tile/unit_register/attention.rs

Lines changed: 28 additions & 6 deletions
```diff
@@ -7,6 +7,7 @@ use cubecl_std::tensor::layout::Coords2d;

 use crate::components::AttentionPrecision;
 use crate::components::attention_types::*;
+use crate::components::tile::LOGIT_MASKED;
 use crate::components::tile::RowVal;
 use crate::components::tile::RowWise;
 use crate::components::tile::unit_register::setup::UnitTileAttentionConfig;
@@ -138,13 +139,21 @@ impl<E: Float> RowwiseFormat<E> for UnitTile<E> {
     }

     fn exp_diff(&mut self, val: &RowWise<E>) {
+        let threshold = E::new(LOGIT_MASKED);
+
         #[unroll]
         for r in 0..self.layout.num_rows {
             let row_offset = r * self.layout.num_cols;
+
+            let val = val.index(r);
+
             #[unroll]
             for c in 0..self.layout.num_cols {
                 let index = row_offset + c;
-                self.data[index] = Exp::exp(self.data[index] - val.index(r));
+
+                let safe_val = Max::max(val, threshold);
+                let not_masked = E::cast_from(val >= threshold);
+                self.data[index] = not_masked * Exp::exp(self.data[index] - safe_val);
             }
         }
     }
@@ -323,7 +332,7 @@ impl<AP: AttentionPrecision> TileAttention<AP> for UnitRegisterTileAttention {
         slice: &mut SliceMut<Line<E>>,
         #[comptime] _config: Self::Config,
     ) {
-        array_tile_to_slice(out, slice)
+        unit_tile_to_slice(out, slice)
     }
 }

@@ -363,14 +372,27 @@ fn strided_tile_to_transposed_unit_tile<E: Numeric, E2: Numeric>(
 }

 #[cube]
-fn array_tile_to_slice<E: Numeric, E2: Numeric>(
+fn unit_tile_to_slice<E: Numeric, E2: Numeric>(
     unit_tile: &UnitTile<E>,
     slice: &mut SliceMut<Line<E2>>,
 ) {
+    let line_size = slice.line_size();
+    assert!(unit_tile.layout.num_cols % line_size == 0);
+
+    let col_iterations = comptime!(unit_tile.layout.num_cols / line_size);
+
     for row in 0..unit_tile.layout.num_rows {
-        for col in 0..unit_tile.layout.num_cols {
-            let index = row * unit_tile.layout.num_cols + col;
-            slice[index] = Line::cast_from(unit_tile.data[index]);
+        for col in 0..col_iterations {
+            let mut out_line = Line::empty(line_size);
+
+            #[unroll]
+            for i in 0..line_size {
+                let index = row * unit_tile.layout.num_cols + col * line_size + i;
+                out_line[i] = E2::cast_from(unit_tile.data[index]);
+            }
+
+            let line_index = row * col_iterations + col;
+            slice[line_index] = out_line;
         }
     }
 }
```
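The rewritten write-out packs `line_size` scalars per store instead of one. The indexing is easy to model in plain Rust with `Vec<[f32; N]>` standing in for `SliceMut<Line<_>>` (names and types here are illustrative, not the crate's API):

```rust
/// Plain-Rust model of the vectorized write-out: pack a row-major
/// `num_rows x num_cols` scalar tile into lines of LINE_SIZE elements.
fn tile_to_lines<const LINE_SIZE: usize>(
    data: &[f32],
    num_rows: usize,
    num_cols: usize,
) -> Vec<[f32; LINE_SIZE]> {
    assert_eq!(num_cols % LINE_SIZE, 0); // mirrors the assert in the kernel
    let col_iterations = num_cols / LINE_SIZE;
    let mut out = vec![[0.0; LINE_SIZE]; num_rows * col_iterations];

    for row in 0..num_rows {
        for col in 0..col_iterations {
            let mut line = [0.0; LINE_SIZE];
            for i in 0..LINE_SIZE {
                line[i] = data[row * num_cols + col * LINE_SIZE + i];
            }
            out[row * col_iterations + col] = line;
        }
    }
    out
}

fn main() {
    let tile: Vec<f32> = (0..8).map(|x| x as f32).collect(); // 2 x 4 tile
    let lines = tile_to_lines::<2>(&tile, 2, 4);
    assert_eq!(lines, vec![[0.0, 1.0], [2.0, 3.0], [4.0, 5.0], [6.0, 7.0]]);
}
```

This vectorized write-out is also why the two `Algorithm` impls below stop forcing `out: vec![1]`: the output line size no longer has to be restricted to 1.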

crates/cubecl-attention/src/kernels/blackbox_accelerated.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -29,7 +29,7 @@ impl Algorithm for BlackboxAcceleratedAlgorithm {
             key: vec![1],
             value: vec![1],
             mask: vec![1],
-            out: vec![1],
+            out: available_line_sizes.out,
         }
     }
 }
```

crates/cubecl-attention/src/kernels/unit.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -29,7 +29,7 @@ impl Algorithm for UnitAlgorithm {
             key: vec![1],
             value: vec![1],
             mask: vec![1],
-            out: vec![1],
+            out: available_line_sizes.out,
         }
     }
 }
```

crates/cubecl-attention/src/tests/attention_test_launcher.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -36,7 +36,6 @@ pub fn test_attention_algorithm<A, P, R>(
     let panic_on_launch_err = match env {
         Ok(val) => match val.as_str() {
             "panic" => true,
-            "skip" => false,
             _ => false,
         },
         Err(_) => false,
@@ -162,6 +161,7 @@ where
     let handle = T::sample(client, &tensor_shape, sample_seed);
     let data = client.read_one(handle.handle);
     let data = T::from_bytes(&data);
+
     let original_data = data.to_owned();
     let data_bytes = T::as_bytes(&original_data);
     let shape = tensor_shape.as_slice();
```

crates/cubecl-attention/src/tests/macros/mod.rs

Lines changed: 2 additions & 1 deletion
```diff
@@ -52,7 +52,8 @@ pub fn attention_test_launch<A: Algorithm, R: Runtime>(
         two_rows_in_array_tile: test_options.two_rows_in_array_tile,
     };

-    test_attention_algorithm::<A, (half::f16, half::f16), R>(client, problem, selection);
+    test_attention_algorithm::<A, (f32, f32), R>(client, problem, selection);
+    // test_attention_algorithm::<A, (half::f16, half::f16), R>(client, problem, selection);
 }

 #[macro_export]
```
