Commit 3904c27

Flash Attention: use loader from matmul + fix sync bug (#1067)
1 parent b24b963 commit 3904c27

15 files changed: +161 −153 lines

crates/cubecl-attention/src/components/global/simple/attention.rs

Lines changed: 14 additions & 14 deletions
@@ -1,17 +1,19 @@
 use cubecl_core as cubecl;
 use cubecl_core::prelude::*;
 use cubecl_matmul::components::global::PartitionedStage;
+use cubecl_matmul::components::global::read::FullStageGlobalReader;
 use cubecl_matmul::components::stage::StridedStageMemory;
 use cubecl_std::tensor::r#virtual::VirtualTensor;
 use cubecl_std::{CubeOption, CubeOptionExpand};
 use std::marker::PhantomData;

 use crate::components::attention_types::*;
+use crate::components::global::AttentionGlobalLayout;
 use crate::components::global::simple::QueryReader;
 use crate::components::global::simple::{AttentionWriter, AttentionWriterExpand, MaskReader};
-use crate::components::global::{AttentionGlobalLayout, simple::DummyKeyValueReader};
 use crate::components::stage::{
-    AttentionPartitioner, AttentionTilingLayout, StageAttention, StageAttentionConfig as _,
+    AttentionLoadingStrategy, AttentionPartitioner, AttentionTilingLayout, StageAttention,
+    StageAttentionConfig as _,
 };
 use crate::components::{
     AttentionPrecision,
@@ -33,8 +35,8 @@ impl<
     AP: AttentionPrecision,
 > GlobalAttention<AP> for SimpleGlobalAttention<AP, SA>
 {
-    type KeyReader = DummyKeyValueReader<KG<AP>, KS<AP>>;
-    type ValueReader = DummyKeyValueReader<VG<AP>, VS<AP>>;
+    type KeyReader = FullStageGlobalReader<KG<AP>, KS<AP>, AttentionLoadingStrategy>;
+    type ValueReader = FullStageGlobalReader<VG<AP>, VS<AP>, AttentionLoadingStrategy>;
     type MaskReader = MaskReader<AP>;

     type Writer = <SA::Partitioner as AttentionPartitioner>::Writer<OS<AP>, OG<AP>>;
@@ -51,10 +53,6 @@ impl<
         seq_kv: u32,
         #[comptime] config: Self::Config,
     ) {
-        // Init staging shared memories
-        let mut key_stage = key_reader.init_stage();
-        let mut value_stage = value_reader.init_stage();
-
         // Load queries which stay alive in registers for all the kernel
         let mut query_registers = SA::init_query(config.stage_config);
         SA::read_query(&query_reader, &mut query_registers, config.stage_config);
@@ -73,19 +71,21 @@ impl<
         let num_stage_iterations =
             seq_kv.div_ceil(config.stage_config.elements_in_partition_seq_kv());

+        let mut barrier = ();
+
         // Global loop over seq_kv
         for _ in 0..num_stage_iterations {
             // Put key and value into stage
-            key_reader.read_global(&mut key_stage);
-            value_reader.read_global(&mut value_stage);
+            key_reader.load_stage(&mut barrier, config.key_reader_config);
+            value_reader.load_stage(&mut barrier, config.value_reader_config);

             sync_cube();

             // Core of flash attention
             SA::execute(
                 &query_registers,
-                &key_stage,
-                &value_stage,
+                &key_reader.stage(),
+                &value_reader.stage(),
                 &mut key_value_registers,
                 &mask_reader,
                 &mut mask_registers,
@@ -135,7 +135,7 @@ impl<
         let step = config.stage_config.elements_in_partition_seq_kv().runtime();
         let layout =
             AttentionGlobalLayout::new(&key, batch_index, config.key_reader_config.gmem_config);
-        DummyKeyValueReader::new(key.view(layout), step, config.key_reader_config)
+        FullStageGlobalReader::new(key.view(layout), step, config.key_reader_config)
     }

     fn init_value_reader(
@@ -146,7 +146,7 @@ impl<
         let step = config.stage_config.elements_in_partition_seq_kv().runtime();
         let layout =
             AttentionGlobalLayout::new(&value, batch_index, config.value_reader_config.gmem_config);
-        DummyKeyValueReader::new(value.view(layout), step, config.value_reader_config)
+        FullStageGlobalReader::new(value.view(layout), step, config.value_reader_config)
     }

     fn init_mask_reader(
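
Net effect of this file's changes: the key/value readers are now the generic FullStageGlobalReader from cubecl-matmul, which owns its stage memory, so the kernel no longer creates stages by hand; it calls load_stage and borrows the stage back with stage(). The unit barrier (let mut barrier = ();) satisfies the loader's barrier argument, which a synchronous loading strategy never uses. Below is a minimal host-side sketch of that call shape, with hypothetical Reader and Stage types standing in for the real CubeCL device types:

// Hypothetical stand-ins for FullStageGlobalReader and its stage memory.
// This only mirrors the call shape; the real types are CubeCL device types.
struct Stage {
    data: Vec<f32>,
}

struct Reader {
    stage: Stage,
}

impl Reader {
    // Stands in for the global -> shared-memory copy the real loader performs.
    fn load_stage(&mut self, _barrier: &mut ()) {
        self.stage.data.fill(1.0);
    }

    // Borrows the reader-owned stage, replacing the hand-managed key/value stages.
    fn stage(&self) -> &Stage {
        &self.stage
    }
}

fn main() {
    let mut key_reader = Reader {
        stage: Stage { data: vec![0.0; 16] },
    };
    // Unit barrier: a synchronous loading strategy needs no real barrier.
    let mut barrier = ();
    for _ in 0..4 {
        key_reader.load_stage(&mut barrier);
        // On device, sync_cube() runs here before compute reads the stage.
        assert!(key_reader.stage().data.iter().all(|&x| x == 1.0));
    }
}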

crates/cubecl-attention/src/components/global/simple/reader/key_value.rs

Lines changed: 0 additions & 98 deletions
This file was deleted.
Lines changed: 0 additions & 2 deletions
@@ -1,7 +1,5 @@
-mod key_value;
 mod mask;
 mod query;

-pub use key_value::*;
 pub use mask::*;
 pub use query::*;

crates/cubecl-attention/src/components/global/simple/setup.rs

Lines changed: 23 additions & 10 deletions
@@ -55,42 +55,55 @@ impl<
         let specialization_tensor_config = SpecializationTensorConfig::MainFlowOnly;
         let plane_role_config = PlaneRoleConfig::new_unspecialized(stage_config.num_planes());

+        let seq_q_check_bounds = !problem
+            .seq_q
+            .is_multiple_of(stage_config.elements_in_stage_seq_q() as usize);
+        let seq_kv_check_bounds = !problem
+            .seq_kv
+            .is_multiple_of(stage_config.elements_in_partition_seq_kv() as usize);
+        let head_dim_check_bounds = !problem
+            .head_dim
+            .is_multiple_of(stage_config.elements_in_partition_head_dim() as usize);
+        let val_dim_check_bounds = !problem
+            .val_dim
+            .is_multiple_of(stage_config.elements_in_partition_val_dim() as usize);
+
         let query_gmem_config = GlobalMemoryConfig {
             line_size: line_sizes.query as u32,
-            check_row_bounds: false,
-            check_col_bounds: false,
+            check_row_bounds: seq_q_check_bounds,
+            check_col_bounds: head_dim_check_bounds,
             matrix_layout: MatrixLayout::RowMajor,
             view_direction: ViewDirection::None,
         };

         let mask_gmem_config = GlobalMemoryConfig {
             line_size: line_sizes.mask as u32,
-            check_row_bounds: false,
-            check_col_bounds: false,
+            check_row_bounds: seq_q_check_bounds,
+            check_col_bounds: seq_kv_check_bounds,
             matrix_layout: MatrixLayout::RowMajor,
             view_direction: ViewDirection::Col,
         };

         let key_gmem_config = GlobalMemoryConfig {
             line_size: line_sizes.key as u32,
-            check_row_bounds: false,
-            check_col_bounds: false,
+            check_row_bounds: seq_kv_check_bounds,
+            check_col_bounds: head_dim_check_bounds,
             matrix_layout: MatrixLayout::RowMajor,
             view_direction: ViewDirection::Row,
         };

         let value_gmem_config = GlobalMemoryConfig {
             line_size: line_sizes.value as u32,
-            check_row_bounds: false,
-            check_col_bounds: false,
+            check_row_bounds: seq_kv_check_bounds,
+            check_col_bounds: val_dim_check_bounds,
             matrix_layout: MatrixLayout::RowMajor,
             view_direction: ViewDirection::Row,
         };

         let out_gmem_config = GlobalMemoryConfig {
             line_size: line_sizes.out as u32,
-            check_row_bounds: false,
-            check_col_bounds: false,
+            check_row_bounds: seq_q_check_bounds,
+            check_col_bounds: val_dim_check_bounds,
             matrix_layout: MatrixLayout::RowMajor,
             view_direction: ViewDirection::None,
         };
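
All of the new flags reduce to one divisibility test: a dimension needs bounds checking only when the problem size is not an exact multiple of its stage or partition footprint, that is, when the last iteration covers a partial tile. A standalone sketch of that test with made-up sizes (usize::is_multiple_of, as used above, is stable since Rust 1.87):

// True when the stage does not tile the problem dimension exactly,
// so the trailing partial tile must be masked with bounds checks.
fn needs_bounds_check(problem_dim: usize, stage_elements: usize) -> bool {
    !problem_dim.is_multiple_of(stage_elements)
}

fn main() {
    // seq_kv = 100 over 32-element partitions leaves a 4-element tail: check.
    assert!(needs_bounds_check(100, 32));
    // head_dim = 64 tiles exactly into 32-element partitions: no check.
    assert!(!needs_bounds_check(64, 32));
}

Skipping the check when the division is exact presumably keeps per-element bounds branching out of the fully tiled fast path.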

crates/cubecl-attention/src/components/stage/base.rs

Lines changed: 8 additions & 1 deletion
@@ -1,7 +1,7 @@
 use cubecl_core as cubecl;
 use cubecl_core::prelude::*;
 use cubecl_matmul::components::{
-    global::{WriteEventListener, WriteTiling},
+    global::{WriteEventListener, WriteTiling, read::sync_full_cyclic::SyncFullCyclicLoading},
     stage::{ContiguousTilingLayout, RowMajorTilingOrder, StageFamily, StageMemoryConfig},
 };
 use std::{fmt::Debug, hash::Hash};
@@ -21,6 +21,7 @@ use cubecl_std::CubeOption;
 use cubecl_std::tensor::layout::Coords2d;

 pub type AttentionTilingLayout = ContiguousTilingLayout<RowMajorTilingOrder>;
+pub type AttentionLoadingStrategy = SyncFullCyclicLoading<RowMajorTilingOrder>;

 /// A family of [TileAttention] implementations that operate with any [precision](AttentionPrecision).
 pub trait StageAttentionFamily: Send + Sync + 'static {
@@ -132,6 +133,7 @@ pub trait StageAttentionConfig:

     fn elements_in_partition_seq_q(&self) -> u32;
     fn elements_in_partition_seq_kv(&self) -> u32;
+    fn elements_in_partition_head_dim(&self) -> u32;
     fn elements_in_partition_val_dim(&self) -> u32;

     fn elements_in_stage_seq_q(&self) -> u32;
@@ -217,6 +219,11 @@ impl<TC: TileAttentionConfig> StageAttentionConfig for PartitionAttentionConfig<
         self.shared().partition_size.seq_kv * self.shared().tile_config.attention_tile_size().seq_kv
     }

+    fn elements_in_partition_head_dim(&self) -> u32 {
+        self.shared().partition_size.head_dim
+            * self.shared().tile_config.attention_tile_size().head_dim
+    }
+
     fn elements_in_partition_val_dim(&self) -> u32 {
         self.shared().partition_size.val_dim
             * self.shared().tile_config.attention_tile_size().val_dim
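
The new elements_in_partition_head_dim accessor follows the same formula as its seq_kv and val_dim siblings: elements per partition = tiles per partition times elements per tile. A tiny sketch with illustrative numbers (the structs here are simplified stand-ins for the real config types):

// Simplified stand-ins for the partition-size and tile-size config types.
struct PartitionSize {
    head_dim: u32, // tiles per partition along head_dim
}

struct AttentionTileSize {
    head_dim: u32, // elements per tile along head_dim
}

// Mirrors elements_in_partition_head_dim: tiles * elements-per-tile.
fn elements_in_partition_head_dim(p: &PartitionSize, t: &AttentionTileSize) -> u32 {
    p.head_dim * t.head_dim
}

fn main() {
    let p = PartitionSize { head_dim: 2 };
    let t = AttentionTileSize { head_dim: 16 };
    // 2 tiles of 16 elements -> a 32-element partition along head_dim.
    assert_eq!(elements_in_partition_head_dim(&p, &t), 32);
}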

crates/cubecl-attention/src/components/stage/partition_attention.rs

Lines changed: 2 additions & 2 deletions
@@ -117,7 +117,7 @@ impl<
         // Get the only key-value tile and fill it with hd,kv-th key data
         let key_tile = key_value_partition.get_key_mut();
         let key_data = SK::tile(key_stage, (kv, hd).runtime());
-        TA::fill_key_transposed(&key_data, key_tile.key_mut(), config.tile_config());
+        TA::load_key_transposed(&key_data, key_tile.key_mut(), config.tile_config());

         // Perform score matmul on query and key, and accumulate in softmax tile
         TA::score_matmul(
@@ -164,7 +164,7 @@ impl<
         // Get the only key-value tile and fill it with hd,kv-th key data
         let value_data = SV::tile(value_stage, (kv, vd).runtime());
         let value_tile = key_value_partition.get_value_mut();
-        TA::fill_value(&value_data, value_tile.value_mut(), config.tile_config());
+        TA::load_value(&value_data, value_tile.value_mut(), config.tile_config());

         // Get the q,vd-th accumulator and scale it with previously obtained scale
         let accumulator = accumulator_partition.get_at_mut(q, vd, config);

crates/cubecl-attention/src/components/stage/tile_ops/mask.rs

Lines changed: 1 addition & 1 deletion
@@ -142,7 +142,7 @@ impl<AP: AttentionPrecision, TA: TileAttention<AP>> MaterializedTileMask<AP, TA>
     }

     pub fn update_tile(&mut self, tile: StridedTile<MSK<AP>>) {
-        TA::fill_mask(&tile, &mut self.fragment, self.config);
+        TA::load_mask(&tile, &mut self.fragment, self.config);
     }
 }

crates/cubecl-attention/src/components/stage/tile_ops/query.rs

Lines changed: 1 addition & 1 deletion
@@ -22,6 +22,6 @@ impl<AP: AttentionPrecision, TA: TileAttention<AP>> QueryTile<AP, TA> {

     /// Loads the query data into the fragment
     pub fn update(&mut self, tile: &StridedTile<QG<AP>>) {
-        TA::fill_query(tile, &mut self.fragment)
+        TA::load_query(tile, &mut self.fragment)
     }
 }

crates/cubecl-attention/src/components/tile/accelerated/attention.rs

Lines changed: 5 additions & 5 deletions
@@ -125,13 +125,13 @@ impl<AP: AttentionPrecision> TileAttention<AP> for BlackboxAcceleratedTileAttent
         HybridFragment::new(size, config)
     }

-    fn fill_query<E: Numeric>(tile: &StridedTile<E>, fragment: &mut Self::Query) {
+    fn load_query<E: Numeric>(tile: &StridedTile<E>, fragment: &mut Self::Query) {
         let (slice, stride) = tile.as_unlined();

         cmma::load(fragment, &slice, stride);
     }

-    fn fill_key_transposed<E: Float>(
+    fn load_key_transposed<E: Float>(
         tile: &StridedTile<E>,
         rhs: &mut Self::KeyValue,
         #[comptime] _config: Self::Config,
@@ -140,7 +140,7 @@ impl<AP: AttentionPrecision> TileAttention<AP> for BlackboxAcceleratedTileAttent
         cmma::load(rhs, &slice, stride);
     }

-    fn fill_value<E: Float>(
+    fn load_value<E: Float>(
         tile: &StridedTile<E>,
         rhs: &mut Self::KeyValue,
         #[comptime] _config: Self::Config,
@@ -149,12 +149,12 @@ impl<AP: AttentionPrecision> TileAttention<AP> for BlackboxAcceleratedTileAttent
         cmma::load(rhs, &slice, stride);
     }

-    fn fill_mask<E: Numeric>(
+    fn load_mask<E: Numeric>(
         tile: &StridedTile<E>,
         mask: &mut Self::Mask,
         #[comptime] _config: Self::Config,
     ) {
-        mask.fill_from_strided_tile(tile)
+        mask.load_from_strided_tile(tile)
     }

     fn write_results<E: Float>(
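
The hunks in this file, like those in partition_attention.rs, mask.rs, and query.rs above, are a mechanical fill_* to load_* rename that aligns the attention trait vocabulary with the matmul loader API. A trimmed-down, hypothetical mirror of the renamed surface (generics, tiles, and config parameters omitted):

#![allow(dead_code)]

// Hypothetical, simplified mirror of the renamed TileAttention loading methods.
trait TileLoading {
    type Query;
    type KeyValue;
    type Mask;

    fn load_query(fragment: &mut Self::Query); // was fill_query
    fn load_key_transposed(rhs: &mut Self::KeyValue); // was fill_key_transposed
    fn load_value(rhs: &mut Self::KeyValue); // was fill_value
    fn load_mask(mask: &mut Self::Mask); // was fill_mask
}

fn main() {}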
