
Commit 84c9613

Flash Attention: row-wise reductions (#946)
1 parent da0e292 commit 84c9613

42 files changed: +1565 additions, −909 deletions


crates/cubecl-attention/src/base.rs

Lines changed: 5 additions & 4 deletions
@@ -9,7 +9,7 @@ use crate::{
         AttentionTilingScheme, AvailableLineSizes, args::TensorInputsLaunch, attention_types::*,
         batch::HypercubeSelection,
     },
-    kernels::{Algorithm, dummy::DummyAlgorithm},
+    kernels::{Algorithm, dummy::DummyRegisterAlgorithm},
 };

 use crate::components::batch::BatchAttentionConfig;
@@ -66,7 +66,7 @@ pub fn launch_tmp<R: Runtime, AP: AttentionPrecision>(
         &MSK::<AP>::as_type_native_unchecked(),
         &OG::<AP>::as_type_native_unchecked(),
     );
-    let line_sizes = DummyAlgorithm::filter_line_sizes(line_sizes)
+    let line_sizes = DummyRegisterAlgorithm::filter_line_sizes(line_sizes)
         .filter_with_tensor(AttentionIdent::Query, query.strides, query.shape)
         .filter_with_tensor(AttentionIdent::Key, key.strides, key.shape)
         .filter_with_tensor(AttentionIdent::Value, value.strides, value.shape)
@@ -105,16 +105,17 @@ pub fn launch_tmp<R: Runtime, AP: AttentionPrecision>(
         },
         plane_dim: 32,
         reuse_key_value: false,
+        two_rows_in_array_tile: false,
     };

-    let config = DummyAlgorithm::setup::<AP, R>(client, &problem, &selection, &line_sizes)?;
+    let config = DummyRegisterAlgorithm::setup::<AP, R>(client, &problem, &selection, &line_sizes)?;

     let cube_count_plan = config
         .hypercube_config()
         .cube_count_plan(&problem, &selection);

     unsafe {
-        <DummyAlgorithm as Algorithm>::BatchAttention::launch_unchecked::<AP, R>(
+        <DummyRegisterAlgorithm as Algorithm>::BatchAttention::launch_unchecked::<AP, R>(
             client,
             config.cube_dim(),
             cube_count_plan.resolve(),

crates/cubecl-attention/src/components/mask.rs

Lines changed: 4 additions & 3 deletions
@@ -1,5 +1,6 @@
 use cubecl_core as cubecl;
 use cubecl_core::prelude::*;
+use cubecl_std::tensor::layout::Coords2d;

 use crate::components::AttentionTilingScheme;

@@ -87,8 +88,8 @@ impl PartitionMask {

 #[cube]
 impl TileMask {
-    pub fn apply<E: Numeric>(&self, row: u32, col: u32) -> Line<E> {
-        let should_mask = Line::<E>::cast_from(row >= self.q_bound || col >= self.kv_bound);
-        should_mask * Line::cast_from(-999999)
+    pub fn apply<E: Numeric>(&self, pos: Coords2d) -> E {
+        let should_mask = E::cast_from(pos.0 >= self.q_bound || pos.1 >= self.kv_bound);
+        should_mask * E::min_value()
     }
 }
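Replacing the ad-hoc -999999 sentinel with E::min_value() follows the usual additive-mask convention: once the running row max is subtracted, a masked score exponentiates to exactly zero rather than merely something small. A minimal plain-Rust sketch of that behaviour (outside CubeCL; f32 stands in for E, and the bound names are illustrative):

/// Plain-Rust analogue of `TileMask::apply`: the additive mask term
/// for one (row, col) position. Names and shapes here are illustrative.
fn mask_term(pos: (u32, u32), q_bound: u32, kv_bound: u32) -> f32 {
    let should_mask = (pos.0 >= q_bound || pos.1 >= kv_bound) as u32 as f32;
    should_mask * f32::MIN
}

fn main() {
    // One softmax row with the second column out of bounds (kv_bound = 1).
    let scores = [1.0f32 + mask_term((0, 0), 1, 1), 2.0 + mask_term((0, 1), 1, 1)];
    let row_max = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let exps: Vec<f32> = scores.iter().map(|s| (s - row_max).exp()).collect();
    assert_eq!(exps[1], 0.0); // the masked column drops out of the softmax sum
    println!("{exps:?}");
}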

crates/cubecl-attention/src/components/selection.rs

Lines changed: 1 addition & 0 deletions
@@ -8,4 +8,5 @@ pub struct AttentionSelection {
     pub plane_dim: u32,

     pub reuse_key_value: bool,
+    pub two_rows_in_array_tile: bool,
 }

crates/cubecl-attention/src/components/stage/base.rs

Lines changed: 5 additions & 7 deletions
@@ -8,14 +8,14 @@ use std::{fmt::Debug, hash::Hash};

 use crate::components::attention_types::*;
 use crate::components::stage::dummy::AttentionStageMemoryConfig;
-use crate::components::{AttentionIdent, StageMask};
 use crate::components::{
     AttentionLineSizes, AttentionPrecision, AttentionProblem, AttentionSelection,
     AttentionSetupError, AvailableLineSizes,
     global::GlobalAttentionConfig,
     tile::{AttentionTilingLayout, dummy::AttentionMatmulConfig},
 };
 use crate::components::{AttentionTilingScheme, global::dummy::QueryReader};
+use crate::components::{StageMask, tile::RunningState};

 /// A family of [TileAttention] implementations that operate with any [precision](AttentionPrecision).
 pub trait StageAttentionFamily: Send + Sync + 'static {
@@ -62,14 +62,12 @@ pub trait StageAttention<AP: AttentionPrecision>: 'static + Send + Sync {
     /// The configuration type associated with this Attention.
     type Config: StageAttentionConfig;

-    type State: CubeType;
-
     type QueryPartition: CubeType;
     type KeyValuePartition: CubeType;
     type SoftmaxPartition: CubeType;
     type AccumulatorPartition: CubeType;

-    fn init_state(#[comptime] config: Self::Config) -> Self::State;
+    fn init_state(#[comptime] config: Self::Config) -> Sequence<RunningState<SM<AP>>>;

     fn execute(
         key_reader: &Self::KeyStage,
@@ -79,13 +77,13 @@ pub trait StageAttention<AP: AttentionPrecision>: 'static + Send + Sync {
         score: &mut Self::SoftmaxPartition,
         mask: StageMask,
         accumulator: &mut Self::AccumulatorPartition,
-        prev_state: &mut Self::State,
+        prev_state: &mut Sequence<RunningState<SM<AP>>>,
         #[comptime] config: Self::Config,
     );

     fn rescale(
         acc: &mut Self::AccumulatorPartition,
-        state: Self::State,
+        state: Sequence<RunningState<SM<AP>>>,
         #[comptime] config: Self::Config,
     );

@@ -123,5 +121,5 @@ pub trait StageAttentionConfig:
     fn tiling_scheme(&self) -> AttentionTilingScheme;
     fn reuse_key_value(&self) -> bool;

-    fn num_rows_per_unit(&self, ident: AttentionIdent) -> u32;
+    fn num_rows_per_unit(&self) -> u32;
 }
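For reference, the RunningState<SM<AP>> that init_state now returns one entry of per seq_q index is the per-row running pair $(m, \ell)$ of the online softmax behind flash attention's row-wise reductions. Under that standard recurrence (notation mine, not from the diff), each new score tile $S$ updates a row as

$m' = \max\bigl(m, \operatorname{rowmax}(S)\bigr), \qquad \alpha = e^{m - m'}$

$\ell' = \alpha\,\ell + \operatorname{rowsum}\bigl(e^{S - m'}\bigr), \qquad O' = \alpha\,O + e^{S - m'}\,V$

The per-row factor $\alpha$ is what execute collects into scales and hands to accumulate_value; rescale then presumably applies the final division of $O$ by $\ell$ once every key/value tile has been folded in.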

crates/cubecl-attention/src/components/stage/dummy/attention.rs

Lines changed: 24 additions & 14 deletions
@@ -7,16 +7,15 @@ use cubecl_matmul::components::{
 };
 use std::marker::PhantomData;

-use crate::components::StageMask;
 use crate::components::attention_types::*;
 use crate::components::global::dummy::QueryReader;
 use crate::components::stage::dummy::SoftmaxPartition;
-use crate::components::stage::dummy::StageState;
 use crate::components::stage::dummy::{Accumulators, DummyStageConfig, KeyValues, Queries};
 use crate::components::stage::{StageAttention, StageAttentionConfig};
 use crate::components::tile::RowWise;
 use crate::components::tile::TileAttention;
 use crate::components::{AttentionPrecision, global::GlobalAttentionConfig};
+use crate::components::{StageMask, tile::RunningState};

 pub struct DummyStageAttention<AP: AttentionPrecision, SK, SV, SO, TA: TileAttention<AP>> {
     _phantom: PhantomData<(AP, SK, SV, SO, TA)>,
@@ -37,7 +36,6 @@ impl<
     type ValueStage = SV;
     type OutStage = SO;

-    type State = StageState<AP>;
     type QueryPartition = Queries<AP, TA, Self::Config>;
     type KeyValuePartition = KeyValues<AP, TA, Self::Config>;
     type SoftmaxPartition = SoftmaxPartition<AP, TA, Self::Config>;
@@ -51,7 +49,7 @@ impl<
         softmax_partition: &mut Self::SoftmaxPartition,
         mask: StageMask,
         accumulator_partition: &mut Self::AccumulatorPartition,
-        state: &mut Self::State,
+        state: &mut Sequence<RunningState<SM<AP>>>,
         #[comptime] config: Self::Config,
     ) {
         let partition_mask = mask.to_partition(UNIT_POS_Y);
@@ -60,6 +58,9 @@ impl<

         let mut kv = comptime![0u32];

+        let mut max_placeholder = TA::init_max_placeholder(config.num_rows_per_unit());
+        let mut sum_placeholder = TA::init_sum_placeholder(config.num_rows_per_unit());
+
         #[unroll]
         #[allow(clippy::explicit_counter_loop)]
         for _ in 0..p.seq_kv {
@@ -80,7 +81,7 @@ impl<
             }

             let mut q = comptime![0u32];
-            let mut scales = Sequence::<RowWise<ACC<AP>>>::new();
+            let mut scales = Sequence::<RowWise<SM<AP>>>::new();

             #[unroll]
             #[allow(clippy::explicit_counter_loop)]
@@ -101,16 +102,17 @@ impl<
                     comptime![hd += 1];
                 }

-                let state_q = state.get_at_mut(q);
+                let state_q = state.index_mut(q);

-                let accumulator_scale = TA::softmax(
+                scales.push(TA::softmax(
                     softmax_tile,
                     partition_mask.to_tile(q, kv),
                     state_q,
+                    &mut max_placeholder,
+                    &mut sum_placeholder,
                     config.tiling_scheme().elements_in_partition_head_dim(),
-                );
-
-                scales.push(accumulator_scale);
+                    config.tile_config(),
+                ));

                 comptime![q += 1];
             }
@@ -162,7 +164,7 @@ impl<

     fn rescale(
         acc: &mut Self::AccumulatorPartition,
-        state: Self::State,
+        state: Sequence<RunningState<SM<AP>>>,
         #[comptime] config: Self::Config,
     ) {
         let p = config.tiling_scheme().partition_size;
@@ -179,7 +181,7 @@ impl<
         for _ in 0..p.val_dim {
             TA::rescale(
                 Self::AccumulatorPartition::get_at_mut(acc, q, vd, config),
-                state.get_at(q),
+                state.index(q),
                 config.tile_config(),
             );

@@ -190,8 +192,16 @@ impl<
         }
     }

-    fn init_state(#[comptime] config: Self::Config) -> Self::State {
-        StageState::<AP>::init::<Self::Config>(config)
+    fn init_state(#[comptime] config: Self::Config) -> Sequence<RunningState<SM<AP>>> {
+        let p = config.tiling_scheme().partition_size;
+        let mut sequence = Sequence::new();
+
+        #[unroll]
+        for _ in 0..comptime!(p.seq_q) {
+            sequence.push(TA::init_state(config.tile_config()));
+        }
+
+        sequence
     }

     fn write<W: WriteEventListener, G: GlobalAttentionConfig>(
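To make the control flow of the rewritten execute concrete: the max/sum scratch is allocated once before the seq_kv loop, each tile's softmax updates the running state and returns a rescale factor, and the accumulator is corrected by that factor before the new contribution lands. A scalar, single-row sketch in plain Rust (illustrative names; the real code operates on RowWise values per unit inside CubeCL):

/// Per-row running state: running max `m` and running sum `l`.
struct RunningState { m: f32, l: f32 }

/// One online-softmax step over a row of scores, mirroring the shape of
/// `TA::softmax`: update (m, l) in place and return the accumulator scale.
fn softmax_step(scores: &mut [f32], state: &mut RunningState) -> f32 {
    let tile_max = scores.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
    let new_max = state.m.max(tile_max);
    let scale = (state.m - new_max).exp(); // rescales everything accumulated so far
    let mut tile_sum = 0.0;
    for s in scores.iter_mut() {
        *s = (*s - new_max).exp(); // scores become unnormalized probabilities
        tile_sum += *s;
    }
    state.m = new_max;
    state.l = state.l * scale + tile_sum;
    scale
}

fn main() {
    let mut state = RunningState { m: f32::NEG_INFINITY, l: 0.0 };
    let mut acc = 0.0f32; // stands in for one accumulator element
    for mut tile in [vec![0.5f32, 1.0], vec![2.0, 0.1]] {
        let scale = softmax_step(&mut tile, &mut state);
        // accumulate_value: rescale old contributions, then add the new ones
        acc = acc * scale + tile.iter().sum::<f32>(); // value vector of all ones
        println!("scale = {scale}, running sum l = {}", state.l);
    }
    println!("normalized output: {}", acc / state.l); // the final rescale step
}

With an all-ones value vector the normalized output comes out exactly 1.0, a handy sanity check that the running sum matches the accumulated probabilities.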

crates/cubecl-attention/src/components/stage/dummy/config.rs

Lines changed: 3 additions & 3 deletions
@@ -1,7 +1,7 @@
 use cubecl_matmul::components::{MatrixLayout, StageIdent, TilingScheme, stage::StageMemoryConfig};

 use crate::components::{
-    AttentionIdent, AttentionSetupError, AttentionTilingScheme, stage::StageAttentionConfig,
+    AttentionSetupError, AttentionTilingScheme, stage::StageAttentionConfig,
     tile::dummy::AttentionMatmulConfig,
 };

@@ -46,8 +46,8 @@ impl<FC: AttentionMatmulConfig> StageAttentionConfig for DummyStageConfig<FC> {
         self.reuse_key_value
     }

-    fn num_rows_per_unit(&self, ident: AttentionIdent) -> u32 {
-        self.tile_config.num_rows_per_unit(ident)
+    fn num_rows_per_unit(&self) -> u32 {
+        self.tile_config.num_rows_per_unit()
     }
 }

crates/cubecl-attention/src/components/stage/dummy/setup.rs

Lines changed: 2 additions & 2 deletions
@@ -52,11 +52,11 @@ impl<
         selection: &AttentionSelection,
         line_sizes: &AttentionLineSizes,
     ) -> Result<Self::Config, AttentionSetupError> {
-        let tile_config = TA::setup::<AP, R>(client, problem, selection, line_sizes)?;
-
         let num_planes = selection.tiling_scheme.stage_size.seq_q
             * TA::computation_resources()?.num_planes(selection.plane_dim)?;

+        let tile_config = TA::setup::<AP, R>(client, problem, selection, line_sizes, num_planes)?;
+
         DummyStageConfig::new(
             tile_config,
             score_attention_stage_memory_config(selection),

crates/cubecl-attention/src/components/stage/dummy/tile_partitions.rs

Lines changed: 0 additions & 33 deletions
@@ -4,10 +4,7 @@ use std::marker::PhantomData;
 use cubecl::prelude::*;
 use cubecl_core as cubecl;

-use crate::components::AttentionIdent;
-use crate::components::attention_types::*;
 use crate::components::global::dummy::QueryReader;
-use crate::components::tile::RunningState;
 use crate::components::{AttentionPrecision, stage::StageAttentionConfig, tile::TileAttention};

 #[derive(CubeType)]
@@ -307,33 +304,3 @@ impl<
         self.sequence.index_mut(index)
     }
 }
-
-#[derive(CubeType)]
-pub struct StageState<AP: AttentionPrecision> {
-    sequence: Sequence<RunningState<SM<AP>>>,
-}
-
-#[cube]
-impl<AP: AttentionPrecision> StageState<AP> {
-    pub fn init<S: StageAttentionConfig>(#[comptime] config: S) -> StageState<AP> {
-        let p = config.tiling_scheme().partition_size;
-        let mut sequence = Sequence::new();
-
-        #[unroll]
-        for _ in 0..comptime!(p.seq_q) {
-            sequence.push(RunningState::<SM<AP>>::init(
-                config.num_rows_per_unit(AttentionIdent::Softmax),
-            ));
-        }
-
-        StageState::<AP> { sequence }
-    }
-
-    pub fn get_at(&self, #[comptime] q: u32) -> &RunningState<SM<AP>> {
-        self.sequence.index(q)
-    }
-
-    pub fn get_at_mut(&mut self, #[comptime] q: u32) -> &mut RunningState<SM<AP>> {
-        self.sequence.index_mut(q)
-    }
-}

crates/cubecl-attention/src/components/tile/base.rs

Lines changed: 17 additions & 8 deletions
@@ -10,7 +10,7 @@ use crate::components::{
     AttentionLineSizes, AttentionPrecision, AttentionProblem, AttentionSelection,
     AttentionSetupError, AvailableLineSizes,
     attention_types::*,
-    tile::{RowWise, RunningState, dummy::AttentionMatmulConfig},
+    tile::{KeyValueTile, QueryTile, RowWise, RunningState, dummy::AttentionMatmulConfig},
 };
 use crate::components::{InvalidConfigError, tile::AccumulatorTile};
 use crate::components::{TileMask, tile::SoftmaxTile};
@@ -33,6 +33,7 @@ pub trait TileAttentionFamily: Send + Sync + 'static {
         problem: &AttentionProblem,
         selection: &AttentionSelection,
         line_sizes: &AttentionLineSizes,
+        num_planes: u32,
     ) -> Result<Self::Config, AttentionSetupError>;

     /// Filters out line sizes that are incompatible with this Attention family.
@@ -50,10 +51,10 @@ pub trait TileAttention<AP: AttentionPrecision>: 'static + Send + Sync {
     /// The configuration type associated with this Attention.
     type Config: AttentionMatmulConfig;

-    type QueryTile: CubeType;
-    type KeyValueTile: CubeType;
+    type QueryTile: QueryTile<QT<AP>>;
+    type KeyValueTile: KeyValueTile<KVT<AP>>;
     type SoftmaxTile: SoftmaxTile<AP>;
-    type AccumulatorTile: AccumulatorTile<ACC<AP>>;
+    type AccumulatorTile: AccumulatorTile<AP>;

     fn rescale(
         acc: &mut Self::AccumulatorTile,
@@ -77,13 +78,15 @@ pub trait TileAttention<AP: AttentionPrecision>: 'static + Send + Sync {

     fn init_softmax(#[comptime] config: Self::Config) -> Self::SoftmaxTile;

-    fn fill_key<E: Numeric>(
+    fn init_state(#[comptime] config: Self::Config) -> RunningState<SM<AP>>;
+
+    fn fill_key<E: Float>(
         tile: &StridedTile<E>,
         rhs: &mut Self::KeyValueTile,
         #[comptime] config: Self::Config,
     );

-    fn fill_value<E: Numeric>(
+    fn fill_value<E: Float>(
         tile: &StridedTile<E>,
         rhs: &mut Self::KeyValueTile,
         #[comptime] config: Self::Config,
@@ -102,14 +105,20 @@ pub trait TileAttention<AP: AttentionPrecision>: 'static + Send + Sync {
         softmax: &mut Self::SoftmaxTile,
         mask: TileMask,
         state: &mut RunningState<SM<AP>>,
+        max_placeholder: &mut RowWise<SM<AP>>,
+        sum_placeholder: &mut RowWise<SM<AP>>,
         #[comptime] dk: u32,
-    ) -> RowWise<ACC<AP>>;
+        #[comptime] config: Self::Config,
+    ) -> RowWise<SM<AP>>;

     fn accumulate_value(
         softmax: &Self::SoftmaxTile,
         key_value: &Self::KeyValueTile,
         accumulator: &mut Self::AccumulatorTile,
-        scale: &RowWise<ACC<AP>>,
+        scale: &RowWise<SM<AP>>,
         #[comptime] config: Self::Config,
     );
+
+    fn init_max_placeholder(#[comptime] num_rows: u32) -> RowWise<SM<AP>>;
+    fn init_sum_placeholder(#[comptime] num_rows: u32) -> RowWise<SM<AP>>;
 }
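The init_max_placeholder/init_sum_placeholder pair and the extra &mut RowWise<SM<AP>> arguments suggest the row-wise reduction scratch is now owned by the caller and reused across every softmax call rather than re-created per tile. A rough plain-Rust stand-in for that shape (hypothetical; the real RowWise is a CubeCL type whose API is not shown in this diff):

/// Illustrative stand-in for `RowWise<E>`: one value per row a unit owns.
struct RowWise { vals: Vec<f32> }

impl RowWise {
    /// Analogue of `init_max_placeholder`: seeded for a max reduction.
    fn min_filled(num_rows: usize) -> Self {
        Self { vals: vec![f32::NEG_INFINITY; num_rows] }
    }
    /// Analogue of `init_sum_placeholder`: seeded for a sum reduction.
    fn zero_filled(num_rows: usize) -> Self {
        Self { vals: vec![0.0; num_rows] }
    }
    /// Row-wise max over a row-major tile, written into the reused scratch.
    fn row_max_of(&mut self, tile: &[f32], num_cols: usize) {
        for (r, v) in self.vals.iter_mut().enumerate() {
            *v = tile[r * num_cols..(r + 1) * num_cols]
                .iter()
                .cloned()
                .fold(f32::NEG_INFINITY, f32::max);
        }
    }
}

fn main() {
    let mut max_scratch = RowWise::min_filled(2);
    let _sum_scratch = RowWise::zero_filled(2); // reused the same way for row sums
    max_scratch.row_max_of(&[1.0, 3.0, 2.0, 0.5, 4.0, 0.0], 3);
    println!("{:?}", max_scratch.vals); // [3.0, 4.0]
}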

0 commit comments
