Skip to content

Commit da0e292

Browse files
authored
feat: TMA views (#943)
1 parent 31818e5 commit da0e292

40 files changed

+1826
-1117
lines changed

crates/cubecl-attention/src/components/global/dummy/read.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ pub struct QueryReader<AP: AttentionPrecision> {
2323

2424
#[derive(CubeType)]
2525
pub struct DummyKeyReader<AP: AttentionPrecision, G: GlobalAttentionConfig> {
26-
global_iter: GlobalIterator<KG<AP>>,
26+
global_iter: GlobalIterator<Line<KG<AP>>>,
2727
stage_memory: StridedStage<KS<AP>, AttentionTilingLayout>,
2828

2929
#[cube(comptime)]
@@ -32,7 +32,7 @@ pub struct DummyKeyReader<AP: AttentionPrecision, G: GlobalAttentionConfig> {
3232

3333
#[derive(CubeType)]
3434
pub struct DummyValueReader<AP: AttentionPrecision, G: GlobalAttentionConfig> {
35-
global_iter: GlobalIterator<VG<AP>>,
35+
global_iter: GlobalIterator<Line<VG<AP>>>,
3636
stage_memory: StridedStage<VS<AP>, AttentionTilingLayout>,
3737

3838
#[cube(comptime)]

crates/cubecl-convolution/src/components/global/multi_stage/tma/convolution.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use cubecl_matmul::components::{
1515
};
1616
use cubecl_std::{
1717
CubeOption,
18-
tensor::{layout::Coords2d, r#virtual::VirtualTensor},
18+
tensor::{AsTensorView, AsTensorViewExpand, layout::Coords2d, r#virtual::VirtualTensor},
1919
};
2020

2121
use crate::{
@@ -27,6 +27,7 @@ use crate::{
2727
read::{
2828
bias::{BiasGlobalReader, BiasStage},
2929
im2col_tma::{TmaIm2colGlobalReader, TmaIm2colTiling},
30+
layout::TmaWeightLayout,
3031
weight_tma::{TmaWeightGlobalReader, TmaWeightTiling},
3132
},
3233
},
@@ -69,7 +70,7 @@ where
6970
type Config = ConvolutionConfig<SimpleTmaConfig<SMM::Config>>;
7071

7172
type LhsGlobalReader = TmaIm2colGlobalReader<MP::Lhs, Self::Config>;
72-
type RhsGlobalReader = TmaWeightGlobalReader<MP::Rhs, SMM::Config>;
73+
type RhsGlobalReader = TmaWeightGlobalReader<MP::Rhs>;
7374
type AccGlobalReader = BiasGlobalReader<MP::Acc>;
7475
type GlobalWriter = PlaneWriter<MP::Acc>;
7576

@@ -121,12 +122,12 @@ where
121122
let barrier = Barrier::new_with_tma_proxy(BarrierLevel::cube_coop(0u32));
122123

123124
lhs_reader.fill_stage(&barrier, stage);
124-
rhs_reader.fill_stage(&barrier, stage, stage_config);
125+
rhs_reader.fill_stage(&barrier, stage);
125126

126127
arrive_tma(&barrier, stages_bytes);
127128

128129
lhs_reader.advance_view(k_step);
129-
rhs_reader.advance_view(k_step);
130+
rhs_reader.advance_view();
130131

131132
barriers.push(barrier);
132133

@@ -168,12 +169,12 @@ where
168169

169170
// Refill stage and advance view
170171
lhs_reader.fill_stage(barrier, stage);
171-
rhs_reader.fill_stage(barrier, stage, stage_config);
172+
rhs_reader.fill_stage(barrier, stage);
172173

173174
arrive_tma(barrier, stages_bytes);
174175

175176
lhs_reader.advance_view(k_step);
176-
rhs_reader.advance_view(k_step);
177+
rhs_reader.advance_view();
177178
}
178179
}
179180

@@ -216,16 +217,15 @@ where
216217
fn init_rhs_global_reader(
217218
rhs: VirtualTensor<RhsG<MP>>,
218219
offset: Coords2d,
219-
_slice_size: Coords2d,
220+
slice_size: Coords2d,
220221
runtime_args: &RuntimeArgs,
221222
#[comptime] config: Self::Config,
222223
) -> Self::RhsGlobalReader {
223-
let (x_offset, y_offset) = offset;
224+
let layout = TmaWeightLayout::new(runtime_args.padded_channels);
225+
let rhs = rhs.as_tensor_map().unwrap().view_3d(layout);
224226
Self::RhsGlobalReader::new(
225-
rhs.as_tensor_map().unwrap(),
226-
x_offset,
227-
y_offset,
228-
runtime_args,
227+
rhs.slice(offset, slice_size),
228+
config.k_step,
229229
config.num_stages(MatmulIdent::Rhs),
230230
config.stage_memory_config(MatmulIdent::Rhs),
231231
)
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
use cubecl::prelude::*;
use cubecl_core as cubecl;
use cubecl_std::{
    FastDivmod,
    tensor::layout::{Coords2d, Coords3d, Layout, LayoutExpand},
};

/// Layout mapping 2D weight coordinates `(k, n)` onto the 3D coordinates of a
/// TMA tensor map, by splitting `k` into a kernel index and an input-channel
/// index using a division/modulo by the padded channel count.
#[derive(CubeType)]
pub struct TmaWeightLayout {
    // Divisor for splitting the combined `k` coordinate; `FastDivmod` provides
    // a fast combined div/mod operation.
    padded_channels: FastDivmod,
}

#[cube]
impl TmaWeightLayout {
    /// Creates a new weight layout with the given padded channel count.
    pub fn new(padded_channels: FastDivmod) -> Self {
        TmaWeightLayout { padded_channels }
    }
}

#[cube]
impl Layout for TmaWeightLayout {
    /// Logical coordinates: `(k, n)`.
    type Coordinates = Coords2d;
    /// Physical (tensor-map) coordinates: `(n, k_idx, in_c)`.
    type SourceCoordinates = Coords3d;

    fn to_source_pos(&self, pos: Self::Coordinates) -> Self::SourceCoordinates {
        let (k, n) = pos;
        // Split the flattened `k` into kernel index and in-channel index.
        let (k_idx, in_c) = self.padded_channels.div_mod(k);
        // Note the reordering: `n` becomes the leading source dimension.
        (n, k_idx, in_c)
    }

    fn is_in_bounds(&self, _pos: Self::Coordinates) -> bool {
        // Always in bounds from this layout's perspective.
        // NOTE(review): presumably out-of-bounds handling is delegated to the
        // TMA load itself — confirm against the tensor-map configuration.
        true.runtime()
    }

    fn shape(&self) -> Self::Coordinates {
        // Sentinel "unbounded" shape; this layout does not track a real extent.
        (u32::MAX, u32::MAX).runtime()
    }

    fn to_source_pos_checked(&self, pos: Self::Coordinates) -> (Self::SourceCoordinates, bool) {
        // Combined translation + bounds check (the check is trivially true here).
        (self.to_source_pos(pos), self.is_in_bounds(pos))
    }
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
pub mod bias;
22
pub mod im2col_tma;
3+
pub mod layout;
34
pub mod weight_tma;
Lines changed: 25 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,31 @@
1-
use core::marker::PhantomData;
2-
31
use cubecl_core::prelude::*;
42
use cubecl_core::{self as cubecl, prelude::barrier::Barrier};
53
use cubecl_matmul::components::{
6-
MatmulIdent, MatrixPrecision, StageIdent, stage::StageMemoryConfig,
4+
MatrixPrecision, StageIdent,
5+
global::memory::{GlobalIterator, ViewDirection},
6+
stage::StageMemoryConfig,
77
};
8-
use cubecl_std::FastDivmod;
8+
use cubecl_std::tensor::{View, layout::Coords2d};
99

1010
use cubecl_matmul::components::stage::RowMajorTilingOrder;
11-
use cubecl_matmul::components::{
12-
global::memory::MappedTensorReader,
13-
stage::{ContiguousTilingLayout, StageConfig, StridedStage},
14-
};
15-
16-
use crate::kernels::layered::selector::RuntimeArgs;
11+
use cubecl_matmul::components::stage::{ContiguousTilingLayout, StridedStage};
1712

1813
pub type TmaWeightTiling = ContiguousTilingLayout<RowMajorTilingOrder>;
1914
pub type TmaWeightStage<IP> = StridedStage<<IP as MatrixPrecision>::Stage, TmaWeightTiling>;
2015

2116
#[derive(CubeType)]
22-
pub struct TmaWeightGlobalReader<IP: MatrixPrecision, S: StageConfig> {
23-
pub tensor_view: MappedTensorReader<IP::Global>,
17+
pub struct TmaWeightGlobalReader<IP: MatrixPrecision> {
18+
pub global_iter: GlobalIterator<IP::Global>,
2419
pub stages: Sequence<StridedStage<IP::Stage, TmaWeightTiling>>,
25-
padded_channels: FastDivmod,
2620
#[cube(comptime)]
27-
_config: PhantomData<S>,
21+
config: StageMemoryConfig,
2822
}
2923

3024
#[cube]
31-
impl<IP: MatrixPrecision, S: StageConfig> TmaWeightGlobalReader<IP, S> {
25+
impl<IP: MatrixPrecision> TmaWeightGlobalReader<IP> {
3226
pub fn new(
33-
tensor: TensorMap<IP::Global>,
34-
x: u32,
35-
y: u32,
36-
runtime_args: &RuntimeArgs,
27+
global_view: View<IP::Global, Coords2d>,
28+
k_step: u32,
3729
#[comptime] num_stages: u32,
3830
#[comptime] config: StageMemoryConfig,
3931
) -> Self {
@@ -44,42 +36,32 @@ impl<IP: MatrixPrecision, S: StageConfig> TmaWeightGlobalReader<IP, S> {
4436
stages.push(StridedStage::new_aligned(StageIdent::Rhs, 128u32, config));
4537
}
4638

47-
let tensor_view = MappedTensorReader::new(tensor, x, y, 0);
39+
let global_iter = GlobalIterator::new(global_view, k_step, ViewDirection::Row, false);
4840

49-
TmaWeightGlobalReader::<IP, S> {
50-
tensor_view,
41+
TmaWeightGlobalReader::<IP> {
42+
global_iter,
5143
stages,
52-
padded_channels: runtime_args.padded_channels,
53-
_config: PhantomData::<S>,
44+
config,
5445
}
5546
}
5647

57-
pub fn fill_stage(
58-
&mut self,
59-
barrier: &Barrier,
60-
#[comptime] stage_idx: u32,
61-
#[comptime] config: S,
62-
) {
48+
pub fn fill_stage(&mut self, barrier: &Barrier, #[comptime] stage_idx: u32) {
6349
let stage = self.stages.index_mut(stage_idx);
50+
let config = comptime![self.config];
6451

6552
if UNIT_POS == 0 {
66-
let k = self.tensor_view.tile_x;
67-
let out_c = self.tensor_view.tile_y;
53+
let global_view = self.global_iter.view();
6854

69-
let tensor = self.tensor_view.tensor.try_cast_unchecked();
7055
let mut stage = stage.as_slice_mut(1u32);
71-
let slice_size = config.tiling_scheme().elements_in_stage_n()
72-
* config.tiling_scheme().elements_in_tile_k();
56+
let slice_size = config.elements_in_stage_col() * config.elements_in_tile_row;
7357

7458
#[unroll]
75-
for tile_k in 0..config.tiling_scheme().tiles_in_stage_k() {
59+
for tile_k in 0..config.tiles_in_stage_row {
7660
let slice_start = slice_size * tile_k;
77-
let mut slice = stage.slice_mut(slice_start, slice_size);
78-
79-
let k = k + tile_k * config.tiling_scheme().elements_in_tile_k();
80-
let (k_idx, in_c) = self.padded_channels.div_mod(k);
61+
let slice = stage.slice_mut(slice_start, slice_size);
8162

82-
barrier.tma_load_3d(&tensor, &mut slice, out_c as i32, k_idx as i32, in_c as i32);
63+
let k = tile_k * config.elements_in_tile_row;
64+
global_view.tensor_map_load(barrier, &mut slice.try_cast_unchecked(), (k, 0));
8365
}
8466
}
8567
}
@@ -88,7 +70,7 @@ impl<IP: MatrixPrecision, S: StageConfig> TmaWeightGlobalReader<IP, S> {
8870
*self.stages.index(stage_idx)
8971
}
9072

91-
pub fn advance_view(&mut self, k_offset: u32) {
92-
self.tensor_view.update_view(k_offset, MatmulIdent::Rhs);
73+
pub fn advance_view(&mut self) {
74+
self.global_iter.advance();
9375
}
9476
}

crates/cubecl-convolution/src/components/global/single_stage/tma/convolution.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ use cubecl_matmul::components::{
1515
};
1616
use cubecl_std::{
1717
CubeOption,
18-
tensor::{layout::Coords2d, r#virtual::VirtualTensor},
18+
tensor::{AsTensorView, AsTensorViewExpand, layout::Coords2d, r#virtual::VirtualTensor},
1919
};
2020

2121
use crate::{
@@ -27,6 +27,7 @@ use crate::{
2727
read::{
2828
bias::{BiasGlobalReader, BiasStage},
2929
im2col_tma::{TmaIm2colGlobalReader, TmaIm2colTiling},
30+
layout::TmaWeightLayout,
3031
weight_tma::{TmaWeightGlobalReader, TmaWeightTiling},
3132
},
3233
},
@@ -56,7 +57,7 @@ where
5657
type Config = ConvolutionConfig<SimpleTmaConfig<SMM::Config>>;
5758

5859
type LhsGlobalReader = TmaIm2colGlobalReader<MP::Lhs, Self::Config>;
59-
type RhsGlobalReader = TmaWeightGlobalReader<MP::Rhs, SMM::Config>;
60+
type RhsGlobalReader = TmaWeightGlobalReader<MP::Rhs>;
6061
type AccGlobalReader = BiasGlobalReader<MP::Acc>;
6162
type GlobalWriter = PlaneWriter<MP::Acc>;
6263

@@ -97,7 +98,7 @@ where
9798
sync_cube();
9899

99100
lhs_reader.fill_stage(&barrier, 0u32);
100-
rhs_reader.fill_stage(&barrier, 0u32, config.stage_config());
101+
rhs_reader.fill_stage(&barrier, 0u32);
101102

102103
arrive_tma(&barrier, stages_bytes);
103104

@@ -114,7 +115,7 @@ where
114115
);
115116

116117
lhs_reader.advance_view(k_step);
117-
rhs_reader.advance_view(k_step);
118+
rhs_reader.advance_view();
118119
}
119120

120121
sync_cube();
@@ -145,16 +146,15 @@ where
145146
fn init_rhs_global_reader(
146147
rhs: VirtualTensor<RhsG<MP>>,
147148
offset: Coords2d,
148-
_slice_size: Coords2d,
149+
slice_size: Coords2d,
149150
runtime_args: &RuntimeArgs,
150151
#[comptime] config: Self::Config,
151152
) -> Self::RhsGlobalReader {
152-
let (x_offset, y_offset) = offset;
153+
let layout = TmaWeightLayout::new(runtime_args.padded_channels);
154+
let rhs = rhs.as_tensor_map().unwrap().view_3d(layout);
153155
Self::RhsGlobalReader::new(
154-
rhs.as_tensor_map().unwrap(),
155-
x_offset,
156-
y_offset,
157-
runtime_args,
156+
rhs.slice(offset, slice_size),
157+
config.k_step,
158158
1u32,
159159
config.stage_memory_config(MatmulIdent::Rhs),
160160
)

crates/cubecl-matmul/src/components/global/memory/iterator.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@ use cubecl_std::tensor::{View, layout::Coords2d};
44

55
#[derive(Clone, CubeType)]
66
/// An iterator over global memory, advancing along k.
7-
pub struct GlobalIterator<EI: Numeric> {
8-
global_view: View<Line<EI>, Coords2d>,
7+
pub struct GlobalIterator<EI: CubePrimitive> {
8+
global_view: View<EI, Coords2d>,
99
offset: RuntimeCell<u32>,
1010
/// The amount to advance by on each iteration
1111
step: u32,
@@ -16,8 +16,8 @@ pub struct GlobalIterator<EI: Numeric> {
1616
checked: bool,
1717
}
1818

19-
unsafe impl<EG: Numeric> Sync for GlobalIterator<EG> {}
20-
unsafe impl<EG: Numeric> Send for GlobalIterator<EG> {}
19+
unsafe impl<EG: CubePrimitive> Sync for GlobalIterator<EG> {}
20+
unsafe impl<EG: CubePrimitive> Send for GlobalIterator<EG> {}
2121

2222
#[derive(CubeType, Clone, Copy)]
2323
pub enum ViewDirection {
@@ -28,14 +28,14 @@ pub enum ViewDirection {
2828
}
2929

3030
#[cube]
31-
impl<EG: Numeric> GlobalIterator<EG> {
31+
impl<EG: CubePrimitive> GlobalIterator<EG> {
3232
/// Instantiate a read iterator over the given global view, which should be sliced to the size
3333
/// of one `m`/`n` stage and the full range of `k` handled by this matmul instance.
3434
///
3535
/// `step` is the amount advanced in `view_direction` each iteration.
3636
/// `checked` determines whether the slices should be created as checked or unchecked.
3737
pub fn new(
38-
global_view: View<Line<EG>, Coords2d>,
38+
global_view: View<EG, Coords2d>,
3939
step: u32,
4040
#[comptime] view_direction: ViewDirection,
4141
#[comptime] checked: bool,
@@ -63,7 +63,7 @@ impl<EG: Numeric> GlobalIterator<EG> {
6363
}
6464

6565
/// Returns the current view slice of the iterator
66-
pub fn view(&self) -> View<Line<EG>, Coords2d> {
66+
pub fn view(&self) -> View<EG, Coords2d> {
6767
let offset = match comptime![self.view_direction] {
6868
ViewDirection::Row => (self.offset.read(), 0u32),
6969
ViewDirection::Col => (0u32, self.offset.read()),

0 commit comments

Comments
 (0)