Commit 0303f39

refactor: Matmul inputs (#949)
* Implement tensor map view launching
* WIP
* Use views for matmul
* Fix fusion and TMA
* Fix convolution launch
* Fix typos
1 parent caacb7b commit 0303f39

File tree

55 files changed: +1360 -2269 lines


crates/cubecl-attention/src/tests/macros/mod.rs

Lines changed: 1 addition & 9 deletions
```diff
@@ -8,20 +8,12 @@ use crate::{
     tests::attention_test_launcher::test_attention_algorithm,
 };
 
+#[derive(Default)]
 pub struct TestOptions {
     pub reuse_key_value: bool,
     pub two_rows_in_array_tile: bool,
 }
 
-impl Default for TestOptions {
-    fn default() -> Self {
-        Self {
-            reuse_key_value: false,
-            two_rows_in_array_tile: false,
-        }
-    }
-}
-
 pub fn attention_test_launch<A: Algorithm, R: Runtime>(
     client: ComputeClient<R::Server, R::Channel>,
     tiling_scheme: AttentionTilingScheme,
```
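The derive is a drop-in replacement for the deleted manual impl, since `Default` for `bool` is `false`. A standalone sketch of the equivalence (plain Rust, no cubecl dependencies):

```rust
// Deriving `Default` produces the same values the hand-written impl did,
// because `bool::default()` is `false`.
#[derive(Default, Debug, PartialEq)]
pub struct TestOptions {
    pub reuse_key_value: bool,
    pub two_rows_in_array_tile: bool,
}

fn main() {
    let opts = TestOptions::default();
    assert_eq!(
        opts,
        TestOptions {
            reuse_key_value: false,
            two_rows_in_array_tile: false,
        }
    );
}
```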

crates/cubecl-convolution/src/components/config.rs

Lines changed: 22 additions & 40 deletions
```diff
@@ -15,29 +15,25 @@ use super::*;
 /// Convolution specific config, extends regular matmul [`Config`](global::Config)
 pub trait ConvGemmConfig: GlobalConfig {
     /// The size of the convolution kernel at `dim`
-    fn kernel_size(&self, dim: u32) -> u32;
-    /// The dilation of the kernel at `dim`
-    fn dilation(&self, dim: u32) -> u32;
-    /// The stride of the kernel at `dim`
-    fn stride(&self, dim: u32) -> u32;
-    /// The padding of the kernel at `dim`
-    fn padding(&self, dim: u32) -> i32;
-    /// The dimensionality of the kernel
-    fn dimensionality(&self) -> Dimensionality;
-
+    fn convolution_params(&self) -> ConvolutionParams;
     fn line_sizes(&self) -> MatmulLineSizes;
     fn check_spatial_bounds(&self) -> bool;
 }
 
 #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
 pub struct ConvolutionConfig<M: GlobalConfig> {
     matmul: M,
+    params: ConvolutionParams,
+    num_stages: u32,
+}
+
+#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
+pub struct ConvolutionParams {
     pub kernel_size: [u32; 3],
     pub stride: [u32; 3],
     pub dilation: [u32; 3],
     pub padding: [i32; 3],
-    dimensionality: Dimensionality,
-    num_stages: u32,
+    pub dimensionality: Dimensionality,
 }
 
 impl<M: GlobalConfig> Deref for ConvolutionConfig<M> {
@@ -121,24 +117,8 @@ impl<M: GlobalConfig> GlobalConfig for ConvolutionConfig<M> {
 }
 
 impl<M: GlobalConfig> ConvGemmConfig for ConvolutionConfig<M> {
-    fn kernel_size(&self, dim: u32) -> u32 {
-        self.kernel_size[dim as usize]
-    }
-
-    fn dilation(&self, dim: u32) -> u32 {
-        self.dilation[dim as usize]
-    }
-
-    fn stride(&self, dim: u32) -> u32 {
-        self.stride[dim as usize]
-    }
-
-    fn padding(&self, dim: u32) -> i32 {
-        self.padding[dim as usize]
-    }
-
-    fn dimensionality(&self) -> Dimensionality {
-        self.dimensionality
+    fn convolution_params(&self) -> ConvolutionParams {
+        self.params
     }
 
     fn line_sizes(&self) -> cubecl_matmul::components::MatmulLineSizes {
@@ -150,10 +130,10 @@ impl<M: GlobalConfig> ConvGemmConfig for ConvolutionConfig<M> {
     }
 
     fn check_spatial_bounds(&self) -> bool {
-        let spatial_dims = self.dimensionality.num_dims();
+        let spatial_dims = self.params.dimensionality.num_dims();
         let mut has_padding = false;
         for i in 0..spatial_dims {
-            has_padding |= self.padding[i as usize] != 0;
+            has_padding |= self.params.padding[i as usize] != 0;
         }
         has_padding
     }
@@ -172,20 +152,22 @@ impl<M: GlobalConfig> ConvolutionConfig<M> {
     ) -> Result<Self, MatmulSetupError> {
         let dims = kernel_size.len();
 
-        let mut this = Self {
-            matmul,
+        let mut params = ConvolutionParams {
             kernel_size: [0; 3],
             stride: [0; 3],
             dilation: [0; 3],
             padding: [0; 3],
             dimensionality: dim,
-            num_stages,
         };
-        this.kernel_size[0..dims].copy_from_slice(kernel_size);
-        this.stride[0..dims].copy_from_slice(stride);
-        this.dilation[0..dims].copy_from_slice(dilation);
-        this.padding[0..dims].copy_from_slice(padding);
-        Ok(this)
+        params.kernel_size[0..dims].copy_from_slice(kernel_size);
+        params.stride[0..dims].copy_from_slice(stride);
+        params.dilation[0..dims].copy_from_slice(dilation);
+        params.padding[0..dims].copy_from_slice(padding);
+        Ok(Self {
+            matmul,
+            params,
+            num_stages,
+        })
     }
 
     pub fn to_matmul_config(self) -> M {
```
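For callers, the per-dimension accessors (`kernel_size(dim)`, `stride(dim)`, ...) are replaced by a single `convolution_params()` call returning a `Copy` struct whose arrays are indexed directly. A standalone sketch of the new access pattern; `Dimensionality` is stubbed here and the consumer function is purely illustrative:

```rust
// Stub standing in for cubecl's `Dimensionality` enum.
#[allow(dead_code)]
#[derive(Copy, Clone, Debug)]
enum Dimensionality { Dim1, Dim2, Dim3 }

// Mirrors the `ConvolutionParams` struct added in this commit.
#[allow(dead_code)]
#[derive(Copy, Clone, Debug)]
struct ConvolutionParams {
    kernel_size: [u32; 3],
    stride: [u32; 3],
    dilation: [u32; 3],
    padding: [i32; 3],
    dimensionality: Dimensionality,
}

// Illustrative consumer: effective span of a dilated kernel, (k - 1) * d + 1.
// Before this commit it would have called `config.kernel_size(dim)` and
// `config.dilation(dim)`; now it reads one params struct and indexes it.
fn effective_kernel_span(params: ConvolutionParams, dim: usize) -> u32 {
    (params.kernel_size[dim] - 1) * params.dilation[dim] + 1
}

fn main() {
    let params = ConvolutionParams {
        kernel_size: [3, 3, 0],
        stride: [1, 1, 0],
        dilation: [2, 2, 0],
        padding: [1, 1, 0],
        dimensionality: Dimensionality::Dim2,
    };
    // A 3x3 kernel with dilation 2 covers a 5x5 window.
    assert_eq!(effective_kernel_span(params, 0), 5);
}
```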

crates/cubecl-convolution/src/components/global/args.rs

Lines changed: 120 additions & 11 deletions
```diff
@@ -2,61 +2,156 @@ use std::any::TypeId;
 
 use cubecl::prelude::*;
 use cubecl_core as cubecl;
+use cubecl_std::{
+    FastDivmodArgs,
+    tensor::{
+        View,
+        launch::ViewArg,
+        layout::{
+            Coords3d,
+            chain::{Chain, ChainLaunch},
+        },
+    },
+};
 
 use crate::{
-    components::ConvolutionProblem,
+    components::{
+        ConvGemmConfig, ConvolutionProblem,
+        global::{
+            layout::{
+                BiasLayout, BiasLayoutLaunch, Im2colLayout, Im2colLayoutLaunch, NhwcLayout,
+                NhwcLayoutLaunch, OutLayout, OutLayoutLaunch, WeightLayout, WeightLayoutLaunch,
+            },
+            read::layout::{
+                TmaDummyLayout, TmaDummyLayoutLaunch, TmaWeightLayout, TmaWeightLayoutLaunch,
+            },
+        },
+    },
     kernels::layered::algorithm::simple_tma::{calculate_lower_corner, calculate_upper_corner},
 };
 use cubecl_matmul::{
     MatmulInputHandleRef,
     components::{
-        MatmulLineSizes, MatmulSelection,
+        MatmulIdent, MatmulLineSizes, MatmulSelection,
         global::args::{TensorInputs, TensorInputsLaunch, TensorMapInputs, TensorMapInputsLaunch},
     },
 };
 
 /// Create the input runtime arguments for a matmul kernel that works on concrete inputs and
 /// output (not fused).
 pub trait ConcreteInputsFactory: LaunchArg {
+    #[allow(clippy::too_many_arguments)]
     fn create<'a, R: Runtime>(
+        client: &ComputeClient<R::Server, R::Channel>,
         lhs: &'a MatmulInputHandleRef<'a, R>,
         rhs: &'a MatmulInputHandleRef<'a, R>,
         bias: Option<&'a TensorHandleRef<'a, R>>,
         selection: &MatmulSelection,
         problem: &ConvolutionProblem,
         line_sizes: &MatmulLineSizes,
+        config: impl ConvGemmConfig,
+    ) -> Self::RuntimeArg<'a, R>;
+}
+
+/// Create the output runtime arguments for a matmul kernel that works on concrete inputs and
+/// output (not fused).
+pub trait ConcreteOutputFactory: LaunchArg {
+    fn create<'a, R: Runtime>(
+        client: &ComputeClient<R::Server, R::Channel>,
+        out: &'a TensorHandleRef<'a, R>,
+        selection: &MatmulSelection,
+        problem: &ConvolutionProblem,
+        line_sizes: &MatmulLineSizes,
+        config: impl ConvGemmConfig,
     ) -> Self::RuntimeArg<'a, R>;
 }
 
 impl<Lhs: Numeric, Rhs: Numeric, EO: Numeric> ConcreteInputsFactory for TensorInputs<Lhs, Rhs, EO> {
     fn create<'a, R: Runtime>(
+        client: &ComputeClient<R::Server, R::Channel>,
         lhs: &'a MatmulInputHandleRef<'a, R>,
         rhs: &'a MatmulInputHandleRef<'a, R>,
         bias: Option<&'a TensorHandleRef<'a, R>>,
         _selection: &MatmulSelection,
-        _problem: &ConvolutionProblem,
+        problem: &ConvolutionProblem,
         line_sizes: &MatmulLineSizes,
+        config: impl ConvGemmConfig,
     ) -> Self::RuntimeArg<'a, R> {
+        type LhsLayout = Chain<NhwcLayout, Im2colLayout>;
+        type RhsLayout = Chain<NhwcLayout, WeightLayout>;
+
+        let layout_nhwc = |handle, line_size, check| {
+            NhwcLayoutLaunch::from_handle(handle, line_size as u32, check)
+        };
+        let layout_lhs = Im2colLayoutLaunch::from_args(
+            client,
+            problem,
+            config.convolution_params(),
+            config.global_memory_config(MatmulIdent::Lhs),
+        );
+        let layout_rhs = WeightLayoutLaunch::from_args(
+            client,
+            problem,
+            config.convolution_params(),
+            config.global_memory_config(MatmulIdent::Rhs),
+        );
+        let layout_bias =
+            BiasLayoutLaunch::new(ScalarArg::new(problem.n as u32), line_sizes.out as u32);
+
+        let layout_lhs = {
+            let global = layout_nhwc(lhs.data(), line_sizes.lhs, config.check_spatial_bounds());
+            ChainLaunch::new(global, layout_lhs)
+        };
+        let layout_rhs = {
+            let global = layout_nhwc(rhs.data(), line_sizes.rhs, false);
+            ChainLaunch::new(global, layout_rhs)
+        };
+
         TensorInputsLaunch::new(
-            lhs.data().as_tensor_arg(line_sizes.lhs),
-            lhs.scale().map(|it| it.as_tensor_arg(1)).into(),
-            rhs.data().as_tensor_arg(line_sizes.rhs),
-            rhs.scale().map(|it| it.as_tensor_arg(1)).into(),
-            bias.map(|it| it.as_tensor_arg(line_sizes.out)).into(),
+            ViewArg::new::<LhsLayout>(lhs.data().as_array_arg(line_sizes.lhs), layout_lhs),
+            ViewArg::new::<RhsLayout>(rhs.data().as_array_arg(line_sizes.rhs), layout_rhs),
+            bias.map(|bias| {
+                ViewArg::new::<BiasLayout>(bias.as_array_arg(line_sizes.out), layout_bias)
+            })
+            .into(),
         )
     }
 }
 
+impl<EG: Numeric> ConcreteOutputFactory for View<Line<EG>, Coords3d, ReadWrite> {
+    fn create<'a, R: Runtime>(
+        client: &ComputeClient<R::Server, R::Channel>,
+        out: &'a TensorHandleRef<'a, R>,
+        _selection: &MatmulSelection,
+        problem: &ConvolutionProblem,
+        line_sizes: &MatmulLineSizes,
+        config: impl ConvGemmConfig,
+    ) -> Self::RuntimeArg<'a, R> {
+        type Layout = Chain<NhwcLayout, OutLayout>;
+
+        let global = NhwcLayoutLaunch::from_handle(out, line_sizes.out as u32, false);
+        let layout = OutLayoutLaunch::from_args(
+            client,
+            problem,
+            config.global_memory_config(MatmulIdent::Out),
+        );
+        let layout = ChainLaunch::new(global, layout);
+        ViewArg::new::<Layout>(out.as_array_arg(line_sizes.out), layout)
+    }
+}
+
 impl<Lhs: Numeric, Rhs: Numeric, EO: Numeric> ConcreteInputsFactory
     for TensorMapInputs<Lhs, Rhs, EO>
 {
     fn create<'a, R: Runtime>(
+        client: &ComputeClient<R::Server, R::Channel>,
         lhs: &'a MatmulInputHandleRef<'a, R>,
         rhs: &'a MatmulInputHandleRef<'a, R>,
         bias: Option<&'a TensorHandleRef<'a, R>>,
         selection: &MatmulSelection,
         problem: &ConvolutionProblem,
         line_sizes: &MatmulLineSizes,
+        config: impl ConvGemmConfig,
     ) -> Self::RuntimeArg<'a, R> {
         let tiling_scheme = selection.tiling_scheme;
         let stage_m = tiling_scheme.elements_in_stage_m();
@@ -119,9 +214,23 @@ impl<Lhs: Numeric, Rhs: Numeric, EO: Numeric> ConcreteInputsFactory
         )
         .with_prefetch(prefetch_rhs);
 
-        let bias = bias.map(|it| it.as_tensor_arg(line_sizes.out));
+        let padded_channels =
+            (problem.channels as u32).next_multiple_of(config.tiling_scheme().elements_in_tile_k());
+
+        // Dummy layout since we don't support im2col loading rn
+        let lhs_layout = TmaDummyLayoutLaunch::new();
+        let rhs_layout = TmaWeightLayoutLaunch::new(FastDivmodArgs::new(client, padded_channels));
 
-        // TODO: Think about how to handle scales with TMA
-        TensorMapInputsLaunch::new(lhs, rhs, bias.into())
+        let bias = bias.map(|bias| {
+            let layout =
+                BiasLayoutLaunch::new(ScalarArg::new(problem.n as u32), line_sizes.out as u32);
+            ViewArg::new::<BiasLayout>(bias.as_array_arg(line_sizes.out), layout)
+        });
+
+        TensorMapInputsLaunch::new(
+            ViewArg::new_tensor_map::<TmaDummyLayout>(lhs, lhs_layout),
+            ViewArg::new_tensor_map::<TmaWeightLayout>(rhs, rhs_layout),
+            bias.into(),
+        )
     }
 }
```
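The core technique in this file is layout chaining: each handle is wrapped in a `ViewArg` whose layout is a `Chain` of an inner coordinate mapping (im2col for lhs, weight for rhs) and an `NhwcLayout` that linearizes tensor coordinates into memory offsets. The sketch below mimics that composition with hypothetical types; it is not cubecl's actual `Layout`/`Chain` API, just the idea of resolving a matmul coordinate through two stages:

```rust
// Illustrative layout chaining, not cubecl's actual types.
// A `Layout` maps coordinates from one space to another; `Chain` composes two.
trait Layout {
    type In;
    type Out;
    fn map(&self, pos: Self::In) -> Self::Out;
}

/// Composes two layouts: `inner` first, then `outer`.
struct Chain<A, B> {
    inner: A,
    outer: B,
}

impl<A, B> Layout for Chain<A, B>
where
    A: Layout,
    B: Layout<In = A::Out>,
{
    type In = A::In;
    type Out = B::Out;
    fn map(&self, pos: Self::In) -> Self::Out {
        self.outer.map(self.inner.map(pos))
    }
}

/// Simplified im2col-style inner layout: a matmul (m, k) coordinate becomes
/// an (h, w, c) feature-map coordinate (1x1 kernel, stride 1, single batch).
struct Im2col { width: u32, channels: u32 }
impl Layout for Im2col {
    type In = (u32, u32);       // (m, k) in the virtual matmul
    type Out = (u32, u32, u32); // (h, w, c) in the feature map
    fn map(&self, (m, k): (u32, u32)) -> (u32, u32, u32) {
        (m / self.width, m % self.width, k % self.channels)
    }
}

/// NHWC-style outer layout: feature-map coordinate -> linear offset.
struct Nhwc { width: u32, channels: u32 }
impl Layout for Nhwc {
    type In = (u32, u32, u32);
    type Out = u32;
    fn map(&self, (h, w, c): (u32, u32, u32)) -> u32 {
        (h * self.width + w) * self.channels + c
    }
}

fn main() {
    let layout = Chain {
        inner: Im2col { width: 4, channels: 8 },
        outer: Nhwc { width: 4, channels: 8 },
    };
    // Matmul element (m=5, k=3) resolves to one linear offset in memory.
    println!("offset = {}", layout.map((5, 3)));
}
```

Because the composition happens in the layout, the matmul kernel itself only ever sees `(row, col)` coordinates; convolution-specific indexing lives entirely in the chained views.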

crates/cubecl-convolution/src/components/global/base.rs

Lines changed: 6 additions & 14 deletions
```diff
@@ -8,7 +8,7 @@ use cubecl_matmul::components::{
 };
 use cubecl_std::{
     CubeOption,
-    tensor::{layout::Coords2d, r#virtual::VirtualTensor},
+    tensor::{View, layout::Coords2d},
 };
 
 use crate::{
@@ -69,36 +69,28 @@ pub trait GlobalConvolution<MP: MatmulPrecision>: 'static + Send + Sync {
 
     /// Initializes the global reader for the input feature map with an appropriate layout
     fn init_lhs_global_reader(
-        lhs: VirtualTensor<LhsG<MP>>,
+        lhs: View<Line<LhsG<MP>>, Coords2d>,
         offset: Coords2d,
-        view_shape: Coords2d,
+        slice_size: Coords2d,
         runtime_args: &RuntimeArgs,
         #[comptime] config: Self::Config,
     ) -> Self::LhsGlobalReader;
 
     /// Initializes the global reader for the weights with an appropriate layout
     fn init_rhs_global_reader(
-        rhs: VirtualTensor<RhsG<MP>>,
-        offset: Coords2d,
-        view_shape: Coords2d,
-        runtime_args: &RuntimeArgs,
+        rhs: View<Line<RhsG<MP>>, Coords2d>,
         #[comptime] config: Self::Config,
     ) -> Self::RhsGlobalReader;
 
     /// Initializes the global reader for the bias with an appropriate layout
     fn init_bias_global_reader(
-        bias: CubeOption<VirtualTensor<AccG<MP>>>,
-        n_offset: u32,
-        slice_size: u32,
+        bias: CubeOption<View<Line<AccG<MP>>, Coords2d>>,
         #[comptime] config: Self::Config,
     ) -> Self::AccGlobalReader;
 
     /// Initializes the output feature map global writer with an appropriate layout
     fn init_global_writer(
-        out: VirtualTensor<AccG<MP>, ReadWrite>,
-        offset: Coords2d,
-        view_shape: Coords2d,
-        runtime_args: &RuntimeArgs,
+        out: View<Line<AccG<MP>>, Coords2d, ReadWrite>,
         #[comptime] config: Self::Config,
     ) -> Self::GlobalWriter;
 
```