Skip to content

Commit 290a4c7

Browse files
authored
feat: Matmul global quant (#960)
1 parent 35e917c commit 290a4c7

File tree

25 files changed

+510
-191
lines changed

25 files changed

+510
-191
lines changed

crates/cubecl-common/src/quant/scheme.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ pub struct BlockSize {
224224
}
225225

226226
impl BlockSize {
227+
/// Max number of dimensions for block size
228+
pub const MAX_DIMS: usize = MAX_DIMS;
229+
227230
/// Create a new blocksize from a set of values. The number of values must be `<= MAX_DIMS`.
228231
pub fn new(values: impl AsRef<[u8]>) -> Self {
229232
let values = values.as_ref();

crates/cubecl-convolution/src/components/global/read/reader/bias.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use cubecl_std::{
66
};
77

88
use cubecl_matmul::components::{
9-
MatmulIdent, MatrixPrecision, StageIdent,
9+
MatrixPrecision, StageIdent,
1010
global::GlobalConfig,
1111
stage::{StageMemoryConfig, StridedStage},
1212
};
@@ -33,7 +33,7 @@ impl<IP: MatrixPrecision> BiasGlobalReader<IP> {
3333
pub fn load_stage<G: GlobalConfig>(&mut self, #[comptime] config: G) {
3434
match self {
3535
BiasGlobalReader::Some { view, stage } => {
36-
let line_size = config.global_line_size(MatmulIdent::Out);
36+
let line_size = view.line_size();
3737
let num_stage_elements = config.tiling_scheme().elements_in_stage_n();
3838

3939
let unit_id = UNIT_POS_Y * config.plane_dim() + UNIT_POS_X;

crates/cubecl-matmul/src/base.rs

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1+
use cubecl_common::quant::scheme::QuantScheme;
12
use cubecl_core::{
23
Runtime,
34
client::ComputeClient,
4-
prelude::{Numeric, TensorHandleRef},
5+
prelude::{CubePrimitive, Numeric, TensorHandleRef},
56
};
67

78
use cubecl_std::tensor::TensorHandle;
@@ -94,33 +95,49 @@ pub enum AsyncReadingStrategy {
9495
Tma,
9596
}
9697

97-
pub enum MatmulInputHandle<R: Runtime, E: Numeric> {
98+
pub enum MatmulInputHandle<R: Runtime, E: CubePrimitive, S: CubePrimitive = f32> {
9899
Normal(TensorHandle<R, E>),
99100
Quantized {
100101
data: TensorHandle<R, E>,
101-
scale: TensorHandle<R, f32>,
102+
scale: TensorHandle<R, S>,
103+
shape: Vec<usize>,
104+
scheme: QuantScheme,
102105
},
103106
}
104107

105108
impl<R: Runtime, E: Numeric> MatmulInputHandle<R, E> {
106109
pub fn as_ref(&self) -> MatmulInputHandleRef<'_, R> {
107110
match self {
108111
MatmulInputHandle::Normal(handle) => MatmulInputHandleRef::Normal(handle.as_ref()),
109-
MatmulInputHandle::Quantized { data, scale } => MatmulInputHandleRef::Quantized {
112+
MatmulInputHandle::Quantized {
113+
data,
114+
scale,
115+
shape,
116+
scheme,
117+
} => MatmulInputHandleRef::Quantized {
110118
data: data.as_ref(),
111119
scale: scale.as_ref(),
120+
shape,
121+
scheme,
112122
},
113123
}
114124
}
115125
}
116126

117-
impl<R: Runtime, E: Numeric> Clone for MatmulInputHandle<R, E> {
127+
impl<R: Runtime, E: CubePrimitive> Clone for MatmulInputHandle<R, E> {
118128
fn clone(&self) -> Self {
119129
match self {
120130
Self::Normal(handle) => Self::Normal(handle.clone()),
121-
Self::Quantized { data, scale } => Self::Quantized {
131+
Self::Quantized {
132+
data,
133+
scale,
134+
shape,
135+
scheme,
136+
} => Self::Quantized {
122137
data: data.clone(),
123138
scale: scale.clone(),
139+
shape: shape.clone(),
140+
scheme: *scheme,
124141
},
125142
}
126143
}
@@ -132,6 +149,9 @@ pub enum MatmulInputHandleRef<'a, R: Runtime> {
132149
Quantized {
133150
data: TensorHandleRef<'a, R>,
134151
scale: TensorHandleRef<'a, R>,
152+
/// Unpacked shape, excluding padding
153+
shape: &'a [usize],
154+
scheme: &'a QuantScheme,
135155
},
136156
}
137157

@@ -148,8 +168,18 @@ impl<'a, R: Runtime> MatmulInputHandleRef<'a, R> {
148168
Self::Normal(data)
149169
}
150170

151-
pub fn quantized(data: TensorHandleRef<'a, R>, scale: TensorHandleRef<'a, R>) -> Self {
152-
Self::Quantized { data, scale }
171+
pub fn quantized(
172+
data: TensorHandleRef<'a, R>,
173+
scale: TensorHandleRef<'a, R>,
174+
shape: &'a [usize],
175+
scheme: &'a QuantScheme,
176+
) -> Self {
177+
Self::Quantized {
178+
data,
179+
scale,
180+
shape,
181+
scheme,
182+
}
153183
}
154184

155185
pub fn data(&self) -> &TensorHandleRef<'a, R> {
@@ -172,6 +202,20 @@ impl<'a, R: Runtime> MatmulInputHandleRef<'a, R> {
172202
MatmulInputHandleRef::Quantized { scale, .. } => Some(scale),
173203
}
174204
}
205+
206+
pub fn scheme(&self) -> Option<&QuantScheme> {
207+
match self {
208+
MatmulInputHandleRef::Normal(_) => None,
209+
MatmulInputHandleRef::Quantized { scheme, .. } => Some(scheme),
210+
}
211+
}
212+
213+
pub fn shape(&self) -> &[usize] {
214+
match self {
215+
MatmulInputHandleRef::Normal(handle) => handle.shape,
216+
MatmulInputHandleRef::Quantized { shape, .. } => shape,
217+
}
218+
}
175219
}
176220

177221
#[allow(clippy::result_large_err)]
@@ -310,7 +354,7 @@ pub fn launch_ref<R: Runtime, MP: MatmulPrecision>(
310354
layered::launch_ref::<R, MP, DoubleUnitAlgorithm>(client, lhs, rhs, out, selection)
311355
}
312356
Strategy::Naive => {
313-
naive::launch_ref::<R, LhsG<MP>, AccG<MP>>(client, lhs.data(), rhs.data(), out)?;
357+
naive::launch_ref::<R, LhsG<MP>, AccG<MP>>(client, lhs, rhs, out)?;
314358
Ok(())
315359
}
316360
Strategy::Auto => {

crates/cubecl-matmul/src/components/global/args.rs

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ use crate::{
1515
global::{
1616
GlobalConfig,
1717
memory::{
18-
BatchedGlobalLayout, BatchedGlobalLayoutLaunch, SimpleTmaGlobalLayout,
19-
SimpleTmaGlobalLayoutLaunch,
18+
BatchedGlobalLayout, BatchedGlobalLayoutLaunch, BatchedGlobalScaleLayout,
19+
SimpleTmaGlobalLayout, SimpleTmaGlobalLayoutLaunch,
2020
},
2121
},
2222
},
@@ -128,19 +128,44 @@ impl<Lhs: Numeric, Rhs: Numeric, Acc: Numeric> ConcreteInputsFactory
128128
config: impl BatchConfig,
129129
) -> Self::RuntimeArg<'a, R> {
130130
let config = config.global_config();
131-
let view = |handle, ident, line_size| {
132-
let layout = BatchedGlobalLayoutLaunch::from_handle(
133-
client,
134-
handle,
135-
problem,
136-
config.global_memory_config(ident),
137-
);
138-
ViewArg::new::<BatchedGlobalLayout>(handle.as_array_arg(line_size), layout)
131+
let view = |handle: &'a MatmulInputHandleRef<'a, R>, ident, line_size| match handle {
132+
MatmulInputHandleRef::Normal(handle) => {
133+
let layout = BatchedGlobalLayoutLaunch::from_handle(
134+
client,
135+
handle,
136+
problem,
137+
config.global_memory_config(ident),
138+
);
139+
ViewArg::new::<BatchedGlobalLayout>(handle.as_array_arg(line_size), layout)
140+
}
141+
MatmulInputHandleRef::Quantized {
142+
data,
143+
scale,
144+
shape,
145+
scheme,
146+
} => {
147+
let (data_layout, scales_layout) = BatchedGlobalLayoutLaunch::from_quantized_handle(
148+
client,
149+
data,
150+
scale,
151+
shape,
152+
problem,
153+
config.global_memory_config(ident),
154+
**scheme,
155+
);
156+
let data_view =
157+
ViewArg::new::<BatchedGlobalLayout>(data.as_array_arg(line_size), data_layout);
158+
let scales_view = ViewArg::new::<BatchedGlobalScaleLayout>(
159+
scale.as_array_arg(line_size),
160+
scales_layout,
161+
);
162+
ViewArg::new_quantized(data_view, scales_view, **scheme)
163+
}
139164
};
140165

141166
TensorInputsLaunch::new(
142-
view(lhs.data(), MatmulIdent::Lhs, line_sizes.lhs),
143-
view(rhs.data(), MatmulIdent::Rhs, line_sizes.rhs),
167+
view(lhs, MatmulIdent::Lhs, line_sizes.lhs),
168+
view(rhs, MatmulIdent::Rhs, line_sizes.rhs),
144169
CubeOptionArgs::None,
145170
)
146171
}

crates/cubecl-matmul/src/components/global/memory/config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use std::{fmt::Debug, hash::Hash};
22

33
use crate::components::MatrixLayout;
44

5-
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
5+
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq, Default)]
66
pub struct GlobalMemoryConfig {
77
pub elements_in_tile_row: u32,
88
pub elements_in_tile_col: u32,

0 commit comments

Comments (0)