tracel-ai
diff --git a/‎crates/cubecl-common/src/lib.rs‎
Lines changed: 3 additions & 0 deletions b/‎crates/cubecl-common/src/lib.rs‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎crates/cubecl-common/src/quant/mod.rs‎
Lines changed: 2 additions & 0 deletions b/‎crates/cubecl-common/src/quant/mod.rs‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎crates/cubecl-quant/src/scheme.rs‎ renamed to ‎crates/cubecl-common/src/quant/scheme.rs‎
Lines changed: 20 additions & 3 deletions b/‎crates/cubecl-quant/src/scheme.rs‎ renamed to ‎crates/cubecl-common/src/quant/scheme.rs‎
Lines changed: 20 additions & 3 deletions
diff --git a/‎crates/cubecl-cpu/src/lib.rs‎
Lines changed: 1 addition & 0 deletions b/‎crates/cubecl-cpu/src/lib.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/cubecl-cuda/src/lib.rs‎
Lines changed: 1 addition & 0 deletions b/‎crates/cubecl-cuda/src/lib.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/cubecl-macros/src/parse/expression.rs‎
Lines changed: 1 addition & 1 deletion b/‎crates/cubecl-macros/src/parse/expression.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/cubecl-quant/src/lib.rs‎
Lines changed: 1 addition & 1 deletion b/‎crates/cubecl-quant/src/lib.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/cubecl-std/Cargo.toml‎
Lines changed: 1 addition & 0 deletions b/‎crates/cubecl-std/Cargo.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/cubecl-std/src/lib.rs‎
Lines changed: 2 additions & 0 deletions b/‎crates/cubecl-std/src/lib.rs‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎crates/cubecl-std/src/quant/dequantize.rs‎
Lines changed: 109 additions & 0 deletions b/‎crates/cubecl-std/src/quant/dequantize.rs‎
Lines changed: 109 additions & 0 deletions
@@ -48,6 +48,9 @@ pub mod reader;
 /// Future utils with a compatible API for native, non-std and wasm environments.
 pub mod future;
 
+/// Quantization primitives required outside of `cubecl-quant`
+pub mod quant;
+
 /// Various utilities to create ID's.
 extern crate alloc;
 
 
@@ -0,0 +1,2 @@
+/// Types representing the quantization scheme
+pub mod scheme;
@@ -1,7 +1,6 @@
 use alloc::vec;
 use alloc::vec::Vec;
 use core::{default::Default, ops::Deref};
-use cubecl_common::{e4m3, e5m2};
 use serde::{Deserialize, Serialize};
 
 /// Describes a quantization scheme/configuration.
@@ -79,6 +78,12 @@ impl QuantScheme {
     pub fn num_quants(&self) -> usize {
         self.size_bits_stored() / self.value.size_bits()
     }
+
+    /// Native packing factor of the quantized value type, as reported by
+    /// [`QuantValue::native_packing`]. When the factor is greater than 1, the packed
+    /// representation stores `num_quants` elements grouped into packs of `native_packing` size.
+    pub fn native_packing(&self) -> usize {
+        QuantValue::native_packing(&self.value)
+    }
 }
 
 /// Level or granularity of quantization.
@@ -91,6 +96,7 @@ pub enum QuantLevel {
 }
 
 impl QuantLevel {
+    /// Converting constructor for [`QuantLevel::Block`].
+    ///
+    /// `values` is forwarded unchanged to [`BlockSize::new`], and the resulting block size is
+    /// wrapped in the `Block` variant.
     pub fn block(values: impl AsRef<[u8]>) -> Self {
         QuantLevel::Block(BlockSize::new(values))
     }
@@ -129,6 +135,15 @@ impl QuantValue {
         }
     }
 
+    /// Packing factor for the native representation used for intermediate values. If > 1, values
+    /// should always be processed in `native_packing` sized chunks.
+    ///
+    /// Only `E2M1` is packed (two values per native element); every other format is processed
+    /// one value at a time.
+    pub fn native_packing(&self) -> usize {
+        // Deliberately exhaustive: adding a variant must force an explicit packing decision
+        // instead of silently falling into a catch-all arm.
+        match self {
+            QuantValue::E2M1 => 2,
+            QuantValue::Q8F
+            | QuantValue::Q4F
+            | QuantValue::Q2F
+            | QuantValue::Q8S
+            | QuantValue::Q4S
+            | QuantValue::Q2S
+            | QuantValue::E4M3
+            | QuantValue::E5M2 => 1,
+        }
+    }
+
     /// The possible range of values allowed by the quant value.
     pub fn range(&self) -> (f32, f32) {
         match self {
@@ -138,8 +153,8 @@ impl QuantValue {
             QuantValue::Q8S => (-i8::MAX as f32, i8::MAX as f32),
             QuantValue::Q4S => (-7.0, 7.0),
             QuantValue::Q2S => (-1.0, 1.0),
-            QuantValue::E4M3 => (e4m3::MIN as f32, e4m3::MAX as f32),
-            QuantValue::E5M2 => (e5m2::MIN as f32, e5m2::MAX as f32),
+            QuantValue::E4M3 => (-448.0, 448.0),
+            QuantValue::E5M2 => (-57344.0, 57344.0),
             QuantValue::E2M1 => (-6.0, 6.0), // Hardcoded because of no-std
         }
     }
@@ -253,10 +268,12 @@ impl BlockSize {
         out
     }
 
+    /// Create an iterator over all stored dimensions.
+    ///
+    /// Items are borrowed (`&u8`) from the slice returned by `as_slice`.
     pub fn iter(&self) -> impl Iterator<Item = &u8> {
         self.as_slice().iter()
     }
 
+    /// Returns the total number of elements in each block.
+    ///
+    /// This is the product of every dimension yielded by `iter`, each widened to `usize` before
+    /// multiplying. If `iter` yields nothing, the result is `1` (the empty product).
     pub fn num_elements(&self) -> usize {
         self.iter().map(|it| *it as usize).product()
     }
 
@@ -11,6 +11,7 @@ mod tests {
     cubecl_core::testgen_all!(f32: [f16, f32, f64], i32: [i8, i16, i32, i64], u32: [u8, u16, u32, u64]);
     cubecl_std::testgen!();
     cubecl_std::testgen_tensor_identity!([f16, f32, u32]);
+    cubecl_std::testgen_quantized_view!(f32);
     cubecl_random::testgen_random!();
     cubecl_matmul::testgen_matmul_simple!([f16, f32]);
     cubecl_matmul::testgen_matmul_unit!();
 
@@ -87,6 +87,7 @@ mod tests {
     // TODO: re-instate matmul quantized tests
     cubecl_matmul::testgen_matmul_simple!([f16, bf16, f32]);
     cubecl_std::testgen_tensor_identity!([f16, bf16, f32, u32]);
+    cubecl_std::testgen_quantized_view!(f16);
     cubecl_convolution::testgen_conv2d_accelerated!([f16: f16, bf16: bf16, f32: tf32]);
     cubecl_reduce::testgen_reduce!([f16, bf16, f32, f64]);
     cubecl_random::testgen_random!();
 
@@ -482,7 +482,7 @@ fn fn_associated_type(path: &Expression) -> Option<(Path, Option<QSelf>, PathSeg
     // All supported primitives. Primitives don't start with an uppercase letter
     const PRIMITIVES: &[&str] = &[
         "bool", "i8", "i16", "i32", "i64", "u8", "u16", "u32", "u64", "f16", "bf16", "f32", "f64",
-        "flex32", "e2m1", "e2m3", "e3m2", "e4m3", "e5m2", "ue8m0",
+        "flex32", "e2m1", "e2m1x2", "e2m3", "e3m2", "e4m3", "e5m2", "ue8m0",
     ];
     if !matches!(path, Expression::Path { .. }) {
         panic!("path: {path:?}");
 
@@ -11,7 +11,7 @@ pub mod quantize;
 #[cfg(feature = "kernels")]
 pub mod layout;
 
-pub mod scheme;
+pub use cubecl_common::quant::scheme;
 
 #[cfg(feature = "export_tests")]
 pub mod tests;
 
@@ -19,6 +19,7 @@ export_tests = []
 
 [dependencies]
 
+cubecl-common = { path = "../cubecl-common", version = "0.7.0", default-features = false }
 cubecl-core = { path = "../cubecl-core", version = "0.7.0", default-features = false }
 cubecl-runtime = { path = "../cubecl-runtime", version = "0.7.0", default-features = false }
 half.workspace = true
 
@@ -9,6 +9,8 @@ pub use fast_math::*;
 mod option;
 pub use option::*;
 
+/// Quantization functionality required in views
+pub mod quant;
 pub mod tensor;
 
 #[cfg(feature = "export_tests")]
 
@@ -0,0 +1,109 @@
+use cubecl::prelude::*;
+use cubecl_common::quant::scheme::*;
+use cubecl_common::{e2m1x2, e4m3, e5m2};
+use cubecl_core::{self as cubecl, intrinsic};
+
+/// Dequantize a line of values, where `line_size * num_quants` is a power of two.
+/// Unaligned values can't be dequantized in place.
+///
+/// # Arguments
+/// * `value` - The quantized line. With [`QuantStore::Native`] it is cast directly to `F`; with
+///   [`QuantStore::U32`] it is cast to a line of `u32` and unpacked by [`unpack_cast_u32`].
+/// * `scale` - A single scale, cast to `F` and applied to every dequantized element.
+/// * `scheme` - Compile-time quantization scheme selecting the storage and mode branches.
+///
+/// # Returns
+/// The dequantized line of `F` values.
+#[cube]
+pub fn dequantize_aligned<Q: CubePrimitive, S: CubePrimitive, F: Float>(
+    value: Line<Q>,
+    scale: S,
+    #[comptime] scheme: QuantScheme,
+) -> Line<F> {
+    // Bring the stored representation into the float type, unpacking first when several
+    // quantized values share one `u32`.
+    let q_values = match scheme.store {
+        QuantStore::Native => Line::<F>::cast_from(value),
+        QuantStore::U32 => unpack_cast_u32::<F>(Line::cast_from(value), scheme),
+    };
+    let scale = Line::<F>::cast_from(scale);
+
+    match scheme.mode {
+        // Symmetric quantization has no offset: the result is just `q * scale`.
+        QuantMode::Symmetric => q_values * scale,
+    }
+}
+
+/// Unpack a set of values from u32, and convert to the specified floating point format.
+///
+/// Each `u32` element of `value` yields `scheme.num_quants()` output elements, so the returned
+/// line has `value.line_size() * num_quants` elements. Values are extracted in groups of
+/// `scheme.native_packing()` and converted with `cast_masked`.
+#[cube]
+pub fn unpack_cast_u32<F: Float>(value: Line<u32>, #[comptime] scheme: QuantScheme) -> Line<F> {
+    let num_quants = comptime![scheme.num_quants() as u32];
+    let native_packing = comptime![scheme.native_packing() as u32];
+    let out_line_size = comptime![value.line_size() * num_quants];
+    let size_bits = comptime![scheme.size_bits_value() as u32];
+    // Mask for one extracted group: 8 bits for `e2m1`, otherwise the value's bit width
+    // (see `packing_mask`).
+    let mask = comptime![packing_mask(scheme)];
+
+    let mut out = Line::<F>::empty(out_line_size);
+
+    #[unroll]
+    for line_idx in 0..value.line_size() {
+        // `unwrap` turns the unrolled loop index into a comptime constant so it can be used in
+        // the `comptime!` expressions below.
+        let line_idx = unwrap(line_idx);
+        let packed_val = value[line_idx];
+        // First output position belonging to this `u32` element.
+        let out_offset = comptime![line_idx * num_quants];
+        #[unroll]
+        for packed_idx in range_stepped(0, num_quants, native_packing) {
+            let packed_idx = unwrap(packed_idx);
+            // `packed_idx` advances by `native_packing`, so the shift skips whole groups.
+            let shift = packed_idx * size_bits;
+            let value = (packed_val >> shift) & mask;
+
+            // One group converts to `native_packing` floats.
+            let float_value = cast_masked::<F>(value, scheme);
+
+            #[unroll]
+            for native_idx in 0..native_packing {
+                let native_idx = unwrap(native_idx);
+                // Shadows the per-element base offset with the final output position.
+                let out_offset = comptime![out_offset + packed_idx + native_idx];
+                out[out_offset] = float_value[native_idx];
+            }
+        }
+    }
+
+    out
+}
+
+/// Builds the bit mask that isolates one extracted group in the low bits of a packed `u32`.
+/// `e2m1` is converted through its native packed form, so its mask spans 8 bits; every other
+/// format uses the bit width reported by `size_bits`.
+fn packing_mask(scheme: QuantScheme) -> u32 {
+    let width = if matches!(scheme.value, QuantValue::E2M1) {
+        8 // Packed conversion
+    } else {
+        scheme.value.size_bits()
+    };
+    let one_past_mask = 1u32 << width;
+    one_past_mask - 1
+}
+
+/// Cast a masked-out value in the low `n` bits of a `u32` to the specified float type.
+/// Applies sign conversion for integer quantization before casting to the float type,
+/// while minifloats are simply truncated to `u8`, reinterpreted and then cast.
+/// For `e2m1`, casting is done on the packed `e2m1x2` representation.
+///
+/// # Arguments
+/// * `value` - The already shifted and masked bits of one extracted group.
+/// * `scheme` - Compile-time quantization scheme; `scheme.value` selects the conversion.
+///
+/// # Returns
+/// Two floating point numbers for `e2m1`, one for all other formats.
+#[cube]
+fn cast_masked<F: Float>(value: u32, #[comptime] scheme: QuantScheme) -> Line<F> {
+    match scheme.value {
+        // For minifloat we can assume if they're supported then u8 is supported
+        QuantValue::E5M2 => Line::<F>::cast_from(e5m2::reinterpret(value as u8)),
+        QuantValue::E4M3 => Line::<F>::cast_from(e4m3::reinterpret(value as u8)),
+        QuantValue::E2M1 => Line::<F>::cast_from(e2m1x2::reinterpret(value as u8)),
+        QuantValue::Q8F
+        | QuantValue::Q4F
+        | QuantValue::Q2F
+        | QuantValue::Q8S
+        | QuantValue::Q4S
+        | QuantValue::Q2S => {
+            // `n` in the comments below is the bit width of one quantized value.
+            let size_quant = comptime!(scheme.size_bits_value() as u32);
+            let sign_bit = comptime!(1u32 << (size_quant - 1));
+            let two_pow_n = comptime!(1 << size_quant);
+
+            // Branchless two's complement conversion
+            // If raw >= 2^(n-1), then result = raw - 2^n
+            // e.g. n = 4: raw 0b1111 (15) >= 8, so the result is 15 - 16 = -1.
+            let raw_i32 = value as i32;
+            let is_negative = (value >= sign_bit) as i32; // 1 if negative, 0 if positive
+            let signed_value = raw_i32 - (is_negative * two_pow_n);
+            Line::<F>::cast_from(signed_value)
+        }
+    }
+}
+
+/// Extracts the constant behind `v` as a comptime `u32`.
+///
+/// Used on `#[unroll]`ed loop indices so they can take part in `comptime!` expressions.
+///
+/// # Panics
+/// Panics with "Must be constant" when `v.constant()` yields no value.
+// NOTE(review): the `allow` is presumably needed because `v` is only referenced inside the
+// `intrinsic!` closure — confirm against the `#[cube]` expansion.
+#[allow(unused_variables)]
+#[cube]
+pub(crate) fn unwrap(v: u32) -> comptime_type!(u32) {
+    intrinsic!(|_| v.constant().expect("Must be constant").as_u32())
+}
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+/// Types representing the quantization scheme`
	`2`	`+pub mod scheme;`
Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,6 @@`
`1`	`1`	`use alloc::vec;`
`2`	`2`	`use alloc::vec::Vec;`
`3`	`3`	`use core::{default::Default, ops::Deref};`
`4`		`-use cubecl_common::{e4m3, e5m2};`
`5`	`4`	`use serde::{Deserialize, Serialize};`
`6`	`5`
`7`	`6`	`/// Describes a quantization scheme/configuration.`
`@@ -79,6 +78,12 @@ impl QuantScheme {`
`79`	`78`	`pub fn num_quants(&self) -> usize {`
`80`	`79`	`self.size_bits_stored() / self.value.size_bits()`
`81`	`80`	`}`
	`81`	`+`
	`82`	`+ /// Returns the native packing factor for the values. When native packing > 1, the packed`
	`83`	+ /// representation stores `num_quants` elements grouped into packs of `native_packing` size.
	`84`	`+ pub fn native_packing(&self) -> usize {`
	`85`	`+ self.value.native_packing()`
	`86`	`+ }`
`82`	`87`	`}`
`83`	`88`
`84`	`89`	`/// Level or granularity of quantization.`
`@@ -91,6 +96,7 @@ pub enum QuantLevel {`
`91`	`96`	`}`
`92`	`97`
`93`	`98`	`impl QuantLevel {`
	`99`	+ /// Converting constructor for [`QuantLevel::Block`]
`94`	`100`	`pub fn block(values: impl AsRef<[u8]>) -> Self {`
`95`	`101`	`QuantLevel::Block(BlockSize::new(values))`
`96`	`102`	`}`
`@@ -129,6 +135,15 @@ impl QuantValue {`
`129`	`135`	`}`
`130`	`136`	`}`
`131`	`137`
	`138`	`+ /// Packing factor for the native representation used for intermediate values. If > 1, values`
	`139`	+ /// should always be processed in `native_packing` sized chunks.
	`140`	`+ pub fn native_packing(&self) -> usize {`
	`141`	`+ match self {`
	`142`	`+ QuantValue::E2M1 => 2,`
	`143`	`+ _ => 1,`
	`144`	`+ }`
	`145`	`+ }`
	`146`	`+`
`132`	`147`	`/// The possible range of values allowed by the quant value.`
`133`	`148`	`pub fn range(&self) -> (f32, f32) {`
`134`	`149`	`match self {`
`@@ -138,8 +153,8 @@ impl QuantValue {`
`138`	`153`	`QuantValue::Q8S => (-i8::MAX as f32, i8::MAX as f32),`
`139`	`154`	`QuantValue::Q4S => (-7.0, 7.0),`
`140`	`155`	`QuantValue::Q2S => (-1.0, 1.0),`
`141`		`- QuantValue::E4M3 => (e4m3::MIN as f32, e4m3::MAX as f32),`
`142`		`- QuantValue::E5M2 => (e5m2::MIN as f32, e5m2::MAX as f32),`
	`156`	`+ QuantValue::E4M3 => (-448.0, 448.0),`
	`157`	`+ QuantValue::E5M2 => (-57344.0, 57344.0),`
`143`	`158`	`QuantValue::E2M1 => (-6.0, 6.0), // Hardcoded because of no-std`
`144`	`159`	`}`
`145`	`160`	`}`
`@@ -253,10 +268,12 @@ impl BlockSize {`
`253`	`268`	`out`
`254`	`269`	`}`
`255`	`270`
	`271`	`+ /// Create an iterator over all stored dimensions`
`256`	`272`	`pub fn iter(&self) -> impl Iterator<Item = &u8> {`
`257`	`273`	`self.as_slice().iter()`
`258`	`274`	`}`
`259`	`275`
	`276`	`+ /// Returns the total number of elements in each block`
`260`	`277`	`pub fn num_elements(&self) -> usize {`
`261`	`278`	`self.iter().map(\|it\| *it as usize).product()`
`262`	`279`	`}`