fix: Align shift right behaviour with most other compilers (#894)

wingertge · web-flow · commit 74f28d96db18 · 2025-09-19T15:56:42.000-04:00
diff --git a/crates/cubecl-attention/src/components/tile/dummy/flash_matmul/accelerated/setup.rs b/crates/cubecl-attention/src/components/tile/dummy/flash_matmul/accelerated/setup.rs
@@ -33,7 +33,7 @@ impl FlashMatmulFamily for AcceleratedFlashMatmul {
             1,
             line_sizes.query as u32,
             line_sizes.key as u32,
-            problem.seq_kv as u32 % selection.attention_tile_size.seq_kv != 0,
+            !(problem.seq_kv as u32).is_multiple_of(selection.attention_tile_size.seq_kv),
         )
     }
 }
diff --git a/crates/cubecl-attention/src/components/tile/dummy/flash_matmul/dummy_register/setup.rs b/crates/cubecl-attention/src/components/tile/dummy/flash_matmul/dummy_register/setup.rs
@@ -35,7 +35,7 @@ impl FlashMatmulFamily for DummyRegisterFlashMatmul {
             1,
             line_sizes.query as u32,
             line_sizes.key as u32,
-            problem.seq_kv as u32 % selection.attention_tile_size.seq_kv != 0,
+            !(problem.seq_kv as u32).is_multiple_of(selection.attention_tile_size.seq_kv),
         )
     }
 }
diff --git a/crates/cubecl-attention/src/components/tile/dummy/writer.rs b/crates/cubecl-attention/src/components/tile/dummy/writer.rs
@@ -48,7 +48,7 @@ impl<EO: Numeric> DummyWriter<EO> {
 
         let unit_step = config.plane_dim() * output_line_size;
         let num_unit_writes = comptime!(div_ceil(tile_size, unit_step));
-        let balanced_workload = comptime!(tile_size % unit_step == 0);
+        let balanced_workload = comptime!(tile_size.is_multiple_of(unit_step));
 
         #[unroll(num_unit_writes == 1)]
         for i in 0..num_unit_writes {
diff --git a/crates/cubecl-core/src/runtime_tests/cmma.rs b/crates/cubecl-core/src/runtime_tests/cmma.rs
@@ -597,8 +597,8 @@ pub fn test_simple_tf32<R: Runtime>(
         return;
     }
 
-    let lhs: Vec<f32> = (0..128).map(|i| (i as f32)).collect();
-    let rhs: Vec<f32> = (0..128).map(|i| ((i % 8) as f32)).collect();
+    let lhs: Vec<f32> = (0..128).map(|i| i as f32).collect();
+    let rhs: Vec<f32> = (0..128).map(|i| (i % 8) as f32).collect();
 
     let lhs = client.create(f32::as_bytes(&lhs));
     let rhs = client.create(f32::as_bytes(&rhs));
diff --git a/crates/cubecl-cpp/src/shared/item.rs b/crates/cubecl-cpp/src/shared/item.rs
@@ -51,7 +51,7 @@ impl<D: Dialect> Item<D> {
     }
 
     pub fn optimized(&self) -> Item<D> {
-        if !self.can_be_optimized() || self.vectorization % 2 != 0 {
+        if !self.can_be_optimized() || !self.vectorization.is_multiple_of(2) {
             return *self;
         }
 
diff --git a/crates/cubecl-cuda/src/compute/server.rs b/crates/cubecl-cuda/src/compute/server.rs
@@ -44,7 +44,6 @@ use cudarc::driver::sys::{
 use cudarc::driver::sys::{CUfunc_st, CUtensorMapInterleave};
 #[cfg(feature = "cuda-12080")]
 use cudarc::driver::sys::{CUtensorMapIm2ColWideMode, cuTensorMapEncodeIm2colWide};
-use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::ffi::c_char;
 use std::path::PathBuf;
@@ -78,7 +77,8 @@ pub(crate) struct CudaContext {
     compilation_options: CompilationOptions,
 }
 
-#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)]
+#[cfg(feature = "compilation-cache")]
+#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Eq, Clone)]
 pub struct PtxCacheEntry {
     entrypoint_name: String,
     cube_dim: (u32, u32, u32),
@@ -336,7 +336,7 @@ impl ComputeServer for CudaServer {
                     .expect("Failed to find resource");
                 let device_ptr = resource.ptr as *mut c_void;
                 debug_assert!(
-                    device_ptr as usize % 16 == 0,
+                    (device_ptr as usize).is_multiple_of(16),
                     "Tensor pointer must be 16 byte aligned"
                 );
                 let mut map_ptr = MaybeUninit::zeroed();
diff --git a/crates/cubecl-matmul/src/components/batch/partitioned_matmul/hypercube/base.rs b/crates/cubecl-matmul/src/components/batch/partitioned_matmul/hypercube/base.rs
@@ -76,13 +76,13 @@ impl HypercubeConfig {
         match self.global_order {
             RowMajor | ColMajor => Ok(()),
 
-            SwizzleRowMajor(w) if m_cubes % w != 0 => {
+            SwizzleRowMajor(w) if !m_cubes.is_multiple_of(w) => {
                 Err(MatmulSetupError::InvalidConfig(Box::new(format!(
                     "In swizzle row major, number of cubes in m {m_cubes:?} must be divisible by swizzle step length {w:?}."
                 ))))
             }
 
-            SwizzleColMajor(w) if n_cubes % w != 0 => {
+            SwizzleColMajor(w) if !n_cubes.is_multiple_of(w) => {
                 Err(MatmulSetupError::InvalidConfig(Box::new(format!(
                     "In swizzle col major, number of cubes in n {n_cubes:?} must be divisible by swizzle step length {w:?}."
                 ))))
diff --git a/crates/cubecl-matmul/src/components/batch/partitioned_matmul/hypercube/sm_allocation.rs b/crates/cubecl-matmul/src/components/batch/partitioned_matmul/hypercube/sm_allocation.rs
@@ -51,7 +51,7 @@ impl SmAllocation {
                     let mut i = 1;
 
                     while i * i <= n {
-                        if n % i == 0 {
+                        if n.is_multiple_of(i) {
                             divs.push(i);
                             if i != n / i {
                                 divs.push(n / i);
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/async_full_cyclic.rs b/crates/cubecl-matmul/src/components/global/load/strategy/async_full_cyclic.rs
@@ -26,7 +26,7 @@ impl<T: TilingOrder> LoadingValidation for AsyncFullCyclicLoading<T> {
         let num_slices = config.tiling_scheme().elements_in_tile_row(ident)
             * config.tiling_scheme().tiles_in_stage(ident);
 
-        if num_slices >= total_units && num_slices % total_units != 0 {
+        if num_slices >= total_units && !num_slices.is_multiple_of(total_units) {
             return Err(Box::new(format!(
                 "Number of units ({total_units:?}) must divide number of slices ({num_slices:?}). Would require units doing different numbers of slices"
             )));
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/async_full_maximize_slice_length.rs b/crates/cubecl-matmul/src/components/global/load/strategy/async_full_maximize_slice_length.rs
@@ -83,7 +83,7 @@ impl<IP: InputPrecision> AsyncLoadingJob<IP, StridedTilingLayout>
         let nth_slice = this.unit_count * task_id + UNIT_POS;
 
         #[allow(clippy::collapsible_else_if)]
-        if comptime!(this.num_slices % this.unit_count == 0) {
+        if comptime!(this.num_slices.is_multiple_of(this.unit_count)) {
             load_nth_slice::<IP::Global, IP::Stage, CM, G>(
                 nth_slice,
                 tensor_reader,
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/async_full_maximize_unit_count.rs b/crates/cubecl-matmul/src/components/global/load/strategy/async_full_maximize_unit_count.rs
@@ -34,7 +34,7 @@ impl LoadingValidation for AsyncFullMaximizeUnitCountLoading {
         };
         let unit_count = config.plane_dim() * config.num_loading_planes(ident);
 
-        if unit_count % num_slices != 0 {
+        if !unit_count.is_multiple_of(num_slices) {
             return Err(Box::new(
                 "Number of slices must divide number of units evenly",
             ));
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/async_partial_maximize_slice_length.rs b/crates/cubecl-matmul/src/components/global/load/strategy/async_partial_maximize_slice_length.rs
@@ -147,7 +147,7 @@ impl<IP: InputPrecision> AsyncLoadingJob<IP, StridedTilingLayout>
         let mut dest = destination.slice_mut(start, end);
 
         #[allow(clippy::collapsible_else_if)]
-        if comptime!(this.num_slices % this.unit_count == 0) {
+        if comptime!(this.num_slices.is_multiple_of(this.unit_count)) {
             CM::memcpy_async(mechanism, &src.try_cast_unchecked(), &mut dest);
         } else {
             if nth_slice_in_stage < this.num_slices {
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/sync_full_cyclic.rs b/crates/cubecl-matmul/src/components/global/load/strategy/sync_full_cyclic.rs
@@ -28,7 +28,7 @@ impl<TO: TilingOrder> LoadingValidation for SyncFullCyclicLoading<TO> {
             let num_stage_lines = config.tiling_scheme().elements_in_stage(ident) / line_size;
             let total_units = config.num_loading_planes(ident) * config.plane_dim();
 
-            if num_stage_lines % total_units != 0 {
+            if !num_stage_lines.is_multiple_of(total_units) {
                 return Err(Box::new(
                 "Too many data will be loaded, resulting in out of bounds.
         Try setting line size and number of planes so that total unit count {:?} divides number of lines in stage.",
@@ -68,7 +68,7 @@ impl<TO: TilingOrder> SyncFullLoadingStrategy for SyncFullCyclicLoading<TO> {
         let num_stage_lines = num_stage_elements.div_ceil(line_size);
         let total_units = comptime!(config.num_loading_planes(ident) * config.plane_dim());
         let num_tasks_per_unit = comptime!(num_stage_lines.div_ceil(total_units));
-        let balanced_workload = comptime!(num_stage_lines % total_units == 0);
+        let balanced_workload = comptime!(num_stage_lines.is_multiple_of(total_units));
         let jump_length = comptime!(total_units * line_size);
 
         let unit_id = RoleRule::new(config.role_rule_config())
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/sync_full_ordered.rs b/crates/cubecl-matmul/src/components/global/load/strategy/sync_full_ordered.rs
@@ -34,7 +34,7 @@ impl LoadingValidation for SyncFullOrderedLoading {
         let num_planes = config.num_loading_planes(ident);
         let num_tiles = config.tiling_scheme().tiles_in_stage(ident);
 
-        if num_tiles % num_planes != 0 {
+        if !num_tiles.is_multiple_of(num_planes) {
             return Err(FormattedConfigError::new(move || {
                 format!(
                     "Number of planes {num_planes:?} must divide number of tiles {num_tiles:?} for ordered loading.",
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/sync_full_strided.rs b/crates/cubecl-matmul/src/components/global/load/strategy/sync_full_strided.rs
@@ -22,7 +22,7 @@ impl LoadingValidation for SyncFullStridedLoading {
         let num_stage_lines = config.tiling_scheme().elements_in_stage(ident) / line_size;
         let total_units = config.num_loading_planes(ident) * config.plane_dim();
 
-        if num_stage_lines % total_units != 0 {
+        if !num_stage_lines.is_multiple_of(total_units) {
             return Err(Box::new(
                 "Too many data will be loaded, resulting in out of bounds.
         Try setting line size and number of planes so that total unit count {:?} divides number of lines in stage.",
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/sync_full_tilewise.rs b/crates/cubecl-matmul/src/components/global/load/strategy/sync_full_tilewise.rs
@@ -46,7 +46,7 @@ impl<T: TilingOrder> LoadingValidation for SyncFullTilewiseLoading<T> {
         let num_planes = config.num_loading_planes(ident);
         let num_tiles = config.tiling_scheme().tiles_in_stage(ident);
 
-        if num_tiles % num_planes != 0 {
+        if !num_tiles.is_multiple_of(num_planes) {
             return Err(FormattedConfigError::new(move || {
                 format!(
                     "Number of planes {num_planes:?} must divide number of tiles {num_tiles:?} for tilewise loading.",
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/sync_partial_cyclic.rs b/crates/cubecl-matmul/src/components/global/load/strategy/sync_partial_cyclic.rs
@@ -84,7 +84,7 @@ impl<TO: TilingOrder> SyncPartialLoadingStrategy for SyncPartialCyclicLoading<TO
 
         let num_tiles_in_stage = tile_count_row * tile_count_col;
         let total_num_lines = num_tiles_in_stage * num_lines_per_tile;
-        let balanced_workload = total_num_lines % total_units == 0;
+        let balanced_workload = total_num_lines.is_multiple_of(total_units);
         let num_tasks_per_unit = total_num_lines.div_ceil(total_units);
         let jump_length = total_units * line_size;
 
diff --git a/crates/cubecl-matmul/src/components/global/load/strategy/sync_partial_tilewise.rs b/crates/cubecl-matmul/src/components/global/load/strategy/sync_partial_tilewise.rs
@@ -46,7 +46,7 @@ impl<T: TilingOrder> LoadingValidation for SyncPartialTilewiseLoading<T> {
         let num_planes = config.num_loading_planes(ident);
         let num_tiles = config.tiling_scheme().tiles_in_stage(ident);
 
-        if num_tiles % num_planes != 0 {
+        if !num_tiles.is_multiple_of(num_planes) {
             return Err(FormattedConfigError::new(move || {
                 "Number of planes {num_planes:?} must divide number of tiles {num_tiles:?} for tilewise loading.".to_string()
             }));
diff --git a/crates/cubecl-matmul/src/components/global/multi_stage/double_buffering/setup.rs b/crates/cubecl-matmul/src/components/global/multi_stage/double_buffering/setup.rs
@@ -79,9 +79,9 @@ where
             client,
             stage_config,
             num_planes,
-            problem.m as u32 % stage_shape_m != 0,
-            problem.n as u32 % stage_shape_n != 0,
-            problem.k as u32 % (2 * stage_shape_k) != 0,
+            !(problem.m as u32).is_multiple_of(stage_shape_m),
+            !(problem.n as u32).is_multiple_of(stage_shape_n),
+            !(problem.k as u32).is_multiple_of(2 * stage_shape_k),
             selection.loading_precompute_strategy,
             selection.loader_mode,
             selection.load_specialization_config.into(),
diff --git a/crates/cubecl-matmul/src/components/global/multi_stage/ordered/setup.rs b/crates/cubecl-matmul/src/components/global/multi_stage/ordered/setup.rs
@@ -81,9 +81,9 @@ where
             client,
             stage_config,
             num_planes,
-            problem.m as u32 % stage_shape_m != 0,
-            problem.n as u32 % stage_shape_n != 0,
-            problem.k as u32 % (2 * stage_shape_k) != 0,
+            !(problem.m as u32).is_multiple_of(stage_shape_m),
+            !(problem.n as u32).is_multiple_of(stage_shape_n),
+            !(problem.k as u32).is_multiple_of(2 * stage_shape_k),
             selection.loading_precompute_strategy,
             selection.loader_mode,
             selection.load_specialization_config.into(),
diff --git a/crates/cubecl-matmul/src/components/global/single_stage/barrier/setup.rs b/crates/cubecl-matmul/src/components/global/single_stage/barrier/setup.rs
@@ -78,9 +78,9 @@ where
             client,
             stage_config,
             num_planes,
-            problem.m as u32 % stage_shape_m != 0,
-            problem.n as u32 % stage_shape_n != 0,
-            problem.k as u32 % stage_shape_k != 0,
+            !(problem.m as u32).is_multiple_of(stage_shape_m),
+            !(problem.n as u32).is_multiple_of(stage_shape_n),
+            !(problem.k as u32).is_multiple_of(stage_shape_k),
             stage_shape_k,
             selection.loading_precompute_strategy,
             selection.loader_mode,
diff --git a/crates/cubecl-matmul/src/components/global/single_stage/simple/setup.rs b/crates/cubecl-matmul/src/components/global/single_stage/simple/setup.rs
@@ -79,9 +79,9 @@ where
             client,
             stage_config,
             num_planes,
-            problem.m as u32 % stage_shape_m != 0,
-            problem.n as u32 % stage_shape_n != 0,
-            problem.k as u32 % stage_shape_k != 0,
+            !(problem.m as u32).is_multiple_of(stage_shape_m),
+            !(problem.n as u32).is_multiple_of(stage_shape_n),
+            !(problem.k as u32).is_multiple_of(stage_shape_k),
             stage_shape_k,
             selection.loading_precompute_strategy,
             selection.loader_mode,
diff --git a/crates/cubecl-matmul/src/components/global/single_stage/tma/setup.rs b/crates/cubecl-matmul/src/components/global/single_stage/tma/setup.rs
@@ -73,9 +73,9 @@ where
             client,
             stage_config,
             num_planes,
-            problem.m as u32 % stage_shape_m != 0,
-            problem.n as u32 % stage_shape_n != 0,
-            problem.k as u32 % stage_shape_k != 0,
+            !(problem.m as u32).is_multiple_of(stage_shape_m),
+            !(problem.n as u32).is_multiple_of(stage_shape_n),
+            !(problem.k as u32).is_multiple_of(stage_shape_k),
             stage_shape_k,
             selection.loading_precompute_strategy,
             selection.loader_mode,
diff --git a/crates/cubecl-matmul/src/components/global/specialization/config.rs b/crates/cubecl-matmul/src/components/global/specialization/config.rs
@@ -78,7 +78,7 @@ fn best_divisor_close_to_reference(dividible_value: u32, reference: u32) -> u32
     let mut best_dist = reference.abs_diff(1);
 
     for d in 1..=dividible_value {
-        if dividible_value % d == 0 {
+        if dividible_value.is_multiple_of(d) {
             let dist = reference.abs_diff(d);
             if dist < best_dist || (dist == best_dist && d > best) {
                 best = d;
diff --git a/crates/cubecl-matmul/src/components/global/write/plane.rs b/crates/cubecl-matmul/src/components/global/write/plane.rs
@@ -50,7 +50,7 @@ impl<EG: Numeric> StageUnloader<EG> for PlaneWriter<EG> {
 
         let unit_step = config.plane_dim() * output_line_size;
         let num_unit_writes = comptime!(div_ceil(tile_size, unit_step));
-        let balanced_workload = comptime!(tile_size % unit_step == 0);
+        let balanced_workload = comptime!(tile_size.is_multiple_of(unit_step));
 
         #[unroll(num_unit_writes == 1)]
         for i in 0..num_unit_writes {
@@ -100,7 +100,8 @@ fn write_line<EG: Numeric>(
     let value = if comptime!(output_line_size == out_smem_line_size) {
         out_smem_slice[unit_write / output_line_size]
     } else if comptime!(
-        out_smem_line_size < output_line_size && output_line_size % out_smem_line_size == 0
+        out_smem_line_size < output_line_size
+            && output_line_size.is_multiple_of(out_smem_line_size)
     ) {
         let mut value = Line::empty(output_line_size);
         #[unroll]
diff --git a/crates/cubecl-matmul/src/components/tile/plane_vec_mat_inner_product/config.rs b/crates/cubecl-matmul/src/components/tile/plane_vec_mat_inner_product/config.rs
@@ -140,7 +140,7 @@ impl PlaneVecMatInnerProductConfig {
             ))));
         }
 
-        if n % out_line != 0 {
+        if !n.is_multiple_of(out_line) {
             return Err(MatmulSetupError::InvalidConfig(Box::new(format!(
                 "n must be divisible by out line size, got n={n:?}, out_line_size={out_line:?}",
             ))));
diff --git a/crates/cubecl-matmul/src/components/tile/register/config.rs b/crates/cubecl-matmul/src/components/tile/register/config.rs
@@ -120,14 +120,14 @@ impl RegisterConfig {
 
         match self.matrix_layout(StageIdent::Lhs) {
             MatrixLayout::RowMajor => {
-                if k % lhs != 0 {
+                if !k.is_multiple_of(lhs) {
                     return Err(MatmulSetupError::InvalidConfig(Box::new(format!(
                         "Tile shape in lined axis {k:?} should be divisible by line size {lhs:?}"
                     ))));
                 }
             }
             MatrixLayout::ColMajor => {
-                if m % lhs != 0 {
+                if !m.is_multiple_of(lhs) {
                     return Err(MatmulSetupError::InvalidConfig(Box::new(format!(
                         "Tile shape in lined axis {m:?} should be divisible by line size {lhs:?}"
                     ))));
@@ -136,22 +136,22 @@ impl RegisterConfig {
         }
         match self.matrix_layout(StageIdent::Rhs) {
             MatrixLayout::RowMajor => {
-                if n % rhs != 0 {
+                if !n.is_multiple_of(rhs) {
                     return Err(MatmulSetupError::InvalidConfig(Box::new(format!(
                         "Tile shape in lined axis {n:?} should be divisible by line size {rhs:?}"
                     ))));
                 }
             }
             MatrixLayout::ColMajor => {
-                if k % rhs != 0 {
+                if !k.is_multiple_of(rhs) {
                     return Err(MatmulSetupError::InvalidConfig(Box::new(format!(
                         "Tile shape in lined axis {k:?} should be divisible by line size {rhs:?}"
                     ))));
                 }
             }
         }
 
-        if n % out != 0 {
+        if !n.is_multiple_of(out) {
             return Err(MatmulSetupError::InvalidConfig(Box::new(format!(
                 "Tile shape in lined axis {n:?} should be divisible by line size {out:?}"
             ))));
diff --git a/crates/cubecl-matmul/src/kernels/layered/selector/unit.rs b/crates/cubecl-matmul/src/kernels/layered/selector/unit.rs
@@ -384,7 +384,7 @@ fn selection(
 pub fn closest_factor_pair(n: u32) -> (u32, u32) {
     let sqrt_n = (n as f64).sqrt() as u32;
     for a in (1..=sqrt_n).rev() {
-        if n % a == 0 {
+        if n.is_multiple_of(a) {
             return (n / a, a);
         }
     }
diff --git a/crates/cubecl-matmul/src/tune_key.rs b/crates/cubecl-matmul/src/tune_key.rs
@@ -136,7 +136,7 @@ impl MatmulAutotuneKey {
 /// Defines the potential vectorization.
 fn pow2_factor(axis: usize) -> u8 {
     for i in (1..4).rev() {
-        if axis % 2usize.pow(i as u32) == 0 {
+        if axis.is_multiple_of(2usize.pow(i as u32)) {
             return i;
         }
     }
diff --git a/crates/cubecl-quant/src/dequantize.rs b/crates/cubecl-quant/src/dequantize.rs
@@ -247,7 +247,7 @@ fn dequantize_packed<R: Runtime, F: Float, FS: Float>(
     let line_size_out = num_quants;
     let rank = output.shape.len();
 
-    if output.shape[rank - 1] % line_size_out as usize != 0 {
+    if !output.shape[rank - 1].is_multiple_of(line_size_out as usize) {
         line_size_in = 1;
     }
 
diff --git a/crates/cubecl-quant/src/lib.rs b/crates/cubecl-quant/src/lib.rs
@@ -1,4 +1,6 @@
 #![cfg_attr(not(feature = "std"), no_std)]
+#![allow(unknown_lints)] // `manual_div_ceil` only appeared in 1.83
+#![allow(clippy::manual_div_ceil, clippy::manual_is_multiple_of)]
 
 extern crate alloc;
 
diff --git a/crates/cubecl-reduce/src/config.rs b/crates/cubecl-reduce/src/config.rs
@@ -160,7 +160,7 @@ impl ReduceConfig {
                     && output.strides[rank - 1] == 1;
             let shape = output.shape.get(axis + 1).cloned().unwrap_or(1) as u32;
 
-            if is_contiguous && shape % self.line_size_input == 0 {
+            if is_contiguous && shape.is_multiple_of(self.line_size_input) {
                 self.line_size_output = self.line_size_input;
             }
         }
diff --git a/crates/cubecl-reduce/src/tune_key.rs b/crates/cubecl-reduce/src/tune_key.rs
@@ -57,7 +57,9 @@ impl ReduceAutotuneKey {
         let mut potential_line_size = 1;
         let max_bytes_in_line = 16; // 128 bits
         //
-        while shape % 2 == 0 && potential_line_size as usize * elem_size < max_bytes_in_line {
+        while shape.is_multiple_of(2)
+            && potential_line_size as usize * elem_size < max_bytes_in_line
+        {
             potential_line_size *= 2;
             shape /= 2;
         }
diff --git a/crates/cubecl-runtime/src/config/logger.rs b/crates/cubecl-runtime/src/config/logger.rs
diff --git a/crates/cubecl-runtime/src/memory_management/memory_pool/sliced_pool.rs b/crates/cubecl-runtime/src/memory_management/memory_pool/sliced_pool.rs
diff --git a/crates/cubecl-spirv/src/bitwise.rs b/crates/cubecl-spirv/src/bitwise.rs
diff --git a/crates/cubecl-std/src/lib.rs b/crates/cubecl-std/src/lib.rs
diff --git a/crates/cubecl-std/src/reinterpret_slice.rs b/crates/cubecl-std/src/reinterpret_slice.rs
diff --git a/crates/cubecl-std/src/tensor/contiguous.rs b/crates/cubecl-std/src/tensor/contiguous.rs

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@ impl FlashMatmulFamily for AcceleratedFlashMatmul {`
`33`	`33`	`1,`
`34`	`34`	`line_sizes.query as u32,`
`35`	`35`	`line_sizes.key as u32,`
`36`		`- problem.seq_kv as u32 % selection.attention_tile_size.seq_kv != 0,`
	`36`	`+ !(problem.seq_kv as u32).is_multiple_of(selection.attention_tile_size.seq_kv),`
`37`	`37`	`)`
`38`	`38`	`}`
`39`	`39`	`}`
Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@ impl FlashMatmulFamily for DummyRegisterFlashMatmul {`
`35`	`35`	`1,`
`36`	`36`	`line_sizes.query as u32,`
`37`	`37`	`line_sizes.key as u32,`
`38`		`- problem.seq_kv as u32 % selection.attention_tile_size.seq_kv != 0,`
	`38`	`+ !(problem.seq_kv as u32).is_multiple_of(selection.attention_tile_size.seq_kv),`
`39`	`39`	`)`
`40`	`40`	`}`
`41`	`41`	`}`
Original file line number	Diff line number	Diff line change
`@@ -597,8 +597,8 @@ pub fn test_simple_tf32<R: Runtime>(`
`597`	`597`	`return;`
`598`	`598`	`}`
`599`	`599`
`600`		`- let lhs: Vec<f32> = (0..128).map(\|i\| (i as f32)).collect();`
`601`		`- let rhs: Vec<f32> = (0..128).map(\|i\| ((i % 8) as f32)).collect();`
	`600`	`+ let lhs: Vec<f32> = (0..128).map(\|i\| i as f32).collect();`
	`601`	`+ let rhs: Vec<f32> = (0..128).map(\|i\| (i % 8) as f32).collect();`
`602`	`602`
`603`	`603`	`let lhs = client.create(f32::as_bytes(&lhs));`
`604`	`604`	`let rhs = client.create(f32::as_bytes(&rhs));`
Original file line number	Diff line number	Diff line change
`@@ -51,7 +51,7 @@ impl<D: Dialect> Item<D> {`
`51`	`51`	`}`
`52`	`52`
`53`	`53`	`pub fn optimized(&self) -> Item<D> {`
`54`		`- if !self.can_be_optimized() \|\| self.vectorization % 2 != 0 {`
	`54`	`+ if !self.can_be_optimized() \|\| !self.vectorization.is_multiple_of(2) {`
`55`	`55`	`return *self;`
`56`	`56`	`}`
`57`	`57`