Fix/perf line size (#953)

nathanielsimard · web-flow · commit caacb7b7809a · 2025-10-09T15:36:40.000-04:00
diff --git a/crates/cubecl-core/src/runtime.rs b/crates/cubecl-core/src/runtime.rs
@@ -38,6 +38,11 @@ pub trait Runtime: Send + Sync + 'static + core::fmt::Debug {
     /// Returns the supported line sizes for the current runtime's compiler.
     fn supported_line_sizes() -> &'static [u8];
 
+    /// The maximum line size that can be used for global buffer bindings (`u8::MAX` by default).
+    fn max_global_line_size() -> u8 {
+        u8::MAX
+    }
+
     /// Returns all line sizes that are useful to perform optimal IO operation on the given element.
     fn io_optimized_line_sizes(elem: &StorageType) -> impl Iterator<Item = u8> + Clone {
         let max = (LOAD_WIDTH / elem.size_bits()) as u8;
@@ -50,7 +55,12 @@ pub trait Runtime: Send + Sync + 'static + core::fmt::Debug {
     /// unrolled, and may not support dynamic indexing.
     fn io_optimized_line_sizes_unchecked(elem: &StorageType) -> impl Iterator<Item = u8> + Clone {
         let max = LOAD_WIDTH / elem.size_bits();
-        (1..max as u8).rev().filter(|v| v.is_power_of_two())
+        let max = usize::min(Self::max_global_line_size() as usize, max);
+
+        // With max = 8 we test 1, 2, 4, 8, i.e. floor(log2(max)) + 1 candidates. Integer
+        // `ilog2` avoids float rounding; `.max(1)` keeps the lone candidate 1 when max is 0.
+        let num_candidates = max.max(1).ilog2() + 1;
+        (0..num_candidates).map(|i| 2u8.pow(i)).rev()
     }
 
     /// Returns the maximum cube count on each dimension that can be launched.
diff --git a/crates/cubecl-wgpu/src/runtime.rs b/crates/cubecl-wgpu/src/runtime.rs
@@ -74,6 +74,10 @@ impl Runtime for WgpuRuntime {
         }
     }
 
+    fn max_global_line_size() -> u8 {
+        4
+    }
+
     fn max_cube_count() -> (u32, u32, u32) {
         let max_dim = u16::MAX as u32;
         (max_dim, max_dim, max_dim)
diff --git a/crates/cubecl/benches/matmul.rs b/crates/cubecl/benches/matmul.rs
@@ -1,5 +1,6 @@
 use core::marker::PhantomData;
 use cubecl::prelude::*;
+use cubecl_matmul::AsyncReadingStrategy;
 use cubecl_matmul::components::batch::HypercubeSelection;
 use cubecl_matmul::components::stage::PartitionBuffering;
 use cubecl_matmul::components::{
@@ -11,17 +12,10 @@ use cubecl_matmul::kernels::layered::double_unit::DoubleUnitSelectionArgs;
 use cubecl_matmul::kernels::layered::ordered_double_buffering::OrderedSelectionArgs;
 use cubecl_matmul::kernels::layered::simple::SimpleArgs;
 use cubecl_matmul::kernels::layered::simple_unit::SimpleUnitSelectionArgs;
-use cubecl_matmul::kernels::layered::{
-    MatmulSelection, MultiRowStrategy, Selection, TileSizeSelection, closest_factor_pair,
-};
 use cubecl_matmul::kernels::layered::{Selection, TileSizeSelection};
-use cubecl_matmul::{self as matmul};
 use cubecl_matmul::{
     self as matmul, MatmulInputHandle, SyncPartialReadingStrategy, SyncReadingStrategy,
 };
-use cubecl_matmul::{self as matmul, SyncPartialReadingStrategy, SyncReadingStrategy};
-use cubecl_matmul::{AsyncReadingStrategy, components::MatmulPrecision};
-use cubecl_matmul::{SyncPartialReadingStrategy, SyncReadingStrategy};
 use std::collections::BTreeMap;
 use std::time::Duration;
 
@@ -98,8 +92,8 @@ impl<R: Runtime, MP: MatmulPrecision> Benchmark for MatmulBench<R, MP> {
             matmul_elems.rhs_global,
             matmul_elems.rhs_stage,
             matmul_elems.rhs_register,
-            matmul_elems.acc,
-            matmul_elems.out,
+            matmul_elems.acc_register,
+            matmul_elems.acc_global,
             self.strategy
         )
         .to_lowercase()
@@ -145,13 +139,13 @@ fn entry(m: usize, n: usize, k: usize) -> (usize, usize, usize, usize) {
 #[allow(dead_code)]
 fn run<R: Runtime, MP: MatmulPrecision>(device: R::Device, strategy: matmul::Strategy) {
     for tl in [false] {
-        for tr in [true] {
+        for tr in [false] {
             for (b, m, n, k) in [
                 // entry(8192, 8192, 8192),
-                // entry(6144, 6144, 6144),
+                entry(6144, 6144, 6144),
                 // entry(4096, 4096, 4096),
                 // entry(2048, 2048, 2048),
-                entry(1024, 1024, 1024),
+                // entry(1024, 1024, 1024),
                 // entry(512, 512, 512),
                 // entry(64, 1024, 64),
                 // entry(32, 1024, 32),
@@ -397,7 +391,7 @@ fn run_algos_wmma<R: Runtime, MP: MatmulPrecision>() {
 #[allow(unused)]
 fn run_benches<R: Runtime, MP: MatmulPrecision>() {
     // run_grid_search::<R, MP>();
-    run_algos_unit::<R, MP>();
+    // run_algos_unit::<R, MP>();
     run_algos_wmma::<R, MP>();
     // run_algos_vecmat::<R, MP>();
 }

Original file line number	Diff line number	Diff line change
`@@ -74,6 +74,10 @@ impl Runtime for WgpuRuntime {`
`74`	`74`	`}`
`75`	`75`	`}`
`76`	`76`
	`77`	`+ fn max_global_line_size() -> u8 {`
	`78`	`+ 4`
	`79`	`+ }`
	`80`	`+`
`77`	`81`	`fn max_cube_count() -> (u32, u32, u32) {`
`78`	`82`	`let max_dim = u16::MAX as u32;`
`79`	`83`	`(max_dim, max_dim, max_dim)`