Skip to content

Commit b3eb8fe

Browse files
authored
fix: Quant matmul line sizes (#978)
1 parent f1e0f12 commit b3eb8fe

File tree

30 files changed: +143 additions, −69 deletions

crates/cubecl-attention/src/base.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ use crate::{
1414

1515
use crate::components::batch::BatchAttentionConfig;
1616
use crate::components::batch::BatchAttentionFamily;
17-
use cubecl_core::frontend::CubePrimitive;
1817

1918
pub enum Strategy {
2019
/// Temporary implementation
@@ -66,9 +65,9 @@ pub fn launch_tmp<R: Runtime, AP: AttentionPrecision>(
6665
out: &TensorHandleRef<R>,
6766
) -> Result<(), AttentionSetupError> {
6867
let line_sizes = AvailableLineSizes::from_elem_types::<R>(
69-
&QG::<AP>::as_type_native_unchecked(),
70-
&MSK::<AP>::as_type_native_unchecked(),
71-
&OG::<AP>::as_type_native_unchecked(),
68+
query.elem_size,
69+
size_of::<MSK<AP>>(),
70+
out.elem_size,
7271
);
7372
let line_sizes = DummyRegisterAlgorithm::filter_line_sizes(line_sizes)
7473
.filter_with_tensor(AttentionIdent::Query, query.strides, query.shape)

crates/cubecl-attention/src/components/line_size.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::fmt::Debug;
22

3-
use cubecl_core::{LineSizeError, Runtime, ir::StorageType, tensor_line_size_parallel};
3+
use cubecl_core::{LineSizeError, Runtime, tensor_line_size_parallel};
44

55
use crate::components::{AttentionIdent, AttentionSetupError};
66

@@ -29,11 +29,7 @@ pub struct AvailableLineSizes {
2929
}
3030

3131
impl AvailableLineSizes {
32-
pub fn from_elem_types<R: Runtime>(
33-
elem_in: &StorageType,
34-
elem_mask: &StorageType,
35-
elem_out: &StorageType,
36-
) -> Self {
32+
pub fn from_elem_types<R: Runtime>(elem_in: usize, elem_mask: usize, elem_out: usize) -> Self {
3733
let in_available: Vec<u8> = R::io_optimized_line_sizes_unchecked(elem_in).collect();
3834
let mask_available: Vec<u8> = R::io_optimized_line_sizes_unchecked(elem_mask).collect();
3935
let out_available = R::io_optimized_line_sizes_unchecked(elem_out).collect();

crates/cubecl-attention/src/tests/attention_test_launcher.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@ pub fn test_attention_algorithm<A, P, R>(
5858
let out = tensor_raw_parts_output::<P, R>(&client, &problem);
5959

6060
let line_sizes = AvailableLineSizes::from_elem_types::<R>(
61-
&P::EG::as_type_native_unchecked(),
62-
&P::EM::as_type_native_unchecked(),
63-
&P::EG::as_type_native_unchecked(),
61+
size_of::<P::EG>(),
62+
size_of::<P::EM>(),
63+
size_of::<P::EG>(),
6464
);
6565
let line_sizes = A::filter_line_sizes(line_sizes);
6666
let line_sizes = line_sizes

crates/cubecl-convolution/src/components/stage/reader.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
use cubecl::prelude::*;
22
use cubecl_core as cubecl;
33
use cubecl_matmul::components::{
4-
MatrixLayout, StageIdent,
5-
stage::{StageMemoryConfig, StridedStage, TilingLayout},
4+
InvalidConfigError, MatrixLayout, StageIdent,
5+
global::memory::GlobalMemoryConfig,
6+
stage::{StageMemoryConfig, StridedStage, TilingLayout, TilingValidation},
67
tile::StridedTile,
78
};
89
use cubecl_std::tensor::layout::Coords2d;
@@ -39,3 +40,13 @@ impl TilingLayout for BiasTilingLayout {
3940
)
4041
}
4142
}
43+
44+
impl TilingValidation for BiasTilingLayout {
45+
fn check(config: GlobalMemoryConfig) -> Result<(), InvalidConfigError> {
46+
let stage_width = config.elements_in_stage_col;
47+
if config.global_line_size > stage_width {
48+
return Err(Box::new("Invalid line size"));
49+
}
50+
Ok(())
51+
}
52+
}

crates/cubecl-convolution/src/launch.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,10 +181,10 @@ where
181181
Input<Alg, MP>: ConcreteInputsFactory,
182182
Output<Alg, MP>: ConcreteOutputFactory,
183183
{
184-
let line_sizes = AvailableLineSizes::from_types::<R>(
185-
&LhsG::<MP>::as_type_native_unchecked(),
186-
&RhsG::<MP>::as_type_native_unchecked(),
187-
&AccG::<MP>::as_type_native_unchecked(),
184+
let line_sizes = AvailableLineSizes::from_type_sizes::<R>(
185+
input.data().elem_size,
186+
weight.data().elem_size,
187+
out.elem_size,
188188
)
189189
.filter_lhs_with_tensor(input.data().strides, input.data().shape, problem.lhs_layout)
190190
.filter_rhs_with_tensor(

crates/cubecl-convolution/src/tests/convolution_test_launcher.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ pub fn test_convolution_algorithm<A, Args, P, R>(
5656
let line_sizes = AvailableLineSizes {
5757
lhs: vec![1],
5858
rhs: vec![1],
59-
out: R::io_optimized_line_sizes_unchecked(&P::EG::as_type_native_unchecked()).collect(),
59+
out: R::io_optimized_line_sizes_unchecked(size_of::<P::EG>()).collect(),
6060
}
6161
.filter_lhs_with_tensor(&lhs.strides, &lhs.shape, problem.lhs_layout)
6262
.filter_rhs_with_tensor(&rhs.strides, &rhs.shape, problem.rhs_layout)

crates/cubecl-core/src/frontend/element/cube_elem.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,11 @@ pub trait CubePrimitive:
4545
Self::as_type_native().map(|t| t.size_bits())
4646
}
4747

48+
/// Only native element types have a size.
49+
fn size_bits_unchecked() -> usize {
50+
Self::as_type_native_unchecked().size_bits()
51+
}
52+
4853
fn from_expand_elem(elem: ExpandElement) -> Self::ExpandType {
4954
ExpandElementTyped::new(elem)
5055
}

crates/cubecl-core/src/runtime.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,9 @@ pub trait Runtime: Send + Sync + 'static + core::fmt::Debug {
5050
/// Returns all line sizes that are useful to perform optimal IO operation on the given element.
5151
/// Ignores native support, and allows all line sizes. This means the returned size may be
5252
/// unrolled, and may not support dynamic indexing.
53-
fn io_optimized_line_sizes_unchecked(elem: &StorageType) -> impl Iterator<Item = u8> + Clone {
54-
let max = LOAD_WIDTH / elem.size_bits();
53+
fn io_optimized_line_sizes_unchecked(size: usize) -> impl Iterator<Item = u8> + Clone {
54+
let size_bits = size * 8;
55+
let max = LOAD_WIDTH / size_bits;
5556
let max = usize::min(Self::max_global_line_size() as usize, max);
5657

5758
// If the max is 8, we want to test 1, 2, 4, 8 which is log2(8) + 1.

crates/cubecl-cpu/src/runtime.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,9 @@ impl Runtime for CpuRuntime {
111111
supported.iter().filter(move |v| **v <= max).cloned()
112112
}
113113

114-
fn io_optimized_line_sizes_unchecked(elem: &StorageType) -> impl Iterator<Item = u8> + Clone {
115-
let max = LOAD_WIDTH / elem.size_bits();
114+
fn io_optimized_line_sizes_unchecked(elem_size: usize) -> impl Iterator<Item = u8> + Clone {
115+
let elem_size_bits = elem_size * 8;
116+
let max = LOAD_WIDTH / elem_size_bits;
116117
(1..max as u8).rev().filter(|v| v.is_power_of_two())
117118
}
118119

crates/cubecl-matmul/src/components/global/args.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,10 +157,8 @@ impl<Lhs: Numeric, Rhs: Numeric, Acc: Numeric> ConcreteInputsFactory
157157
);
158158
let data_view =
159159
ViewArg::new::<BatchedGlobalLayout>(data.as_array_arg(line_size), data_layout);
160-
let scales_view = ViewArg::new::<BatchedGlobalScaleLayout>(
161-
scale.as_array_arg(line_size),
162-
scales_layout,
163-
);
160+
let scales_view =
161+
ViewArg::new::<BatchedGlobalScaleLayout>(scale.as_array_arg(1), scales_layout);
164162
ViewArg::new_quantized(data_view, scales_view, **scheme)
165163
}
166164
};

Comments (0)