
Commit 018341e

feat: Add broadcasting support to linear layout (#889)

1 parent fb9a730

4 files changed (+103, -15 lines)

crates/cubecl-cpp/src/shared/instruction.rs

Lines changed: 1 addition & 1 deletion
```diff
@@ -467,7 +467,7 @@ for ({i_ty} {i} = {start}; {i} {cmp} {end}; {increment}) {{
         } => {
             let out = out.fmt_left();
             match *split_meta {
-                true => writeln!(f, "{out} = static_info.x[{info_offset}];"),
+                true => writeln!(f, "{out} = {STATIC_INFO_NAME}.x[{info_offset}];"),
                 false => writeln!(f, "{out} = {INFO_NAME}[{info_offset}];"),
             }
         }
```
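This is string-based codegen: `writeln!` emits C++ source text, so the buffer name in the read must match whatever name the declaration site emitted, and the fix swaps a hardcoded `static_info` literal for the shared `STATIC_INFO_NAME` constant. A minimal sketch of the pattern, with simplified illustrative types (only the names `STATIC_INFO_NAME` and `INFO_NAME` come from the diff):

```rust
use std::fmt::Write;

// Shared constants keep every emission site in sync; a hardcoded string
// literal at one site can silently diverge if the declaration is renamed.
const STATIC_INFO_NAME: &str = "static_info";
const INFO_NAME: &str = "info";

fn emit_metadata_read(
    f: &mut String,
    out: &str,
    info_offset: u32,
    split_meta: bool,
) -> std::fmt::Result {
    match split_meta {
        true => writeln!(f, "{out} = {STATIC_INFO_NAME}.x[{info_offset}];"),
        false => writeln!(f, "{out} = {INFO_NAME}[{info_offset}];"),
    }
}

fn main() {
    let mut src = String::new();
    emit_metadata_read(&mut src, "const uint shape_0", 3, true).unwrap();
    // Prints: const uint shape_0 = static_info.x[3];
    print!("{src}");
}
```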

crates/cubecl-std/src/fast_math.rs

Lines changed: 0 additions & 14 deletions
```diff
@@ -16,10 +16,6 @@ pub enum FastDivmod {
         multiplier: u32,
         shift_right: u32,
     },
-    PowerOfTwo {
-        shift: u32,
-        mask: u32,
-    },
     Fallback {
         divisor: u32,
     },
@@ -36,13 +32,6 @@ impl<R: Runtime> FastDivmodArgs<'_, R> {
     pub fn new(client: &ComputeClient<R::Server, R::Channel>, divisor: u32) -> Self {
         debug_assert!(divisor != 0);

-        if divisor.is_power_of_two() {
-            return FastDivmodArgs::PowerOfTwo {
-                shift: ScalarArg::new(divisor.trailing_zeros()),
-                mask: ScalarArg::new(divisor - 1),
-            };
-        }
-
         if !u64::supported_uses(client).contains(TypeUsage::Arithmetic) {
             return FastDivmodArgs::Fallback {
                 divisor: ScalarArg::new(divisor),
@@ -73,7 +62,6 @@ impl FastDivmod {
                 let t = u32::mul_hi(dividend, *multiplier);
                 (t + dividend) >> shift_right
             }
-            FastDivmod::PowerOfTwo { shift, .. } => dividend >> *shift,
             FastDivmod::Fallback { divisor } => dividend / divisor,
         }
     }
@@ -82,7 +70,6 @@ impl FastDivmod {
         let q = self.div(dividend);
         match self {
             FastDivmod::Fast { divisor, .. } => dividend - q * divisor,
-            FastDivmod::PowerOfTwo { mask, .. } => dividend & mask,
             FastDivmod::Fallback { divisor } => dividend % divisor,
         }
     }
@@ -92,7 +79,6 @@ impl FastDivmod {
         let r = match self {
             FastDivmod::Fast { divisor, .. } => dividend - q * divisor,
             FastDivmod::Fallback { divisor } => dividend - q * divisor,
-            FastDivmod::PowerOfTwo { mask, .. } => dividend & *mask,
        };

         (q, r)
```
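For context on what remains after this deletion: the `Fast` variant divides by a divisor fixed at launch time using the classic multiply-high scheme, `q = (mul_hi(n, m) + n) >> s`, with `m` and `s` precomputed on the host (Granlund–Montgomery style magic numbers). The intermediate sum needs more than 32 bits, which lines up with the diff's `u64` support check guarding the fallback. A host-side Rust sketch of that scheme; `fast_divmod_params` and its exact derivation are illustrative and may differ from cubecl-std's actual host code:

```rust
/// Derive (multiplier, shift) so that floor(n / d) == (mul_hi(n, m) + n) >> s
/// for every 32-bit n. Illustrative Granlund–Montgomery-style derivation.
fn fast_divmod_params(divisor: u32) -> (u32, u32) {
    assert!(divisor > 0);
    // shift = ceil(log2(divisor))
    let shift = 32 - (divisor - 1).leading_zeros();
    // multiplier = ceil(2^(32+shift) / divisor) - 2^32, computed in u128.
    let m = ((1u128 << (32 + shift)) + divisor as u128 - 1) / divisor as u128;
    ((m - (1u128 << 32)) as u32, shift)
}

fn fast_div(dividend: u32, multiplier: u32, shift_right: u32) -> u32 {
    // mul_hi plus the dividend, widened to u64 so the sum cannot overflow;
    // this mirrors the kernel's `(t + dividend) >> shift_right`.
    let t = (dividend as u64 * multiplier as u64) >> 32;
    ((t + dividend as u64) >> shift_right) as u32
}

fn main() {
    for d in [1u32, 3, 7, 10, 24, 1 << 12, u32::MAX] {
        let (m, s) = fast_divmod_params(d);
        for n in [0u32, 1, d - 1, d, d.saturating_add(1), 12345, u32::MAX] {
            let q = fast_div(n, m, s);
            assert_eq!(q, n / d);
            assert_eq!(n - q * d, n % d); // remainder via q, as in `div_mod`
        }
    }
    // For a power-of-two divisor the derived multiplier is 0, so the fast
    // path degenerates to a plain shift anyway.
    assert_eq!(fast_divmod_params(1 << 12).0, 0);
}
```

Note that the remainder falls out as `dividend - q * divisor`, exactly as the surviving `Fast` arms compute it, and that a power-of-two divisor already reduces the fast path to a shift, which is presumably why the dedicated `PowerOfTwo` variant could be dropped.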

crates/cubecl-std/src/tensor/layout/linear.rs

Lines changed: 55 additions & 0 deletions
```diff
@@ -49,6 +49,7 @@ impl LinearLayoutExpand {
 }

 impl<'a, R: Runtime> LinearLayoutArgs<'a, R> {
+    /// Construct a linear layout from shapes, strides and line size of the tensor
     pub fn from_shape_strides(
         client: &ComputeClient<R::Server, R::Channel>,
         shape: &[usize],
@@ -69,13 +70,52 @@ impl<'a, R: Runtime> LinearLayoutArgs<'a, R> {
         }
     }

+    /// Construct a possibly broadcast linear layout from shapes/strides and a reference shape
+    pub fn from_shape_strides_with_reference(
+        client: &ComputeClient<R::Server, R::Channel>,
+        shape: &[usize],
+        reference_shape: &[usize],
+        strides: &[usize],
+        line_size: &'a u8,
+    ) -> Self {
+        if shape != reference_shape {
+            // Broadcast layouts are always treated as permuted
+            Self::Permuted(PermutedLayoutLaunch::from_shapes_strides_ref(
+                client,
+                shape,
+                reference_shape,
+                strides,
+                line_size,
+            ))
+        } else {
+            Self::from_shape_strides(client, shape, strides, line_size)
+        }
+    }
+
+    /// Construct a linear layout from a tensor handle
     pub fn from_handle(
         client: &ComputeClient<R::Server, R::Channel>,
         handle: &TensorHandleRef<'a, R>,
         line_size: &'a u8,
     ) -> Self {
         Self::from_shape_strides(client, handle.shape, handle.strides, line_size)
     }
+
+    /// Construct a possibly broadcast linear layout from a tensor handle and reference handle
+    pub fn from_handle_with_reference(
+        client: &ComputeClient<R::Server, R::Channel>,
+        handle: &TensorHandleRef<'a, R>,
+        reference: &TensorHandleRef<'a, R>,
+        line_size: &'a u8,
+    ) -> Self {
+        Self::from_shape_strides_with_reference(
+            client,
+            handle.shape,
+            reference.shape,
+            handle.strides,
+            line_size,
+        )
+    }
 }

 #[cube]
@@ -120,6 +160,21 @@ pub fn linear_view<'a, R: Runtime>(
     LinearViewLaunch::new(buffer, layout)
 }

+/// Create a possibly broadcast linear tensor view from a handle, reference handle and line size
+pub fn linear_view_with_reference<'a, R: Runtime>(
+    client: &ComputeClient<R::Server, R::Channel>,
+    handle: &'a TensorHandleRef<'a, R>,
+    reference: &'a TensorHandleRef<'a, R>,
+    line_size: &'a u8,
+) -> LinearViewLaunch<'a, R> {
+    let len = handle.shape.iter().product::<usize>();
+    let layout = LinearLayoutArgs::from_handle_with_reference(client, handle, reference, line_size);
+    let buffer = unsafe {
+        ArrayArg::from_raw_parts_and_size(handle.handle, len, *line_size, handle.elem_size)
+    };
+    LinearViewLaunch::new(buffer, layout)
+}
+
 pub fn linear_view_alias<'a, R: Runtime>(
     client: &ComputeClient<R::Server, R::Channel>,
     handle: &'a TensorHandleRef<'a, R>,
```
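The key decision sits in `from_shape_strides_with_reference`: a tensor whose shape already matches the reference keeps the ordinary linear layout, and only a shape mismatch forces the permuted, stride-based path, where broadcasting is realized by zeroing strides (see permuted.rs below). A self-contained sketch of that dispatch, with `LayoutSketch` as a simplified stand-in for the real launch types (the real `from_shape_strides` does more work, e.g. distinguishing contiguous from strided tensors):

```rust
#[derive(Debug, PartialEq)]
enum LayoutSketch {
    Linear { len: usize },
    Permuted { strides: Vec<usize> },
}

fn layout_for(shape: &[usize], reference: &[usize], strides: &[usize]) -> LayoutSketch {
    if shape != reference {
        // Broadcast: zero the stride of every size-1 axis.
        let strides = shape
            .iter()
            .zip(reference)
            .zip(strides)
            .map(|((s, r), st)| if s == r { *st } else { 0 })
            .collect();
        LayoutSketch::Permuted { strides }
    } else {
        LayoutSketch::Linear { len: shape.iter().product() }
    }
}

fn main() {
    // Matching shapes stay on the cheap linear path.
    assert_eq!(
        layout_for(&[2, 3], &[2, 3], &[3, 1]),
        LayoutSketch::Linear { len: 6 }
    );
    // A broadcast row vector falls through to the permuted path.
    assert_eq!(
        layout_for(&[1, 3], &[2, 3], &[3, 1]),
        LayoutSketch::Permuted { strides: vec![0, 1] }
    );
}
```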

crates/cubecl-std/src/tensor/layout/permuted.rs

Lines changed: 47 additions & 0 deletions
```diff
@@ -21,6 +21,8 @@ pub struct PermutedLayout {
 }

 impl<'a, R: Runtime> PermutedLayoutLaunch<'a, R> {
+    /// Create a new permuted layout for a possibly broadcast tensor, with a reference shape to be
+    /// broadcast to.
     pub fn from_shape_strides(
         client: &ComputeClient<R::Server, R::Channel>,
         shape: &[usize],
@@ -45,6 +47,51 @@ impl<'a, R: Runtime> PermutedLayoutLaunch<'a, R> {
         Self::new(shape, strides, ScalarArg::new(len as u32), line_size)
     }

+    /// Create a new permuted layout for a possibly broadcast tensor, with a reference shape to be
+    /// broadcast to.
+    pub fn from_shapes_strides_ref(
+        client: &ComputeClient<R::Server, R::Channel>,
+        shape: &[usize],
+        reference_shape: &[usize],
+        strides: &[usize],
+        line_size: &'a u8,
+    ) -> Self {
+        debug_assert!(
+            shape.len() == reference_shape.len(),
+            "Shape and reference should have the same rank"
+        );
+        debug_assert!(
+            shape
+                .iter()
+                .zip(reference_shape)
+                .all(|(s, r)| s == r || *s == 1),
+            "Shape should be equal to reference or 1 on each dimension"
+        );
+
+        let strides: Vec<usize> = strides
+            .iter()
+            .zip(shape.iter().zip(reference_shape))
+            .map(|(stride, (s, r))| if *s == *r { *stride } else { 0 })
+            .collect();
+
+        Self::from_shape_strides(client, reference_shape, &strides, line_size)
+    }
+
+    pub fn from_handles_ref(
+        client: &ComputeClient<R::Server, R::Channel>,
+        handle: &TensorHandleRef<'_, R>,
+        reference_handle: &TensorHandleRef<'_, R>,
+        line_size: &'a u8,
+    ) -> Self {
+        Self::from_shapes_strides_ref(
+            client,
+            handle.shape,
+            reference_handle.shape,
+            handle.strides,
+            line_size,
+        )
+    }
+
     pub fn from_handle(
         client: &ComputeClient<R::Server, R::Channel>,
         handle: &TensorHandleRef<'_, R>,
```
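The stride-zeroing trick is worth spelling out: a size-1 axis gets stride 0, so when an index over the reference shape is decomposed into coordinates and dotted with the strides, every coordinate along a broadcast axis maps back to element 0. A standalone sketch of the resulting addressing (the coordinate decomposition is assumed row-major, as plain `shape`/`strides` pairs conventionally are):

```rust
/// Zero the stride of every broadcast axis, as in `from_shapes_strides_ref`.
fn broadcast_strides(shape: &[usize], reference: &[usize], strides: &[usize]) -> Vec<usize> {
    assert_eq!(shape.len(), reference.len(), "ranks must match");
    strides
        .iter()
        .zip(shape.iter().zip(reference))
        .map(|(stride, (s, r))| if *s == *r { *stride } else { 0 })
        .collect()
}

/// Map a linear index over the reference shape to an offset in the broadcast
/// tensor: decompose into coordinates, then dot with the (zeroed) strides.
fn offset(linear: usize, reference: &[usize], strides: &[usize]) -> usize {
    let mut rem = linear;
    let mut off = 0;
    for (dim, stride) in reference.iter().zip(strides).rev() {
        off += (rem % dim) * stride;
        rem /= dim;
    }
    off
}

fn main() {
    // A [1, 3] row vector broadcast against a [2, 3] reference shape:
    let strides = broadcast_strides(&[1, 3], &[2, 3], &[3, 1]);
    assert_eq!(strides, vec![0, 1]);
    // Both rows of the reference now read the same three elements.
    let offsets: Vec<usize> = (0..6).map(|i| offset(i, &[2, 3], &strides)).collect();
    assert_eq!(offsets, vec![0, 1, 2, 0, 1, 2]);
}
```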
