Commit cce2960

[WIP] towards pytorch.unfold()
1 parent 47b5fe8 commit cce2960

23 files changed: +492 -14 lines changed
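
For orientation before the per-file diffs: the semantics being targeted are those of PyTorch's `Tensor.unfold(dim, size, step)`, which extracts every complete window of length `size` along `dim`, advancing window starts by `step`. A minimal reference sketch of that rule for the 1-D case (the helper name `unfold_1d` is illustrative only, not part of this commit):

    fn unfold_1d<T: Copy>(data: &[T], size: usize, step: usize) -> Vec<Vec<T>> {
        // Number of complete windows: (len - size) / step + 1 when len >= size.
        let windows = if data.len() >= size {
            (data.len() - size) / step + 1
        } else {
            0
        };
        (0..windows)
            .map(|w| data[w * step..w * step + size].to_vec())
            .collect()
    }

    fn main() {
        // Seven elements, size 3, step 2 -> three windows.
        let x: Vec<i32> = (0..7).collect();
        assert_eq!(
            unfold_1d(&x, 3, 2),
            vec![vec![0, 1, 2], vec![2, 3, 4], vec![4, 5, 6]]
        );
    }

The backend implementations below avoid this copy entirely by returning a strided view.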

crates/burn-autodiff/src/ops/bool_tensor.rs

Lines changed: 4 additions & 0 deletions
@@ -107,4 +107,8 @@ impl<B: Backend, C: CheckpointStrategy> BoolTensorOps<Self> for Autodiff<B, C> {
     fn bool_repeat_dim(tensor: BoolTensor<B>, dim: usize, times: usize) -> BoolTensor<B> {
         B::bool_repeat_dim(tensor, dim, times)
     }
+
+    fn bool_unfold(tensor: BoolTensor<Self>, dim: usize, size: usize, step: usize) -> BoolTensor<Self> {
+        B::bool_unfold(tensor, dim, size, step)
+    }
 }

crates/burn-autodiff/src/ops/int_tensor.rs

Lines changed: 4 additions & 0 deletions
@@ -377,4 +377,8 @@ impl<B: Backend, C: CheckpointStrategy> IntTensorOps<Self> for Autodiff<B, C> {
     fn int_cast(tensor: IntTensor<Self>, dtype: IntDType) -> IntTensor<Self> {
         B::int_cast(tensor, dtype)
     }
+
+    fn int_unfold(tensor: IntTensor<Self>, dim: usize, size: usize, step: usize) -> IntTensor<Self> {
+        B::int_unfold(tensor, dim, size, step)
+    }
 }

crates/burn-autodiff/src/ops/tensor.rs

Lines changed: 4 additions & 0 deletions
@@ -2562,6 +2562,10 @@ impl<B: Backend, C: CheckpointStrategy> FloatTensorOps<Self> for Autodiff<B, C>
 
     // TODO: Implement float_prod and float_sum
     // https://github.com/tracel-ai/burn/issues/1458
+
+    fn float_unfold(tensor: FloatTensor<Self>, dim: usize, size: usize, step: usize) -> FloatTensor<Self> {
+        AutodiffTensor::new(B::float_unfold(tensor.primitive, dim, size, step))
+    }
 }
 
 #[derive(Debug, Clone)]
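
The autodiff implementations above only forward to the inner backend; no backward step is registered yet, consistent with the [WIP] marker. For reference, a sketch of what the gradient rule would have to do, assuming PyTorch's unfold semantics (not code from this commit): each input element accumulates the gradients of every window slot it appears in, so overlapping windows sum.

    // Hypothetical 1-D gradient accumulation for unfold(0, size, step).
    fn unfold_backward_1d(grad_windows: &[Vec<f32>], len: usize, size: usize, step: usize) -> Vec<f32> {
        let mut grad_input = vec![0.0f32; len];
        for (w, g) in grad_windows.iter().enumerate() {
            for i in 0..size {
                // Window w covers input positions w * step ..= w * step + size - 1;
                // overlapping windows accumulate into the same slot.
                grad_input[w * step + i] += g[i];
            }
        }
        grad_input
    }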

crates/burn-cubecl/src/ops/base.rs

Lines changed: 45 additions & 4 deletions
@@ -1,9 +1,6 @@
 use crate::{CubeRuntime, element::CubeElement, kernel, tensor::CubeTensor};
 use burn_common::tensor::{ReshapeAction, reshape_action};
-use burn_tensor::{
-    Shape, TensorData,
-    quantization::{QTensorPrimitive, QuantLevel},
-};
+use burn_tensor::{Shape, TensorData, quantization::{QTensorPrimitive, QuantLevel}};
 use cubecl::{server::CopyDescriptor, tensor_vectorization_factor};
 
 pub(crate) fn from_data<R: CubeRuntime>(data: TensorData, device: &R::Device) -> CubeTensor<R> {
@@ -213,3 +210,52 @@ pub(crate) fn max_line_size_many<R: CubeRuntime>(tensors: &[&CubeTensor<R>], dim
 
     vec.unwrap_or(0)
 }
+
+/// Unfold windows along a dimension.
+///
+/// Returns a view of the tensor containing all complete windows of size `size` in
+/// dimension `dim`, where consecutive windows advance by `step` elements.
+///
+/// The number of windows is `(shape[dim] - size) / step + 1` when `shape[dim] >= size`,
+/// and `0` otherwise.
+///
+/// # Arguments
+///
+/// * `tensor` - The input tensor to unfold, of shape `[pre..., shape[dim], post...]`.
+/// * `dim` - The dimension to unfold.
+/// * `size` - The size of each unfolded window.
+/// * `step` - The step between the starts of consecutive windows.
+///
+/// # Returns
+///
+/// A tensor view with shape `[pre..., windows, size, post...]`.
+pub fn unfold<R: CubeRuntime>(
+    tensor: CubeTensor<R>,
+    dim: usize,
+    size: usize,
+    step: usize,
+) -> CubeTensor<R> {
+    let d_shape = tensor.shape.dims[dim];
+    let d_stride = tensor.strides[dim];
+
+    // Only complete windows are kept; a window larger than the dimension yields none.
+    let windows = if d_shape >= size {
+        (d_shape - size) / step + 1
+    } else {
+        0
+    };
+
+    let mut shape = tensor.shape.clone();
+    shape.dims[dim] = windows;
+    shape.dims.insert(dim + 1, size);
+
+    let mut strides = tensor.strides.clone();
+    strides[dim] = step * d_stride;
+    strides.insert(dim + 1, d_stride);
+
+    CubeTensor {
+        shape,
+        strides,
+        ..tensor
+    }
+}
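
To sanity-check the view arithmetic in `unfold` above, a standalone sketch (plain Rust, no burn types) that walks a 1-D buffer through the same `[windows, size]` shape/stride pair the function constructs:

    fn main() {
        let data: Vec<i32> = (0..8).collect(); // shape [8], stride [1]
        let (size, step) = (3, 2);

        let d_stride = 1usize;
        let windows = if data.len() >= size {
            (data.len() - size) / step + 1
        } else {
            0
        };

        // The view: shape [windows, size], strides [step * d_stride, d_stride].
        let strides = [step * d_stride, d_stride];
        for w in 0..windows {
            let window: Vec<i32> = (0..size)
                .map(|i| data[w * strides[0] + i * strides[1]])
                .collect();
            println!("window {w}: {window:?}"); // [0,1,2], [2,3,4], [4,5,6]
        }
    }

No data is moved: both axes read through the original buffer, which is exactly why `unfold` can return `CubeTensor { shape, strides, ..tensor }` with everything else unchanged.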

crates/burn-cubecl/src/ops/bool_ops.rs

Lines changed: 5 additions & 1 deletion
@@ -7,7 +7,7 @@ use burn_tensor::ops::{BoolTensor, BoolTensorOps, Device, FloatTensor, IntTensor
 use burn_tensor::{Shape, TensorData};
 use std::ops::Range;
 
-use super::{expand, numeric, permute};
+use super::{expand, numeric, permute, unfold};
 
 impl<R, F, I, BT> BoolTensorOps<Self> for CubeBackend<R, F, I, BT>
 where
@@ -126,4 +126,8 @@ where
     fn bool_flip(tensor: BoolTensor<Self>, axes: &[usize]) -> BoolTensor<Self> {
         kernel::flip::<R, BT, BT>(tensor, axes)
     }
+
+    fn bool_unfold(tensor: BoolTensor<Self>, dim: usize, size: usize, step: usize) -> BoolTensor<Self> {
+        unfold(tensor, dim, size, step)
+    }
 }

crates/burn-cubecl/src/ops/float_ops.rs

Lines changed: 5 additions & 1 deletion
@@ -1,4 +1,4 @@
-use super::{expand, numeric, permute};
+use super::{expand, numeric, permute, unfold};
 use crate::kernel::prng::{random_bernoulli, random_normal, random_uniform};
 use crate::kernel::unary_basic::BasicFloatUnaryKind;
 use crate::kernel::{
@@ -683,4 +683,8 @@ where
             _ => unimplemented!("Unsupported floating point type cast"),
         }
     }
+
+    fn float_unfold(tensor: FloatTensor<Self>, dim: usize, size: usize, step: usize) -> FloatTensor<Self> {
+        unfold(tensor, dim, size, step)
+    }
 }

crates/burn-cubecl/src/ops/int_ops.rs

Lines changed: 5 additions & 1 deletion
@@ -1,6 +1,6 @@
 use self::unary_basic_int::BasicIntUnaryKind;
 
-use super::{expand, numeric, permute};
+use super::{expand, numeric, permute, unfold};
 use crate::{
     CubeBackend, CubeRuntime, FloatElement, IntElement,
     kernel::{
@@ -661,4 +661,8 @@ where
         }
         )
     }
+
+    fn int_unfold(tensor: IntTensor<Self>, dim: usize, size: usize, step: usize) -> IntTensor<Self> {
+        unfold(tensor, dim, size, step)
+    }
 }

crates/burn-fusion/src/ops/boolean.rs

Lines changed: 52 additions & 5 deletions
@@ -1,8 +1,8 @@
-use burn_ir::{
-    BaseOperationIr, BinaryOpIr, BoolOperationIr, CatOpIr, ExpandOpIr, FlipOpIr, HandleContainer,
-    InitOperationIr, OperationIr, PermuteOpIr, RepeatDimOpIr, SliceAssignOpIr, SliceOpIr,
-    SwapDimsOpIr, TensorIr, UnaryOpIr,
-};
+use burn_ir::{
+    BaseOperationIr, BinaryOpIr, BoolOperationIr, CatOpIr, ExpandOpIr, FlipOpIr, HandleContainer,
+    InitOperationIr, OperationIr, PermuteOpIr, RepeatDimOpIr, SliceAssignOpIr, SliceOpIr,
+    SwapDimsOpIr, TensorIr, UnaryOpIr, UnfoldOpIr,
+};
 use burn_tensor::{
     Device, Element, Shape, TensorData, TensorMetadata,
     ops::{BoolTensor, BoolTensorOps, FloatTensor, IntTensor, binary_ops_shape},
@@ -749,4 +749,55 @@ impl<B: FusionBackend> BoolTensorOps<Self> for Fusion<B> {
 
         out
     }
+
+    fn bool_unfold(tensor: BoolTensor<Self>, dim: usize, size: usize, step: usize) -> BoolTensor<Self> {
+        #[derive(new, Debug)]
+        struct UnfoldOps<B: FusionBackend> {
+            desc: UnfoldOpIr,
+            _b: PhantomData<B>,
+        }
+
+        impl<B: FusionBackend> Operation<B::FusionRuntime> for UnfoldOps<B> {
+            fn execute(&self, handles: &mut HandleContainer<B::Handle>) {
+                let input = handles.get_bool_tensor::<B>(&self.desc.input);
+                let output =
+                    B::bool_unfold(input, self.desc.dim, self.desc.size, self.desc.step);
+
+                handles.register_bool_tensor::<B>(&self.desc.out.id, output);
+            }
+        }
+
+        let mut streams = OperationStreams::default();
+        streams.tensor(&tensor);
+
+        let mut shape = tensor.shape().dims.clone();
+        let d_shape = shape[dim];
+        let windows = if d_shape >= size {
+            (d_shape - size) / step + 1
+        } else {
+            0
+        };
+        shape[dim] = windows;
+        shape.insert(dim + 1, size);
+
+        let out = tensor
+            .client
+            .tensor_uninitialized(shape.clone(), tensor.dtype);
+
+        let desc = UnfoldOpIr {
+            input: tensor.into_ir(),
+            out: out.to_ir_out(),
+            dim,
+            size,
+            step,
+        };
+
+        out.client.register(
+            streams,
+            OperationIr::BaseBool(BaseOperationIr::Unfold(desc.clone())),
+            UnfoldOps::<B>::new(desc),
+        );
+
+        out
+    }
 }
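
The same output-shape computation recurs in each of the three fusion implementations (here and in float.rs and int.rs below). Factored out as a hypothetical free function (not in the commit) with a couple of worked cases:

    fn unfold_shape(mut shape: Vec<usize>, dim: usize, size: usize, step: usize) -> Vec<usize> {
        let d_shape = shape[dim];
        // The window count replaces the unfolded dimension; the in-window axis
        // of length `size` is inserted right after it.
        shape[dim] = if d_shape >= size {
            (d_shape - size) / step + 1
        } else {
            0
        };
        shape.insert(dim + 1, size);
        shape
    }

    fn main() {
        // Unfolding dim 1 of [2, 10, 4] with size 4, step 3 gives 3 windows: [2, 3, 4, 4].
        assert_eq!(unfold_shape(vec![2, 10, 4], 1, 4, 3), vec![2, 3, 4, 4]);
        // A window larger than the dimension yields zero windows.
        assert_eq!(unfold_shape(vec![5], 0, 6, 1), vec![0, 6]);
    }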

crates/burn-fusion/src/ops/float.rs

Lines changed: 51 additions & 1 deletion
@@ -2264,4 +2264,55 @@ impl<B: FusionBackend> FloatTensorOps<Self> for Fusion<B> {
 
         out
     }
+
+    fn float_unfold(tensor: FloatTensor<Self>, dim: usize, size: usize, step: usize) -> FloatTensor<Self> {
+        #[derive(new, Debug)]
+        struct UnfoldOps<B: FusionBackend> {
+            desc: UnfoldOpIr,
+            _b: PhantomData<B>,
+        }
+
+        impl<B: FusionBackend> Operation<B::FusionRuntime> for UnfoldOps<B> {
+            fn execute(&self, handles: &mut HandleContainer<B::Handle>) {
+                let input = handles.get_float_tensor::<B>(&self.desc.input);
+                let output =
+                    B::float_unfold(input, self.desc.dim, self.desc.size, self.desc.step);
+
+                handles.register_float_tensor::<B>(&self.desc.out.id, output);
+            }
+        }
+
+        let mut streams = OperationStreams::default();
+        streams.tensor(&tensor);
+
+        let mut shape = tensor.shape().dims.clone();
+        let d_shape = shape[dim];
+        let windows = if d_shape >= size {
+            (d_shape - size) / step + 1
+        } else {
+            0
+        };
+        shape[dim] = windows;
+        shape.insert(dim + 1, size);
+
+        let out = tensor
+            .client
+            .tensor_uninitialized(shape.clone(), tensor.dtype);
+
+        let desc = UnfoldOpIr {
+            input: tensor.into_ir(),
+            out: out.to_ir_out(),
+            dim,
+            size,
+            step,
+        };
+
+        out.client.register(
+            streams,
+            OperationIr::BaseFloat(BaseOperationIr::Unfold(desc.clone())),
+            UnfoldOps::<B>::new(desc),
+        );
+
+        out
+    }
 }

crates/burn-fusion/src/ops/int.rs

Lines changed: 51 additions & 0 deletions
@@ -2176,4 +2176,55 @@ impl<B: FusionBackend> IntTensorOps<Self> for Fusion<B> {
 
         out
     }
+
+    fn int_unfold(tensor: IntTensor<Self>, dim: usize, size: usize, step: usize) -> IntTensor<Self> {
+        #[derive(new, Debug)]
+        struct UnfoldOps<B: FusionBackend> {
+            desc: UnfoldOpIr,
+            _b: PhantomData<B>,
+        }
+
+        impl<B: FusionBackend> Operation<B::FusionRuntime> for UnfoldOps<B> {
+            fn execute(&self, handles: &mut HandleContainer<B::Handle>) {
+                let input = handles.get_int_tensor::<B>(&self.desc.input);
+                let output =
+                    B::int_unfold(input, self.desc.dim, self.desc.size, self.desc.step);
+
+                handles.register_int_tensor::<B>(&self.desc.out.id, output);
+            }
+        }
+
+        let mut streams = OperationStreams::default();
+        streams.tensor(&tensor);
+
+        let mut shape = tensor.shape().dims.clone();
+        let d_shape = shape[dim];
+        let windows = if d_shape >= size {
+            (d_shape - size) / step + 1
+        } else {
+            0
+        };
+        shape[dim] = windows;
+        shape.insert(dim + 1, size);
+
+        let out = tensor
+            .client
+            .tensor_uninitialized(shape.clone(), tensor.dtype);
+
+        let desc = UnfoldOpIr {
+            input: tensor.into_ir(),
+            out: out.to_ir_out(),
+            dim,
+            size,
+            step,
+        };
+
+        out.client.register(
+            streams,
+            OperationIr::BaseInt(BaseOperationIr::Unfold(desc.clone())),
+            UnfoldOps::<B>::new(desc),
+        );
+
+        out
+    }
 }
