diff --git a/candle-core/examples/cuda_sum_benchmark.rs b/candle-core/examples/cuda_sum_benchmark.rs
index d6d182e8fc..5bd4b4eefe 100644
--- a/candle-core/examples/cuda_sum_benchmark.rs
+++ b/candle-core/examples/cuda_sum_benchmark.rs
@@ -10,7 +10,7 @@ use anyhow::Result;
 use candle_core::{Device, Tensor};
 
 fn cos_sin(n: usize, device: &Device) -> Result<Tensor> {
-    let thetas: Vec<_> = (0..n).map(|i| (i as f32 / n as f32)).collect();
+    let thetas: Vec<_> = (0..n).map(|i| i as f32 / n as f32).collect();
     let xs: Vec<_> = thetas.iter().map(|t| t.cos().abs()).collect();
     let ys: Vec<_> = thetas.iter().map(|t| t.sin().abs()).collect();
     let xs = Tensor::from_vec(xs, (n, 1), device)?;
diff --git a/candle-core/src/backprop.rs b/candle-core/src/backprop.rs
index a14306657b..6f96f916b7 100644
--- a/candle-core/src/backprop.rs
+++ b/candle-core/src/backprop.rs
@@ -122,6 +122,7 @@ impl Tensor {
                     | Op::MaxPool2D { arg: node, .. }
                     | Op::Copy(node)
                     | Op::Broadcast(node)
+                    | Op::Unfold(node, _, _, _)
                     | Op::Cmp(node, _)
                     | Op::Reduce(node, ReduceOp::Min | ReduceOp::Sum | ReduceOp::Max, _)
                     | Op::ToDevice(node)
@@ -495,6 +496,26 @@ impl Tensor {
                         let sum_grad = grads.or_insert(arg)?;
                         *sum_grad = sum_grad.add(&arg_grad.broadcast_as(sum_grad.dims())?)?;
                     }
+                    &Op::Unfold(ref arg, dim, size, step) => {
+                        // Scatter each window of the output gradient back onto the input
+                        // positions it was read from; overlapping windows accumulate.
+                        let sum_grad = grads.or_insert(arg)?;
+                        let extra_dim = arg.dims().len();
+                        let windows = node.dims()[dim];
+                        for widx in 0..windows {
+                            // Move the window-content dimension (appended last by unfold)
+                            // back to position `dim`.
+                            let window_slice = grad
+                                .get_on_dim(dim, widx)?
+                                .unsqueeze(dim)?
+                                .transpose(dim, extra_dim)?
+                                .squeeze(extra_dim)?;
+                            let start = widx * step;
+                            let end = start + size;
+                            let indexes = Tensor::arange(start as u32, end as u32, arg.device())?;
+                            *sum_grad = sum_grad.index_add(&indexes, &window_slice, dim)?;
+                        }
+                    }
                     Op::Reduce(arg, ReduceOp::Sum, reduced_dims) => {
                         let grad = broadcast_back(arg, &grad, reduced_dims)?;
                         let sum_grad = grads.or_insert(arg)?;
diff --git a/candle-core/src/op.rs b/candle-core/src/op.rs
index 8e24368ff1..5863807755 100644
--- a/candle-core/src/op.rs
+++ b/candle-core/src/op.rs
@@ -158,6 +158,7 @@ pub enum Op {
     ToDType(Tensor),
     Copy(Tensor),
     Broadcast(Tensor),
+    Unfold(Tensor, usize, usize, usize),
     Narrow(Tensor, usize, usize, usize),
     SliceScatter0(Tensor, Tensor, usize),
     Reshape(Tensor),
diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs
index d71630212d..8e33701c3f 100644
--- a/candle-core/src/tensor.rs
+++ b/candle-core/src/tensor.rs
@@ -1,5 +1,6 @@
 //! Tensors are N-dimensional matrixes of elements using a single data type.
 #![allow(clippy::redundant_closure_call)]
+
 use crate::backend::{BackendDevice, BackendStorage};
 use crate::op::{BackpropOp, BinaryOp, CmpOp, Op, ReduceOp, UnaryOp};
 use crate::scalar::TensorOrScalar;
@@ -2269,6 +2270,75 @@ impl Tensor {
         self.broadcast_as(shape)
     }
 
+    /// Unfold windows along a dimension.
+    ///
+    /// Returns a view of the tensor containing all complete windows of size `size` in
+    /// dimension `dim`, where consecutive windows start `step` elements apart.
+    ///
+    /// The number of windows is `(shape[dim] - size) / step + 1` (integer division), or 0
+    /// when `shape[dim] < size`.
+    ///
+    /// In the returned view the unfolded dimension is replaced by two dimensions: one in the
+    /// position of the original dimension, with size equal to the number of windows, and one
+    /// appended as the right-most dimension, with size `size`.
+    ///
+    /// # Arguments
+    ///
+    /// * `dim` - the dimension to unfold.
+    /// * `size` - the size of each unfolded window.
+    /// * `step` - the step between the starts of consecutive windows.
+    ///
+    /// # Returns
+    ///
+    /// A tensor view with shape `[pre=..., windows, post=..., size]`.
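+    ///
+    /// # Example
+    ///
+    /// An illustrative sketch of the window layout on a 1-D tensor (the shapes and values
+    /// follow from the formula above):
+    ///
+    /// ```rust
+    /// use candle_core::{Device, Tensor};
+    /// let t = Tensor::arange(0f32, 6f32, &Device::Cpu)?;
+    /// // Windows of size 3, advancing by 2: [0, 1, 2] and [2, 3, 4].
+    /// let w = t.unfold(0, 3, 2)?;
+    /// assert_eq!(w.dims(), &[2, 3]);
+    /// assert_eq!(w.to_vec2::<f32>()?, [[0., 1., 2.], [2., 3., 4.]]);
+    /// # Ok::<(), candle_core::Error>(())
+    /// ```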
+    pub fn unfold(&self, dim: usize, size: usize, step: usize) -> Result<Self> {
+        let mut shape = self.layout.shape().dims().to_vec();
+        let mut strides = self.layout.stride().to_vec();
+
+        let d_shape = shape[dim];
+        let d_stride = strides[dim];
+
+        // Number of complete windows of `size` that fit when advancing by `step`.
+        let tmp = d_shape + step;
+        let windows = if tmp < size { 0 } else { (tmp - size) / step };
+
+        // The unfolded dimension indexes the windows; the window contents become a new
+        // trailing dimension that strides through the original data.
+        shape[dim] = windows;
+        shape.push(size);
+
+        strides[dim] = step * d_stride;
+        strides.push(d_stride);
+
+        let unfold_layout = Layout::new(shape.into(), strides, self.layout.start_offset());
+
+        let tensor_ = Tensor_ {
+            id: TensorId::new(),
+            storage: self.storage.clone(),
+            layout: unfold_layout,
+            op: BackpropOp::new1(self, |arg| Op::Unfold(arg, dim, size, step)),
+            is_variable: false,
+            dtype: self.dtype,
+            device: self.device.clone(),
+        };
+        Ok(Tensor(Arc::new(tensor_)))
+    }
+
     /// Casts the input tensor to the target `dtype`.
     ///
     /// ```rust
diff --git a/candle-core/tests/grad_tests.rs b/candle-core/tests/grad_tests.rs
index b5e4e28094..3240e1f030 100644
--- a/candle-core/tests/grad_tests.rs
+++ b/candle-core/tests/grad_tests.rs
@@ -505,6 +505,21 @@ fn binary_grad(device: &Device) -> Result<()> {
     Ok(())
 }
 
+fn unfold_grad(device: &Device) -> Result<()> {
+    let data = &[[0f32, 1., 2., 3., 4.], [5f32, 6., 7., 8., 9.]];
+    let x = Var::new(data, device)?;
+    let unf = x.unfold(1, 3, 1)?;
+    let y = (&unf + 1.)?;
+    let grads = y.backward()?;
+    let grad_x = grads.get(&x).context("no grad for x")?;
+    // Each input element receives one unit of gradient per window that contains it.
+    assert_eq!(
+        grad_x.to_vec2::<f32>()?,
+        [[1., 2., 3., 2., 1.], [1., 2., 3., 2., 1.]]
+    );
+    Ok(())
+}
+
 #[test]
 fn test_flip_backprop() -> Result<()> {
     let device = &Device::Cpu;
@@ -555,6 +570,7 @@ test_device!(
     grad_descent_metal
 );
 test_device!(unary_grad, unary_grad_cpu, unary_grad_gpu, unary_grad_metal);
+test_device!(unfold_grad, unfold_grad_cpu, unfold_grad_gpu, unfold_grad_metal);
 test_device!(
     binary_grad,
     binary_grad_cpu,
diff --git a/candle-core/tests/tensor_tests.rs b/candle-core/tests/tensor_tests.rs
index d264cc0bd9..057ecadc28 100644
--- a/candle-core/tests/tensor_tests.rs
+++ b/candle-core/tests/tensor_tests.rs
@@ -767,6 +767,20 @@ fn broadcast(device: &Device) -> Result<()> {
     Ok(())
 }
 
+fn unfold(device: &Device) -> Result<()> {
+    let data = &[[0f32, 1., 2., 3., 4.], [5f32, 6., 7., 8., 9.]];
+    let tensor = Tensor::new(data, device)?;
+    let actual = tensor.unfold(1, 3, 2)?;
+    assert_eq!(
+        actual.to_vec3::<f32>()?,
+        &[
+            [[0f32, 1., 2.], [2f32, 3., 4.]],
+            [[5f32, 6., 7.], [7f32, 8., 9.]],
+        ]
+    );
+    Ok(())
+}
+
 fn slice_set(device: &Device) -> Result<()> {
     let (b, h, max_t, d) = (2, 4, 7, 3);
     let cache = Tensor::zeros((b, h, max_t, d), DType::F32, device)?;
@@ -1655,6 +1669,7 @@ test_device!(add_mul, add_mul_cpu, add_mul_gpu, add_mul_metal);
 test_device!(tensor_2d, tensor_2d_cpu, tensor_2d_gpu, tensor_2d_metal);
 test_device!(narrow, narrow_cpu, narrow_gpu, narrow_metal);
 test_device!(broadcast, broadcast_cpu, broadcast_gpu, broadcast_metal);
+test_device!(unfold, unfold_cpu, unfold_gpu, unfold_metal);
 test_device!(slice_set, ss_cpu, ss_gpu, ss_metal);
 test_device!(cat, cat_cpu, cat_gpu, cat_metal);
 test_device!(sum, sum_cpu, sum_gpu, sum_metal);