
Commit ecb9c3c

Author: Samantha Andow

Glu batching rule (forward + backward) (#665)

* glu forward
* glu backwards
1 parent 7f6dbe9 commit ecb9c3c

File tree

4 files changed: +39 -6 lines

functorch/csrc/BatchRulesActivation.cpp

Lines changed: 39 additions & 0 deletions
@@ -11,6 +11,43 @@
 // NB: most activation functions fit pointwise unary or binary rules.
 // These are only the ones that have special batch rules to help with organization
 namespace at { namespace functorch {
+std::tuple<Tensor,optional<int64_t>>
+glu_batch_rule(const Tensor& self, optional<int64_t> self_bdim, int64_t dim) {
+  // repeated error message from glu because 0D -> 1D when batched
+  // this can't pass anyway because a 0-dimensional tensor has "size" 1, which
+  // can't be evenly halved, but give a nicer error message here.
+  TORCH_CHECK(self.dim() > 1, "glu does not support 0-dimensional tensors");
+
+  const auto rank = rankWithoutBatchDim(self, self_bdim);
+  const auto dim_ = maybe_wrap_dim(dim, rank) + 1;
+
+  const auto self_ = moveBatchDimToFront(self, self_bdim);
+
+  const auto res = at::glu(self_, dim_);
+  return std::make_tuple(res, 0);
+}
+
+std::tuple<Tensor,optional<int64_t>> glu_backward_batch_rule(
+    const Tensor& grad_output, optional<int64_t> grad_output_bdim,
+    const Tensor& self, optional<int64_t> self_bdim, int64_t dim) {
+  if (self_bdim) {
+    // repeated error message from glu because 0D -> 1D when batched
+    // this can't pass anyway because a 0-dimensional tensor has "size" 1, which
+    // can't be evenly halved, but give a nicer error message here.
+    TORCH_CHECK(self.dim() > 1, "glu does not support 0-dimensional tensors");
+  }
+
+  const auto rank = rankWithoutBatchDim(self, self_bdim);
+  const auto dim_ = maybe_wrap_dim(dim, rank) + 1;
+
+  const auto batch_size = get_bdim_size2(grad_output, grad_output_bdim, self, self_bdim);
+  const auto grad_output_ = ensure_has_bdim(moveBatchDimToFront(grad_output, grad_output_bdim), grad_output_bdim.has_value(), batch_size);
+  const auto self_ = ensure_has_bdim(moveBatchDimToFront(self, self_bdim), self_bdim.has_value(), batch_size);
+
+  const auto res = at::glu_backward(grad_output_, self_, dim_);
+  return std::make_tuple(res, 0);
+}
+
 std::tuple<Tensor,optional<int64_t>> prelu_batch_rule(
     const Tensor& input, optional<int64_t> input_bdim,
     const Tensor& weight, optional<int64_t> weight_bdim) {

@@ -175,6 +212,8 @@ std::tuple<Tensor,optional<int64_t>,Tensor,optional<int64_t>> prelu_backward_bat
 }

 TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
+  VMAP_SUPPORT(glu_backward, glu_backward_batch_rule);
+  VMAP_SUPPORT(glu, glu_batch_rule);
   VMAP_SUPPORT(prelu, prelu_batch_rule)
   VMAP_SUPPORT(prelu_backward, prelu_backward_batch_rule)
 }
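The forward rule above moves the batch dimension to the front and bumps the logical dim by one before calling at::glu. A minimal Python sketch of the behavior this enables, assuming the functorch package built from this repo (tensor shapes are made up for illustration):

import torch
import torch.nn.functional as F
from functorch import vmap

x = torch.randn(3, 4, 6)  # batch of 3 examples, each of shape (4, 6)

# Per-example reference: glu over dim 1 halves it (size 6 -> two halves of 3).
expected = torch.stack([F.glu(xi, dim=1) for xi in x])

# Batched call: the rule moves the batch dim to the front and calls at::glu
# with the dim shifted by one (per-example dim=1 becomes dim=2 on the batch).
out = vmap(lambda xi: F.glu(xi, dim=1))(x)

assert torch.allclose(out, expected)  # out.shape == (3, 4, 3)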

functorch/csrc/BatchRulesUnaryOps.cpp

Lines changed: 0 additions & 1 deletion
@@ -113,7 +113,6 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   UNARY_POINTWISE_ALL(expm1);
   UNARY_POINTWISE_ALL(floor);
   UNARY_POINTWISE_ALL(frac);
-  UNARY_POINTWISE(glu);
   UNARY_POINTWISE(isfinite);
   UNARY_POINTWISE(isnan);
   UNARY_POINTWISE(isinf);
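glu is removed from the unary pointwise registrations because it is not pointwise: it splits the input in half along dim and gates the first half with the sigmoid of the second, so the result depends on which dimension is halved and batching must adjust that dimension. A short sketch of the decomposition (shapes chosen arbitrarily):

import torch
import torch.nn.functional as F

x = torch.randn(2, 8)

# glu halves the chosen dim; the first half is gated by the sigmoid of the second.
a, b = x.chunk(2, dim=-1)
manual = a * torch.sigmoid(b)

assert torch.allclose(F.glu(x, dim=-1), manual)  # result shape (2, 4), not (2, 8)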

test/test_ops.py

Lines changed: 0 additions & 3 deletions
@@ -610,7 +610,6 @@ def vjp_of_vjp(*args_and_cotangents):
     skip('qr'),  # Nondetermistic
     xfail('_masked.prod'),  # calls aten::item
     xfail('stft'),
-    xfail('nn.functional.glu'),
     xfail('nn.functional.fractional_max_pool3d'),
     xfail('as_strided'),
     xfail('nn.functional.fractional_max_pool2d'),

@@ -954,7 +953,6 @@ def test():
     xfail('nn.functional.huber_loss'),
     xfail('nn.functional.poisson_nll_loss'),
     xfail('nn.functional.bilinear'),
-    xfail('nn.functional.glu'),
     xfail('nn.functional.fractional_max_pool3d'),
     xfail('as_strided'),
     xfail('linalg.solve_triangular'),

@@ -1018,7 +1016,6 @@ def test():
     xfail('masked_select'),
     skip('nn.functional.fractional_max_pool3d'),  # generator works on cpu, fails on cuda
     xfail('__rpow__'),  # https://github.com/pytorch/functorch/issues/617
-    xfail('nn.functional.glu'),
     xfail('as_strided'),
     skip('nn.functional.fractional_max_pool2d'),  # generator works on cpu, fails on cuda
     skip('solve'),
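The xfails removed here cover gradient transforms composed with vmap; with the glu_backward batch rule registered, per-example gradients through glu can be computed in one batched call. A hedged sketch using functorch's grad and vmap (the loss below is just an arbitrary scalar reduction):

import torch
import torch.nn.functional as F
from functorch import grad, vmap

x = torch.randn(5, 4, 6)

# Per-example gradient of an arbitrary scalar function of glu; under vmap this
# dispatches to the glu_backward batch rule added in BatchRulesActivation.cpp.
per_example_grads = vmap(grad(lambda xi: F.glu(xi, dim=-1).sum()))(x)

assert per_example_grads.shape == x.shape  # one gradient per example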

test/test_vmap.py

Lines changed: 0 additions & 2 deletions
@@ -3115,7 +3115,6 @@ class TestVmapOperatorsOpInfo(TestCase):
     xfail('nn.functional.fractional_max_pool2d'),
     xfail('nn.functional.embedding_bag'),
     xfail('nonzero'),
-    xfail('nn.functional.glu'),
     xfail('nn.functional.rrelu'),  # random?
     xfail('__rpow__'),  # https://github.com/pytorch/functorch/issues/617
     xfail('bernoulli', ''),

@@ -3251,7 +3250,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
     xfail('nn.functional.fractional_max_pool2d'),
     xfail('stft'),
     xfail('linalg.solve_triangular'),
-    xfail('nn.functional.glu'),
     xfail('isclose'),
     xfail('nn.functional.fractional_max_pool3d'),
     xfail('nn.functional.bilinear'),
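The exhaustive vmap tests compare vmap output against a per-example loop for each OpInfo sample; a stripped-down sketch of that check for glu, here with the batch dimension not at the front (shapes and in_dims are illustrative, not taken from the test suite):

import torch
import torch.nn.functional as F
from functorch import vmap

x = torch.randn(4, 3, 6)  # batch dim is dim 1 (size 3)

# vmap with in_dims=1: the batch dim is moved to the front before the glu
# batch rule runs, then placed back according to out_dims=1.
out = vmap(lambda xi: F.glu(xi, dim=-1), in_dims=1, out_dims=1)(x)

expected = torch.stack([F.glu(x[:, i], dim=-1) for i in range(x.size(1))], dim=1)
assert torch.allclose(out, expected)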

0 commit comments