This repository was archived by the owner on Aug 21, 2025. It is now read-only.

Commit deb706d

Author: Samantha Andow
fix gelu_backward (#640)
Parent: 752b27b

File tree

functorch/csrc/BatchRulesBinaryOps.cpp
test/test_ops.py

2 files changed: +18 -7 lines


functorch/csrc/BatchRulesBinaryOps.cpp

Lines changed: 18 additions & 1 deletion

@@ -203,6 +203,23 @@ std::tuple<Tensor,optional<int64_t>> where_self_batch_rule(
   return std::make_tuple(at::where(condition_, self_, other_), 0);
 }
 
+std::tuple<Tensor, optional<int64_t>> gelu_backward_batch_rule(
+    const Tensor& grad_out, optional<int64_t> grad_out_bdim, const Tensor& input, optional<int64_t> input_bdim,
+    c10::string_view approximate) {
+
+  // repeat the preprocessing from _binary_pointwise_batch_rule
+  const auto tensor_other = _binary_pointwise_helper(grad_out, grad_out_bdim, input, input_bdim);
+  auto grad_out_ = std::get<0>(tensor_other);
+  auto input_ = std::get<1>(tensor_other);
+
+  // gelu_backward doesn't broadcast well so we need to insist all inputs have a bdim
+  const auto batch_size = get_bdim_size2(grad_out, grad_out_bdim, input, input_bdim);
+  grad_out_ = ensure_has_bdim(grad_out_, grad_out_bdim.has_value(), batch_size);
+  input_ = ensure_has_bdim(input_, input_bdim.has_value(), batch_size);
+
+  return std::make_tuple(at::gelu_backward(grad_out_, input_, approximate), 0);
+}
+
 std::tuple<Tensor,optional<int64_t>> masked_select_batch_rule(
     const Tensor& self, optional<int64_t> self_bdim,
     const Tensor& mask, optional<int64_t> mask_bdim) {
@@ -399,7 +416,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   BINARY_POINTWISE(leaky_relu_backward);
   BINARY_POINTWISE(logit_backward);
   POINTWISE_BOXED(log_sigmoid_backward);
-  BINARY_POINTWISE(gelu_backward);
+  VMAP_SUPPORT(gelu_backward, gelu_backward_batch_rule);
   BINARY_POINTWISE(sigmoid_backward);
   POINTWISE_BOXED(softplus_backward);
   BINARY_POINTWISE(softshrink_backward);
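
With the batch rule registered via VMAP_SUPPORT, a vmapped call that reaches gelu_backward no longer requires every argument to already carry a batch dimension. Below is a minimal sketch of what this enables; it is not part of the commit and assumes only functorch's public vmap/grad API and torch.nn.functional.gelu:

# Sketch only -- illustrates the call pattern the batch rule above supports.
import torch
import torch.nn.functional as F
from functorch import grad, vmap

x = torch.randn(8, 5)  # a batch of 8 independent inputs

# grad(...) of this scalar-valued function calls gelu_backward under the hood;
# vmapping it dispatches to gelu_backward_batch_rule, which makes sure both
# grad_out and input carry a batch dimension before calling at::gelu_backward.
per_sample_grads = vmap(grad(lambda t: F.gelu(t).sum()))(x)
print(per_sample_grads.shape)  # torch.Size([8, 5])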

test/test_ops.py

Lines changed: 0 additions & 6 deletions

@@ -625,8 +625,6 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('index_put', ''),
         xfail('lu_solve'),
         xfail('index_copy'),
-        xfail('nn.functional.gelu', device_type='cpu'),
-
         xfail('linalg.lu_factor', ''),
     })
 
@@ -703,9 +701,6 @@ def test_vmapvjp(self, device, dtype, op):
         xfail('nanquantile'),
         xfail('quantile'),
 
-        # RuntimeError: vmap: inplace arithmetic(self, *extra_args)
-        xfail('nn.functional.gelu', device_type='cpu'),
-
         # Not implemented
         xfail('scatter'),
 
@@ -778,7 +773,6 @@ def test_vmapjvp(self, device, dtype, op):
         xfail('maximum'),
         xfail('linalg.householder_product'),
         xfail('tensor_split'),
-        xfail('nn.functional.gelu', device_type='cpu'),
         xfail('quantile'),
         xfail('var_mean'),
         xfail('as_strided'),
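
The removed xfail entries had marked nn.functional.gelu as an expected failure on CPU in the vmap-plus-autograd test lists; with the new batch rule those tests are expected to pass. A rough manual check in the same spirit is sketched below; it is not taken from the test suite and assumes functorch's vjp and vmap, comparing the vmapped vjp of gelu against a per-example loop:

# Sketch only -- a hand-rolled version of the kind of check these tests perform.
import torch
import torch.nn.functional as F
from functorch import vjp, vmap

x = torch.randn(4, 3)
cotangents = torch.randn(4, 3)

def gelu_vjp(xi, cti):
    # vjp returns (output, vjp_fn); vjp_fn maps a cotangent to input gradients
    _, vjp_fn = vjp(F.gelu, xi)
    return vjp_fn(cti)[0]

batched = vmap(gelu_vjp)(x, cotangents)  # exercises gelu_backward under vmap
looped = torch.stack([gelu_vjp(xi, cti) for xi, cti in zip(x, cotangents)])
assert torch.allclose(batched, looped)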
