
Commit 7c4453d

More batch rule fixes (#348)
1 parent cd881a7 commit 7c4453d

File tree (5 files changed, +94 −8 lines changed):
  functorch/csrc/BatchRulesDecompositions.cpp
  functorch/csrc/PyTorchOperatorHacks.cpp
  test/discover_coverage.py
  test/test_ops.py
  test/test_vmap.py


functorch/csrc/BatchRulesDecompositions.cpp

Lines changed: 1 addition & 0 deletions
@@ -234,6 +234,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   OP_DECOMPOSE2(conv2d, padding);
   OP_DECOMPOSE2(conv3d, padding);
   OP_DECOMPOSE(_convolution_mode);
+  OP_DECOMPOSE(frobenius_norm);
   OP_DECOMPOSE(type_as);
   DECOMPOSE_FUNCTIONAL(diag_embed);
   DECOMPOSE_FUNCTIONAL(block_diag);
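For context: decomposing frobenius_norm means vmap no longer needs a dedicated batch rule for the no-dim overload; the call falls through to a composite implementation built from ops that already have rules (the frobenius_norm.dim overload is rerouted separately in PyTorchOperatorHacks.cpp below). A minimal sketch of what this enables, assuming the functorch-era import path:

import torch
from functorch import vmap

x = torch.randn(8, 3, 3)  # a batch of 8 matrices

# Per-sample Frobenius norm under vmap; with the decomposition this routes
# through operations that already have batch rules.
per_sample = vmap(lambda m: torch.norm(m, p='fro'))(x)

# Reference computed without vmap.
reference = torch.stack([torch.norm(m, p='fro') for m in x])
assert torch.allclose(per_sample, reference)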

functorch/csrc/PyTorchOperatorHacks.cpp

Lines changed: 91 additions & 0 deletions
@@ -2,6 +2,8 @@
 #include <functorch/csrc/Constants.h>
 #include <torch/library.h>
 #include <ATen/ATen.h>
+#include <functorch/csrc/TensorWrapper.h>
+#include <functorch/csrc/BatchedTensorImpl.h>
 
 namespace at { namespace functorch {
 
@@ -34,9 +36,98 @@ Tensor index_select_backward_hack(const Tensor& grad, IntArrayRef self_sizes, in
   return at::zeros(self_sizes, grad.options()).index_add(dim, index, grad);
 }
 
+// TODO: https://github.com/pytorch/pytorch/issues/69991
+Tensor frobenius_norm_dim_hack(const Tensor& self, IntArrayRef dim, bool keepdim) {
+  if (dim.size() == 1 || dim.size() == 0) {
+    return at::norm(self, 2, dim, keepdim);
+  } else {
+    auto dim_ = dim.vec();
+    maybe_wrap_dims(dim_, self.dim());
+    TORCH_CHECK(dim_[0] != dim_[1], "Expected dims to be different, got ", dim, " instead");
+    if (self.is_complex()) {
+      return at::sqrt(at::sum(at::real(self.conj() * self), dim_, keepdim));
+    } else {
+      return at::sqrt(at::sum((self * self), dim_, keepdim));
+    }
+  }
+}
+
+static optional<std::tuple<Tensor,int64_t>> unwrap(const Tensor& tensor) {
+  auto* wrapped = maybeGetTensorWrapper(tensor);
+  if (wrapped) {
+    if (wrapped->level().has_value()) {
+      return std::make_tuple(wrapped->value(), *wrapped->level());
+    }
+    return unwrap(wrapped->value());
+  }
+  auto* batched = maybeGetBatchedImpl(tensor);
+  if (batched) {
+    return std::make_tuple(batched->value(), batched->level());
+  }
+  return nullopt;
+}
+
+static bool can_perform_inplace(const Tensor& a, const Tensor& b) {
+  // TODO: generalize this to more transforms
+  auto a_ = unwrap(a);
+  auto b_ = unwrap(b);
+  if (!a_.has_value() && b_.has_value()) {
+    return false;
+  }
+  if (!a_.has_value() && !b_.has_value()) {
+    return true;
+  }
+  if (a_.has_value() && !b_.has_value()) {
+    return true;
+  }
+  TORCH_INTERNAL_ASSERT(a_.has_value() && b_.has_value());
+
+  // If b has any wrapper that a does not, then we cannot do a.inplace_(b)
+  if (std::get<1>(*a_) < std::get<1>(*b_)) {
+    return false;
+  }
+  if (std::get<1>(*a_) > std::get<1>(*b_)) {
+    return can_perform_inplace(std::get<0>(*a_), b);
+  }
+  return can_perform_inplace(std::get<0>(*a_), std::get<0>(*b_));
+}
+
+// TODO: linear is pretty important for performance, but I'm not sure how to work
+// around the in-place.
+Tensor linear_hack(const Tensor& input, const Tensor& weight, const c10::optional<Tensor>& bias_opt) {
+  // See [Note: hacky wrapper removal for optional tensor]
+  auto bias = bias_opt.has_value()
+    ? c10::MaybeOwned<Tensor>::borrowed(*bias_opt)
+    : c10::MaybeOwned<Tensor>::owned(c10::in_place);
+
+  if (input.is_mkldnn()) {
+    return at::mkldnn_linear(input, weight, *bias);
+  }
+#if defined(C10_MOBILE)
+  if (xnnpack::use_linear(input, weight, *bias)) {
+    return xnnpack::linear(input, weight, *bias);
+  }
+#endif
+  if (input.dim() == 2 && bias->defined()) {
+    // Fused op is marginally faster.
+    return at::addmm(*bias, input, weight.t());
+  }
+  auto output = at::matmul(input, weight.t());
+  if (bias->defined()) {
+    // TODO(rzou): I'm a little uncomfortable with this
+    if (can_perform_inplace(output, *bias)) {
+      return output.add_(*bias);
+    }
+    return output.add(*bias);
+  }
+  return output;
+}
+
 TORCH_LIBRARY_IMPL(aten, FT_DYNAMIC_LAYER_FRONT_MODE_KEY, m) {
   m.impl("value_selecting_reduction_backward", value_selecting_reduction_backward_hack);
   m.impl("index_select_backward", index_select_backward_hack);
+  m.impl("frobenius_norm.dim", frobenius_norm_dim_hack);
+  m.impl("linear", linear_hack);
 }
 
 }}
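A rough Python-level sketch of what the two new hacks compute (illustrative only; frobenius_norm_dim_ref and linear_ref are not names in the codebase). frobenius_norm_dim_ref mirrors the multi-dim branch of frobenius_norm_dim_hack, and linear_ref mirrors linear_hack's generic matmul-plus-bias path, which uses an out-of-place add unless can_perform_inplace says the in-place add_ is safe:

import torch

def frobenius_norm_dim_ref(x, dim, keepdim=False):
    # sqrt(sum(real(conj(x) * x))) for complex inputs, sqrt(sum(x * x)) otherwise
    if x.is_complex():
        return torch.sqrt(torch.sum(torch.real(x.conj() * x), dim=dim, keepdim=keepdim))
    return torch.sqrt(torch.sum(x * x, dim=dim, keepdim=keepdim))

def linear_ref(input, weight, bias=None):
    # matmul with the transposed weight, then an out-of-place bias add
    output = input @ weight.t()
    return output if bias is None else output + bias

x = torch.randn(2, 5, 4)
assert torch.allclose(frobenius_norm_dim_ref(x, (1, 2)),
                      torch.norm(x, p='fro', dim=(1, 2)))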

test/discover_coverage.py

Lines changed: 1 addition & 0 deletions
@@ -374,6 +374,7 @@ def print_coverage_info(th=100, nn=25):
         'torch.nonzero', # dynamic
         'torch.masked_select', # dynamic
         'torch.prod', # dynamic (backward)
+        'torch.norm', # norm with nuc is not commonly used.
     }
     remove_from_set(statuses['test_vmap_exhaustive'], vmap_exemptions)
     remove_from_set(statuses['test_vmapvjp'], vmap_exemptions)

test/test_ops.py

Lines changed: 1 addition & 7 deletions
@@ -459,7 +459,6 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('masked_scatter'),
         xfail('matrix_exp'),
         xfail('nanquantile'),
-        xfail('norm', 'fro'),
         xfail('norm', 'nuc'),
         xfail('prod'),
         xfail('put'),
@@ -481,17 +480,13 @@ def vjp_of_vjp(*args_and_cotangents):
         xfail('_masked.prod'), # calls aten::item
         xfail('stft'),
         xfail('nn.functional.glu'),
-
         xfail('nn.functional.fractional_max_pool3d'),
         xfail('as_strided'),
         xfail('nn.functional.fractional_max_pool2d'),
     })
     @ops(functorch_lagging_op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vmapvjp', vmapvjp_fail)
     def test_vmapvjp(self, device, dtype, op):
-        # These are too annoying to put into the list above
-        if op.name in {'nn.functional.linear'}:
-            self.skipTest("Skipped! ExpectedF failures")
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")
             return
@@ -741,7 +736,6 @@ def test_vmapjvpall(self, device, dtype, op):
         xfail('nn.functional.conv_transpose2d'),
         xfail('nn.functional.gelu'),
         xfail('nn.functional.pad', 'circular'),
-        xfail('norm', 'fro'),
         xfail('norm', 'nuc'),
         xfail('pinverse'),
         xfail('prod'),
@@ -794,7 +788,7 @@ def test_vmapjvpall(self, device, dtype, op):
     }))
     def test_vmapvjp_has_batch_rule(self, device, dtype, op):
         # These are too annoying to put into the list above
-        if op.name in {'nn.functional.linear', 'nn.functional.conv2d'}:
+        if op.name in {'nn.functional.conv2d'}:
             self.skipTest("Skipped! ExpectedF failures")
         if not op.supports_autograd:
             self.skipTest("Skipped! Autograd not supported.")

test/test_vmap.py

Lines changed: 0 additions & 1 deletion
@@ -3197,7 +3197,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('masked_scatter'),
         xfail('masked_select'),
         xfail('nanquantile'),
-        xfail('norm', 'fro'),
         xfail('norm', 'nuc'),
         xfail('ormqr'),
         xfail('put'),
