
Commit abe4c4d

add batching rule for block_diag, kill DECOMPOSE_FUNCTIONAL (#814)
* add batching rule for block_diag, kill DECOMPOSE_FUNCTIONAL
1 parent 9fa0265 commit abe4c4d
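For context, a rough sketch of the user-facing behavior this commit targets, assuming functorch's vmap API of this era; the tensor names and shapes below are illustrative, not taken from the commit. With a real batching rule registered for block_diag, vmap over torch.block_diag no longer relies on the functionalize-and-decompose fallback removed here.

# Illustrative sketch only; assumes the standalone functorch package.
import torch
from functorch import vmap

# Two batches of small blocks; dim 0 is the vmapped dimension.
a = torch.randn(3, 2, 2)
b = torch.randn(3, 4, 4)

# Per sample, block_diag builds a (2+4) x (2+4) block-diagonal matrix,
# so the vmapped result has shape (3, 6, 6).
out = vmap(torch.block_diag)(a, b)

# Matches a manual loop over the batch dimension.
expected = torch.stack([torch.block_diag(a[i], b[i]) for i in range(3)])
assert torch.allclose(out, expected)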

File tree

4 files changed: +29 -86 lines changed


functorch/csrc/BatchRulesDecompositions.cpp

Lines changed: 0 additions & 84 deletions
@@ -15,88 +15,6 @@
 
 namespace at { namespace functorch {
 
-at::Tensor sync_and_unwrap_functional_output(at::Tensor out_functional) {
-  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(out_functional));
-  auto out_wrapper_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(out_functional);
-  out_wrapper_impl->sync_();
-  auto out_unwrapped = out_wrapper_impl->value();
-  return out_unwrapped;
-}
-
-c10::List<at::Tensor> sync_and_unwrap_functional_output(const c10::List<at::Tensor>& t_list) {
-  c10::List<Tensor> outputs;
-  outputs.reserve(t_list.size());
-  for (const auto i : c10::irange(t_list.size())) {
-    outputs.push_back(sync_and_unwrap_functional_output(t_list[i]));
-  }
-  return outputs;
-}
-
-void decompose_functional(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
-  const auto& schema = op.schema();
-
-  const auto num_arguments = schema.arguments().size();
-  const auto arguments = torch::jit::last(stack, num_arguments);
-  const auto arguments_begin = stack->size() - num_arguments;
-  //
-  // Step 1: Wrap any tensor inputs into Functional tensors
-  // and put them on the stack at the correct indices.
-  for (const auto idx : c10::irange(arguments.size())) {
-    const auto& ivalue = arguments[idx];
-    if (ivalue.isTensor()) {
-      auto functional_ivalue = at::functionalization::impl::to_functional_tensor(ivalue.toTensor());
-      (*stack)[arguments_begin + idx] = std::move(functional_ivalue);
-    } else if (ivalue.isTensorList()) {
-      auto functional_ivalue = at::functionalization::impl::to_functional_tensor(ivalue.toTensorList());
-      (*stack)[arguments_begin + idx] = std::move(functional_ivalue);
-    }
-  }
-
-  // Step 2: set up TLS such that we hit the functionalization kernels before the batching rules.
-  // Note: this relies on the fact that Functionalize > FuncTorchBatched in DispatchKey.h.
-  // Also, adding Functionalize to the include set isn't enough: we also need to remove it from the exclude set.
-  // That's because functorch DynamicLayer logic may have added Functionalize to the exclude set beforehand.
-  auto local_keyset = c10::impl::tls_local_dispatch_key_set();
-  local_keyset.excluded_ = local_keyset.excluded_.remove(c10::DispatchKey::Functionalize);
-  local_keyset.included_ = local_keyset.included_.add(c10::DispatchKey::Functionalize);
-  c10::impl::ForceDispatchKeyGuard guard(local_keyset);
-
-  at::functionalization::impl::FunctionalizationReapplyViewsGuard functional_guard(true);
-
-  // Step 3: redispatch to native kernel
-  // TODO: this is technically kind of sketchy, since we're relying on the fact
-  // that the composite kernel is registered to a particular dispatch key.
-  // In reality, a C++ extension could register their own custom kernels to any dispatch key, which would override
-  // the composite kernel entry.
-  // I'm using CPU because C++ extensions that register custom kernels to existing composite operators are pretty uncommon,
-  // and only really matter for out-of-tree keys like XLA.
-  // I wonder if we should make "alias dispatch key kernels" a runtime-accessible property on the OperatorHandle?
-  op.redispatchBoxed(c10::DispatchKeySet(c10::DispatchKey::CPU), stack);
-
-  const auto& schema_returns = op.schema().returns();
-  const auto& num_returns = schema_returns.size();
-  auto returns = torch::jit::last(stack, num_returns);
-  const auto returns_begin = stack->size() - num_returns;
-
-  // Step 4: Unwrap each functional output tensor, syncing any pending updates
-  for (const auto idx : c10::irange(returns.size())) {
-    if (returns[idx].isTensor()) {
-      const auto& out_functional = returns[idx].toTensor();
-      auto out_unwrapped = sync_and_unwrap_functional_output(out_functional);
-      (*stack)[returns_begin + idx] = c10::IValue(out_unwrapped);
-    } else if (returns[idx].isTensorList()) {
-      const auto& out_functional = returns[idx].toTensorList();
-      auto out_unwrapped = sync_and_unwrap_functional_output(out_functional);
-      (*stack)[returns_begin + idx] = c10::IValue(out_unwrapped);
-    }
-  }
-}
-
-
-#define DECOMPOSE_FUNCTIONAL(op) \
-  m.impl(#op, torch::CppFunction::makeFromBoxedFunction<&decompose_functional>());
-
-
 #define OP_DECOMPOSE(op) m.impl(#op, static_cast<decltype(&ATEN_FN(op))>(native::op));
 #define OP_DECOMPOSE2(op, overload) m.impl(#op"."#overload, static_cast<decltype(&ATEN_FN2(op, overload))>(native::op));
 

@@ -315,8 +233,6 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   OP_DECOMPOSE(pad);
   OP_DECOMPOSE(_pad_circular);
 
-  DECOMPOSE_FUNCTIONAL(block_diag);
-
   // divide, alias for div
   OP_DECOMPOSE2(divide, Tensor);
   OP_DECOMPOSE2(divide_, Tensor);

functorch/csrc/BatchingRegistrations.cpp

Lines changed: 29 additions & 0 deletions
@@ -557,6 +557,34 @@ Tensor cat_batching_rule(TensorList tensors, int64_t dim) {
   return physical_views[0].getPhysicalToLogicalMap().apply(result);
 }
 
+Tensor block_diag_batching_rule(TensorList tensors) {
+  if (!participatesInCurrentLevel(tensors)) {
+    c10::impl::ExcludeDispatchKeyGuard guard(kBatchedKey);
+    return at::block_diag(tensors);
+  }
+  auto physical_views = MultiBatchVmapTransform::logicalToPhysical(tensors);
+  auto physical_tensors = fmap(
+      physical_views, [](const VmapPhysicalView& view) -> Tensor { return view.tensor(); });
+  TORCH_INTERNAL_ASSERT(
+      tensors.size() > 0, "The dispatcher should not have dispatched here otherwise.");
+  // Implementing this as a dummy for loop for now, since I'm not sure how to do it any better.
+  // I'm probably not accounting for potentially multiple batched dimensions?
+  auto bdim = physical_tensors[0].size(0);
+  std::vector<Tensor> batched_outputs;
+  batched_outputs.reserve(bdim);
+  for (const auto& i : c10::irange(bdim)) {
+    std::vector<Tensor> inputs_for_batch;
+    inputs_for_batch.reserve(physical_tensors.size());
+    for (const auto& t : physical_tensors) {
+      inputs_for_batch.push_back(t[i]);
+    }
+    auto out_for_batch = at::block_diag(inputs_for_batch);
+    batched_outputs.push_back(out_for_batch.unsqueeze(0));
+  }
+  auto result = at::cat(batched_outputs);
+  return physical_views[0].getPhysicalToLogicalMap().apply(result);
+}
+
 Tensor stack_batching_rule(TensorList tensors, int64_t dim) {
   if (!participatesInCurrentLevel(tensors)) {
     c10::impl::ExcludeDispatchKeyGuard guard(kBatchedKey);

@@ -666,6 +694,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   m.impl("split_with_sizes", split_with_sizes_batching_rule);
   m.impl("unbind.int", unbind_batching_rule);
   m.impl("cat", cat_batching_rule);
+  m.impl("block_diag", block_diag_batching_rule);
   m.impl("stack", stack_batching_rule);
 
   // still legacy b/c needs special inplace rules
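The loop inside block_diag_batching_rule above amounts to a slice, apply, and concatenate decomposition over the batch dimension. A rough Python sketch of that logic follows, assuming batch-dim-first physical tensors; the function name block_diag_batched and the example shapes are illustrative stand-ins, not part of the commit.

# Sketch of the per-batch decomposition performed by the C++ rule above.
# `physical_tensors` stands in for the batch-dim-first physical tensors;
# this is an illustration, not the actual dispatcher-level code path.
import torch

def block_diag_batched(physical_tensors):
    bdim = physical_tensors[0].shape[0]
    batched_outputs = []
    for i in range(bdim):
        # Slice out batch element i from every input, ...
        inputs_for_batch = [t[i] for t in physical_tensors]
        # ... run the unbatched op, ...
        out_for_batch = torch.block_diag(*inputs_for_batch)
        # ... and re-add a leading batch dimension.
        batched_outputs.append(out_for_batch.unsqueeze(0))
    # Concatenate along the batch dimension, mirroring at::cat in the rule.
    return torch.cat(batched_outputs)

x = torch.randn(3, 2, 2)
y = torch.randn(3, 1, 5)
print(block_diag_batched([x, y]).shape)  # torch.Size([3, 3, 7])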

test/test_ops.py

Lines changed: 0 additions & 1 deletion
@@ -1136,7 +1136,6 @@ def get_vjp(cotangents, *primals):
         xfail('_masked.softmin', ''),
         xfail('amax', ''),
         xfail('amin', ''),
-        xfail('block_diag', ''),
         xfail('cdist', ''),
         xfail('cholesky', ''),
         xfail('eig', ''),

test/test_vmap.py

Lines changed: 0 additions & 1 deletion
@@ -3262,7 +3262,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('linalg.lu_factor_ex', ''),
         xfail('diagflat', ''),
         xfail('special.log_ndtr'),
-        xfail('block_diag'),  # aten::slice_copy.Tensor hit the vmap fallback which is currently disabled
         xfail('nn.functional.triplet_margin_loss', ''),
         xfail('nn.functional.pdist', ''),
         xfail('scatter_reduce', 'sum'),
