
Commit ea180d2

Fix batch rule for max_pool2d

1 parent: c73dd32

4 files changed: +24 −34 lines

functorch/csrc/BatchRulesPooling.cpp
Lines changed: 24 additions & 0 deletions

@@ -62,12 +62,36 @@ Tensor max_pool2d_with_indices_backward_plumbing(const Tensor & grad_output, con
   return slow_fallback<Tensor>(op, { grad_output, self, kernel_size, stride, padding, dilation, ceil_mode, indices });
 }
 
+std::tuple<Tensor,optional<int64_t>,Tensor,optional<int64_t>>
+max_pool2d_with_indices_batch_rule(
+    const Tensor& self, optional<int64_t> self_bdim,
+    IntArrayRef kernel_size, IntArrayRef stride,
+    IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) {
+  auto logical_rank = rankWithoutBatchDim(self, self_bdim);
+  TORCH_INTERNAL_ASSERT(logical_rank == 3 || logical_rank == 4);
+  // Logical Tensor[C, H, W]: batch dim to front gives [B, C, H, W], which pools directly
+  if (logical_rank == 3) {
+    auto self_ = moveBatchDimToFront(self, self_bdim);
+    auto result = at::max_pool2d_with_indices(
+        self_, kernel_size, stride, padding, dilation, ceil_mode);
+    return std::make_tuple(std::move(std::get<0>(result)), 0, std::move(std::get<1>(result)), 0);
+  }
+  // Logical Tensor[N, C, H, W]: fold Tensor[B, N, C, H, W] -> Tensor[B * N, C, H, W], pool, split back out
+  auto bdim_size = self.size(*self_bdim);
+  auto self_ = reshape_dim_into(*self_bdim, 0, self);
+  auto result = at::max_pool2d_with_indices(
+      self_, kernel_size, stride, padding, dilation, ceil_mode);
+  return std::make_tuple(
+      reshape_dim_outof(0, bdim_size, std::get<0>(result)), 0,
+      reshape_dim_outof(0, bdim_size, std::get<1>(result)), 0);
+}
 
 TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   EXISTING_BDIM(_adaptive_avg_pool2d);
   EXISTING_BDIM(avg_pool2d);
   m.impl("max_pool2d_with_indices_backward", max_pool2d_with_indices_backward_plumbing);
   EXISTING_BDIM_ALL_BOXED(avg_pool2d_backward);
+  VMAP_SUPPORT("max_pool2d_with_indices", max_pool2d_with_indices_batch_rule);
 }
 
 }}
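For context (not part of the commit): a minimal sketch of what this batch rule enables from the Python side, assuming the functorch-era vmap import. F.max_pool2d(..., return_indices=True) dispatches to max_pool2d_with_indices, so a vmapped call now hits the rule above.

    import torch
    import torch.nn.functional as F
    from functorch import vmap

    # Each example is a logical [C, H, W] = [3, 8, 8] tensor (the rank-3 branch
    # above); vmap supplies the batch dimension.
    x = torch.randn(4, 3, 8, 8)
    pool = lambda t: F.max_pool2d(t, kernel_size=2, return_indices=True)
    out, indices = vmap(pool)(x)
    print(out.shape)      # torch.Size([4, 3, 4, 4])
    print(indices.shape)  # torch.Size([4, 3, 4, 4])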

functorch/csrc/BatchingRegistrations.cpp
Lines changed: 0 additions & 30 deletions

@@ -94,33 +94,6 @@ static bool participatesInCurrentLevel(TensorList self) {
   return false;
 }
 
-std::tuple<Tensor,Tensor> max_pool2d_with_indices_batching_rule(
-    const Tensor & self, IntArrayRef kernel_size, IntArrayRef stride,
-    IntArrayRef padding, IntArrayRef dilation, bool ceil_mode) {
-  if (!participatesInCurrentLevel(self)) {
-    c10::impl::ExcludeDispatchKeyGuard guard(kBatchedKey);
-    return at::max_pool2d_with_indices(
-        self, kernel_size, stride, padding, dilation, ceil_mode);
-  }
-  auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self);
-  TORCH_INTERNAL_ASSERT(self_physical.tensor().dim() == 5);
-
-  auto N = self_physical.tensor().size(0);
-  auto M = self_physical.tensor().size(1);
-  auto physical = self_physical.tensor().flatten(0, 1);
-
-  auto result = max_pool2d_with_indices_batching_rule(physical,
-      kernel_size, stride, padding, dilation, ceil_mode);
-
-  auto first = std::get<0>(result).unflatten(0, {N, M});
-  auto second = std::get<1>(result).unflatten(0, {N, M});
-
-  first = self_physical.getPhysicalToLogicalMap().apply(first);
-  second = self_physical.getPhysicalToLogicalMap().apply(second);
-  return std::make_tuple<Tensor, Tensor>(std::move(first), std::move(second));
-}
-
-
 bool isPhysicalScalarTensor(const Tensor& logical_tensor) {
   if (logical_tensor.dim() > 0) {
     return false;

@@ -906,9 +879,6 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   // m.impl("_add_batch_dim", native::_add_batch_dim);
   // m.impl("_remove_batch_dim", native::_remove_batch_dim);
 
-  m.impl("max_pool2d", at::native::max_pool2d); // composite
-  m.impl("max_pool2d_with_indices", max_pool2d_with_indices_batching_rule);
-
   m.impl("is_complex", native::is_complex);
   //
   // // inplace operations
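The deleted fallback above did the flatten/unflatten bookkeeping by hand (flatten(0, 1) plus unflatten); the new rule delegates it to the reshape_dim_into / reshape_dim_outof helpers. As a rough illustration of what those helpers do, here are hypothetical Python analogues (names mirror the C++; this is not the actual implementation):

    import torch

    # Hypothetical Python analogues of the C++ helpers, for illustration only.
    def reshape_dim_into(src, dst, t):
        # Move dim `src` next to `dst` and fold the two into one dimension.
        t = t.movedim(src, dst)
        shape = list(t.shape)
        shape[dst:dst + 2] = [shape[dst] * shape[dst + 1]]
        return t.reshape(shape)

    def reshape_dim_outof(dim, size, t):
        # Split dim `dim` back out into [size, remainder].
        shape = list(t.shape)
        shape[dim:dim + 1] = [size, shape[dim] // size]
        return t.reshape(shape)

    x = torch.randn(2, 5, 3, 8, 8)        # [B, N, C, H, W]
    flat = reshape_dim_into(0, 0, x)      # [B * N, C, H, W] = [10, 3, 8, 8]
    back = reshape_dim_outof(0, 2, flat)  # back to [2, 5, 3, 8, 8]
    assert back.shape == x.shape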

test/test_ops.py
Lines changed: 0 additions & 2 deletions

@@ -350,7 +350,6 @@ def vjp_of_vjp(*args_and_cotangents):
     xfail('nanmean'),
     xfail('block_diag'),
     xfail('nn.functional.dropout'),
-    xfail('nn.functional.max_pool2d'),
     xfail('nn.functional.nll_loss'),
 }))
 def test_vmapvjp(self, device, dtype, op):

@@ -523,7 +522,6 @@ def test():
     xfail('nanmean'),
     xfail('vstack'),
     xfail('block_diag'),
-    xfail('nn.functional.max_pool2d'),
     xfail('nn.functional.batch_norm'),
     xfail('nn.functional.nll_loss'),
 }))
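With these xfails gone, test_vmapvjp now expects vmap-of-vjp compositions over max_pool2d to work. A sketch of that kind of composition, assuming functorch's vjp and vmap APIs:

    import torch
    import torch.nn.functional as F
    from functorch import vmap, vjp

    x = torch.randn(4, 3, 8, 8)
    cotangents = torch.ones(4, 3, 4, 4)
    pool = lambda t: F.max_pool2d(t, kernel_size=2)

    # Per-example VJP of pooling, batched with vmap in a single call.
    def per_example_grad(xi, ct):
        _, vjp_fn = vjp(pool, xi)
        return vjp_fn(ct)[0]

    grads = vmap(per_example_grad)(x, cotangents)
    print(grads.shape)  # torch.Size([4, 3, 8, 8])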

test/test_vmap.py
Lines changed: 0 additions & 2 deletions

@@ -3007,7 +3007,6 @@ class TestVmapOperatorsOpInfo(TestCase):
     xfail('svd', device_type='cuda'),
     xfail('linalg.svd', device_type='cuda'),
     xfail('index_put'),
-    xfail('nn.functional.max_pool2d'),
     xfail('nn.functional.batch_norm'),
     xfail('nn.functional.nll_loss'),
 })

@@ -3105,7 +3104,6 @@ def test_vmap_exhaustive(self, device, dtype, op):
     xfail('vstack'),
     xfail('block_diag'),
     xfail('nn.functional.dropout'),
-    xfail('nn.functional.max_pool2d'),
     xfail('nn.functional.conv2d', ''),
     xfail('nn.functional.batch_norm'),
 })
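The exhaustive vmap tests compare a vmapped op against a per-example loop; roughly, the property now expected to hold for max_pool2d looks like this (sketch, assuming functorch's vmap):

    import torch
    import torch.nn.functional as F
    from functorch import vmap

    x = torch.randn(5, 3, 8, 8)
    pool = lambda t: F.max_pool2d(t, kernel_size=2)

    batched = vmap(pool)(x)                     # one batched call
    looped = torch.stack([pool(t) for t in x])  # reference per-example loop
    assert torch.allclose(batched, looped)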
