
Commit 5dde9b7

Embedding backward batch rule (#355)
Test Plan:
- run tests
Parent: f7a3576

File tree

5 files changed: +61 -30 lines

functorch/csrc/BatchRulesDecompositions.cpp

Lines changed: 1 addition & 0 deletions
@@ -236,6 +236,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   OP_DECOMPOSE(_convolution_mode);
   OP_DECOMPOSE(frobenius_norm);
   OP_DECOMPOSE(type_as);
+  OP_DECOMPOSE(embedding_backward);
   DECOMPOSE_FUNCTIONAL(diag_embed);
   DECOMPOSE_FUNCTIONAL(block_diag);
 }
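
Registering OP_DECOMPOSE(embedding_backward) means vmap never sees embedding_backward directly: the existing composite decomposition runs instead, and only the kernels it forwards to need batch rules. A minimal Python sketch of that dispatch, assuming the usual ATen behavior of picking the sparse or dense backward kernel based on the sparse flag:

import torch

# Hedged sketch, not the ATen source: embedding_backward is assumed to simply
# forward to the sparse or dense backward kernel, which is why decomposing it
# under vmap is sufficient.
def embedding_backward_decomp(grad, indices, num_weights, padding_idx,
                              scale_grad_by_freq, sparse):
    if sparse:
        return torch.ops.aten.embedding_sparse_backward(
            grad, indices, num_weights, padding_idx, scale_grad_by_freq)
    return torch.ops.aten.embedding_dense_backward(
        grad, indices, num_weights, padding_idx, scale_grad_by_freq)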

functorch/csrc/BatchRulesModules.cpp

Lines changed: 41 additions & 7 deletions
@@ -10,6 +10,14 @@
 
 namespace at { namespace functorch {
 
+static Tensor getStepTensor(const Tensor& indices, int64_t bdim_size, int64_t num_embeddings) {
+  // [batch_size, 1, 1, 1, ..., 1]
+  DimVector view_shape(indices.dim(), 1);
+  view_shape[0] = bdim_size;
+  auto range = at::arange(0, bdim_size * num_embeddings, num_embeddings, indices.options());
+  return range.view(view_shape);
+}
+
 std::tuple<Tensor,optional<int64_t>> embedding_batch_rule(
     const Tensor& weight, optional<int64_t> weight_bdim,
     const Tensor& indices, optional<int64_t> indices_bdim,
@@ -34,18 +42,43 @@ std::tuple<Tensor,optional<int64_t>> embedding_batch_rule(
   const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight);
   auto indices_ = moveBatchDimToFront(indices, indices_bdim);
 
-  // [batch_size, 1, 1, 1, ..., 1]
-  DimVector view_shape(indices_.dim(), 1);
-  view_shape[0] = batch_size;
-
-  auto range = at::arange(0, batch_size * num_embeddings, num_embeddings, indices_.options());
-  range = range.view(view_shape);
-
+  const auto range = getStepTensor(indices, batch_size, num_embeddings);
   indices_ = indices_ + range;
   const auto result = at::embedding(weight_, indices_, padding_idx, scale_grad_by_freq, sparse);
   return std::make_tuple(result, 0);
 }
 
+std::tuple<Tensor,optional<int64_t>>
+embedding_dense_backward_batch_rule(
+    const Tensor& grad_, optional<int64_t> grad_bdim,
+    const Tensor& indices_, optional<int64_t> indices_bdim,
+    int64_t num_weights, int64_t padding_idx, bool scale_grad_by_freq) {
+  Tensor grad = grad_;
+  Tensor indices = indices_;
+  if (!indices_bdim && grad_bdim) {
+    const auto bdim_size = grad.size(*grad_bdim);
+    grad = reshape_dim_into(*grad_bdim, -1, grad);
+    auto result = at::embedding_dense_backward(
+        grad, indices, num_weights, padding_idx, scale_grad_by_freq);
+    result = reshape_dim_outof(1, bdim_size, result);
+    return std::make_tuple(result, 1);
+  }
+  const auto bdim_size = indices.size(*indices_bdim);
+  indices = moveBatchDimToFront(indices, indices_bdim);
+  grad = moveBatchDimToFront(grad, grad_bdim);
+  grad = ensure_has_bdim(grad, grad_bdim.has_value(), bdim_size);
+  const auto range = getStepTensor(indices, bdim_size, num_weights);
+  auto result = at::embedding_dense_backward(
+      grad, indices + range, num_weights * bdim_size, -1, scale_grad_by_freq);
+  result = reshape_dim_outof(0, bdim_size, result);
+  // Fill in the padding. We can't do it in the embedding_dense_backward call
+  // because we need to fill in multiple rows!
+  if (padding_idx >= 0) {
+    result.select(1, padding_idx).fill_(0);
+  }
+  return std::make_tuple(result, 0);
+}
+
 /**
  * grid sample batch rule breaks down into 3 cases:
  * case 1 (input is batched, grid is not):
@@ -358,6 +391,7 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   EXISTING_BDIM(im2col_backward);
 
   VMAP_SUPPORT("embedding", embedding_batch_rule);
+  VMAP_SUPPORT("embedding_dense_backward", embedding_dense_backward_batch_rule);
 
   VMAP_SUPPORT("grid_sampler_2d", GRID_SAMPLE_BATCH_RULE(grid_sampler));
   VMAP_SUPPORT("grid_sampler_2d_backward", GRID_SAMPLE_BW_BATCH_RULE(grid_sampler_2d_backward));

test/discover_coverage.py

Lines changed: 1 addition & 0 deletions
@@ -407,6 +407,7 @@ def print_coverage_info(th=100, nn=25):
        'torch.prod', # dynamic (backward)
        'torch.norm', # norm with nuc is not commonly used; we support the other cases.
        'torch.svd', # There isn't a bug, it is just nondeterministic so we can't test it.
+        'torch.nn.functional.embedding', # We support everything except the sparse option.
    }
    remove_from_set(statuses['test_vmap_exhaustive'], vmap_exemptions)
    remove_from_set(statuses['test_vmapvjp'], vmap_exemptions)

test/functorch_additional_op_db.py

Lines changed: 18 additions & 22 deletions
@@ -201,7 +201,8 @@ def sample_inputs_cross_entropy(self, device, dtype, requires_grad, reduction):
                  supports_out=True))


-# TODO: split embedding in pytorch core
+# TODO: PyTorch core has a check for if requires_grad=True or not.
+# We actually want to test more things for backward here which is why we have our own
 def sample_inputs_embedding(op_info, device, dtype, requires_grad, **kwargs):
     def make_input(shape):
         return make_tensor(shape, device=device, dtype=dtype, requires_grad=requires_grad)
@@ -225,27 +226,22 @@ def generator():
         idx = make_long_input((S, S), low=0, high=M)
         yield SampleInput(make_input((M, S)), args=(idx,),)

-        if not requires_grad:
-            # Following inputs return different gradient from the numerical gradient.
-            # This is expected and relevant tests are present in `test_nn.py`.
-
-            # The gradient vector at `padding_idx` is not updated.
-            idx = make_long_input((2, 2), low=0, high=S)
-            idx[0, 0] = 2
-            idx[1, 1] = 2
-            yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': 2},)
-
-            idx = make_long_input((2, 2), low=0, high=S)
-            idx[0, 0] = 4
-            idx[1, 1] = 4
-            yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': -1},)
-
-            # Scale the gradient based on the inverse frequency of a particular index.
-            idx = make_long_input((2, 2), low=0, high=S)
-            idx[0, 0] = 1
-            idx[0, 1] = 1
-            weights = make_input((S, S))
-            yield SampleInput(weights, args=(idx,), kwargs={'scale_grad_by_freq': True},)
+        idx = make_long_input((2, 2), low=0, high=S)
+        idx[0, 0] = 2
+        idx[1, 1] = 2
+        yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': 2},)
+
+        idx = make_long_input((2, 2), low=0, high=S)
+        idx[0, 0] = 4
+        idx[1, 1] = 4
+        yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': -1},)
+
+        # Scale the gradient based on the inverse frequency of a particular index.
+        idx = make_long_input((2, 2), low=0, high=S)
+        idx[0, 0] = 1
+        idx[0, 1] = 1
+        weights = make_input((S, S))
+        yield SampleInput(weights, args=(idx,), kwargs={'scale_grad_by_freq': True},)

     return list(generator())

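
The padding_idx and scale_grad_by_freq samples are now generated regardless of requires_grad, so the vmap tests also hit the backward path added in this commit. A hedged sketch of the kind of case they cover, taking per-example gradients of F.embedding with a padding index through functorch; shapes and names below are illustrative, not taken from the test suite:

import torch
from functorch import vmap, grad

weight = torch.randn(5, 3)
idx = torch.randint(0, 5, (4, 2, 2))  # a batch of 4 index tensors
idx[:, 0, 0] = 2                      # make sure the padding index appears

def emb_sum(w, i):
    # F.embedding takes (indices, weight); with padding_idx=2, row 2 of the
    # weight gradient must stay zero.
    return torch.nn.functional.embedding(i, w, padding_idx=2).sum()

# Per-example gradients w.r.t. the shared weight: shape [4, 5, 3].
per_example_grads = vmap(grad(emb_sum), in_dims=(None, 0))(weight, idx)
assert torch.all(per_example_grads[:, 2] == 0)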

test/test_ops.py

Lines changed: 0 additions & 1 deletion
@@ -818,7 +818,6 @@ def test_vmapjvpall(self, device, dtype, op):
        xfail('fft.ihfft2'),
        xfail('fft.ihfftn'),
        xfail('fft.rfft2'),
-        xfail('nn.functional.embedding'),
        xfail('cross'),
        xfail('double', 'channels_last'),
        xfail('linalg.cross'),
