
Commit ed9673d

Embedding batch rule (#351)
1 parent 863602a · commit ed9673d

File tree

functorch/csrc/BatchRulesModules.cpp
test/common_utils.py
test/functorch_additional_op_db.py
test/test_vmap.py

4 files changed: +100 -1 lines changed


functorch/csrc/BatchRulesModules.cpp

Lines changed: 37 additions & 0 deletions
@@ -217,6 +217,41 @@ std::tuple<Tensor,Tensor> cudnn_convolution_backward_plumbing(const Tensor & sel
   return slow_fallback<Tensor,Tensor>(op, { self, grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, output_mask });
 }
 
+std::tuple<Tensor,optional<int64_t>> embedding_batch_rule(
+    const Tensor& weight, optional<int64_t> weight_bdim,
+    const Tensor& indices, optional<int64_t> indices_bdim,
+    int64_t padding_idx, bool scale_grad_by_freq, bool sparse) {
+  if (!weight_bdim && indices_bdim) {
+    // B*, ED -> B*D
+    const auto result = at::embedding(weight, indices, padding_idx, scale_grad_by_freq, sparse);
+    return std::make_tuple(result, indices_bdim);
+  } else if (weight_bdim && !indices_bdim) {
+    // *, BED -> *, E(BD) -> *(BD) -> *BD
+    const auto batch_size = weight.size(*weight_bdim);
+    const auto weight_ = reshape_dim_into(*weight_bdim, /*embedding_dim*/1, weight);
+    auto result = at::embedding(weight_, indices, padding_idx, scale_grad_by_freq, sparse);
+    result = reshape_dim_outof(-1, batch_size, result);
+    return std::make_tuple(result, result.dim() - 2);
+  }
+  TORCH_INTERNAL_ASSERT(weight_bdim && indices_bdim);
+  // B*, BED -> B*, (BE)D -> B*D
+  // We'll need to do something extra: add (0, E, 2*E, ...) to the indices.
+  const auto batch_size = weight.size(*weight_bdim);
+  const auto num_embeddings = weight.size((*weight_bdim == 0) ? 1 : 0);
+  const auto weight_ = reshape_dim_into(*weight_bdim, 0, weight);
+  auto indices_ = moveBatchDimToFront(indices, indices_bdim);
+
+  // [batch_size, 1, 1, 1, ..., 1]
+  DimVector view_shape(indices_.dim(), 1);
+  view_shape[0] = batch_size;
+
+  auto range = at::arange(0, batch_size * num_embeddings, num_embeddings, indices_.options());
+  range = range.view(view_shape);
+
+  indices_ = indices_ + range;
+  const auto result = at::embedding(weight_, indices_, padding_idx, scale_grad_by_freq, sparse);
+  return std::make_tuple(result, 0);
+}
 
 /**
  * grid sample batch rule breaks down into 3 cases:
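
The trickiest case above is when both the weight and the indices carry a batch dimension: the batch of embedding tables is flattened into a single (B*E, D) table and each example's indices are shifted by b * E so they address the right slice. A minimal Python sketch of that equivalence (B, E, D and the index shape are illustrative, not taken from this commit):

import torch
import torch.nn.functional as F

B, E, D = 4, 10, 3                         # batch size, num_embeddings, embedding_dim
weight = torch.randn(B, E, D)              # a batch of embedding tables
indices = torch.randint(0, E, (B, 7))      # a batch of index tensors

# Reference: loop over the batch dimension.
expected = torch.stack([F.embedding(indices[b], weight[b]) for b in range(B)])

# Batch-rule trick: flatten the tables to (B*E, D) and offset each example's
# indices by b * E, mirroring the `range` tensor built in embedding_batch_rule.
flat_weight = weight.reshape(B * E, D)
offsets = torch.arange(0, B * E, E).view(B, 1)   # [0, E, 2*E, ...]
result = F.embedding(indices + offsets, flat_weight)

assert torch.equal(result, expected)
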
@@ -535,6 +570,8 @@ TORCH_LIBRARY_IMPL(aten, FT_BATCHED_KEY, m) {
   EXISTING_BDIM(im2col);
   EXISTING_BDIM(im2col_backward);
 
+  VMAP_SUPPORT("embedding", embedding_batch_rule);
+
   VMAP_SUPPORT("grid_sampler_2d", GRID_SAMPLE_BATCH_RULE(grid_sampler));
   VMAP_SUPPORT("grid_sampler_2d_backward", GRID_SAMPLE_BW_BATCH_RULE(grid_sampler_2d_backward));
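
With the rule registered via VMAP_SUPPORT, vmap over embedding with a batched weight should now hit embedding_batch_rule rather than the slow fallback. A small usage sketch (shapes are illustrative; uses the functorch-era `from functorch import vmap` import):

import torch
import torch.nn.functional as F
from functorch import vmap

B, E, D = 4, 10, 3
weight = torch.randn(B, E, D)          # per-example embedding tables
indices = torch.randint(0, E, (B, 7))  # per-example indices

# Both arguments are batched over dim 0 (vmap's default in_dims).
out = vmap(F.embedding)(indices, weight)
print(out.shape)                       # torch.Size([4, 7, 3])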

test/common_utils.py

Lines changed: 1 addition & 0 deletions
@@ -90,6 +90,7 @@ def get_fallback_and_vmap_exhaustive(op, arg_values, kwarg_values, compute_loop_
         # def f(a):
         #     return op(a)
         # t = make_fx(vmap(f, in_dims=in_dims, out_dims=out_dim))(*batched_args, **kwarg_values)
+        # print(in_dims, [arg.shape for arg in batched_args], kwarg_values)
         batched_out = vmap(op, in_dims=in_dims, out_dims=out_dim)(*batched_args, **kwarg_values)
         yield (loop_out, batched_out)
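
For context, the helper this debug comment was added to compares a per-example loop against the vmapped call for every sampled input. A simplified sketch of that loop-vs-vmap check (not the helper's actual implementation):

import torch
import torch.nn.functional as F
from functorch import vmap

weight = torch.randn(20, 5)
idx = torch.randint(0, 20, (4, 7))

# Per-example loop vs. one vmapped call; the exhaustive test asserts these match.
loop_out = torch.stack([F.embedding(i, weight) for i in idx])
batched_out = vmap(F.embedding, in_dims=(0, None))(idx, weight)
assert torch.allclose(loop_out, batched_out)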

test/functorch_additional_op_db.py

Lines changed: 61 additions & 0 deletions
@@ -224,3 +224,64 @@ def sample_inputs_cross_entropy(self, device, dtype, requires_grad, reduction):
         dtypes=floating_types(),
         dtypesIfCUDA=floating_types_and(torch.half, torch.bfloat16),
         supports_out=True))
+
+# TODO: split embedding in pytorch core
+def sample_inputs_embedding(op_info, device, dtype, requires_grad, **kwargs):
+    def make_input(shape):
+        return make_tensor(shape, device=device, dtype=dtype, requires_grad=requires_grad)
+
+    def make_long_input(shape, *, low, high):
+        return make_tensor(shape, device=device, dtype=torch.long, low=low, high=high)
+
+    M = 20
+    S = 5
+
+    def generator():
+        # 0-D index tensor
+        idx = make_long_input((), low=0, high=M)
+        yield SampleInput(make_input((M, S)), args=(idx,),)
+
+        # 1-D index tensor
+        idx = make_long_input((S,), low=0, high=M)
+        yield SampleInput(make_input((M, S)), args=(idx,),)
+
+        # 2-D index tensor
+        idx = make_long_input((S, S), low=0, high=M)
+        yield SampleInput(make_input((M, S)), args=(idx,),)
+
+        if not requires_grad:
+            # Following inputs return different gradient from the numerical gradient.
+            # This is expected and relevant tests are present in `test_nn.py`.
+
+            # The gradient vector at `padding_idx` is not updated.
+            idx = make_long_input((2, 2), low=0, high=S)
+            idx[0, 0] = 2
+            idx[1, 1] = 2
+            yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': 2},)
+
+            idx = make_long_input((2, 2), low=0, high=S)
+            idx[0, 0] = 4
+            idx[1, 1] = 4
+            yield SampleInput(make_input((S, S)), args=(idx,), kwargs={'padding_idx': -1},)
+
+            # Scale the gradient based on the inverse frequency of a particular index.
+            idx = make_long_input((2, 2), low=0, high=S)
+            idx[0, 0] = 1
+            idx[0, 1] = 1
+            weights = make_input((S, S))
+            yield SampleInput(weights, args=(idx,), kwargs={'scale_grad_by_freq': True},)
+
+    return list(generator())
+
+additional_op_db.append(
+    OpInfo(
+        "nn.functional.embedding",
+        variant_test_name="functorch",
+        # We use lambda to reshuffle the positional arguments.
+        # This is because currently only the `input` field of SampleInput
+        # is tested in gradient tests.
+        op=lambda weight, idx, **kwargs: torch.nn.functional.embedding(idx, weight, **kwargs),
+        dtypes=floating_types_and(torch.bfloat16, torch.float16),
+        sample_inputs_func=sample_inputs_embedding,
+        supports_out=False,
+    ))
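
The lambda in the OpInfo matters because gradient tests differentiate only with respect to the SampleInput's `input` field, so the differentiable weight has to come first. A hypothetical standalone illustration of that argument order (not part of the test suite):

import torch
import torch.nn.functional as F

weight = torch.randn(20, 5, requires_grad=True)
idx = torch.randint(0, 20, (3,))

out = F.embedding(idx, weight)   # the functional's own order is (input, weight)
out.sum().backward()             # gradients flow into `weight`, not `idx`
print(weight.grad.shape)         # torch.Size([20, 5])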

test/test_vmap.py

Lines changed: 1 addition & 1 deletion
@@ -3109,7 +3109,7 @@ class TestVmapOperatorsOpInfo(TestCase):
         xfail('nn.functional.batch_norm'),
         xfail('lu_unpack'),
         xfail('histogramdd'),
-        xfail('nn.functional.embedding'),
+        xfail('nn.functional.embedding', ''),
         xfail('randn_like'),
         xfail('allclose'),
         xfail('bfloat16', 'channels_last'),
