@@ -171,6 +171,27 @@ def wrapped(*args):
     return wrapped, tangents
 
+def get_jvp_variant_primals_tangents(f, sample):
+    # We want this higher-order variant of jvp, so that it can
+    # be used to wrap vmap
+    fn, primals = normalize_op_input_output(f, sample, requires_grad=False)
+    tangents = _as_tuple(
+        tree_map(lambda x: torch.randn_like(x), primals))
+
+    @functools.wraps(f)
+    def wrapped(*args):
+        primals_in = args[:len(primals)]
+        tangents_in = args[len(primals):]
+        primals_out, tangents_out = jvp(fn, primals_in, tangents_in)
+
+        if isinstance(primals_out, torch.Tensor):
+            return (primals_out, tangents_out)
+        else:
+            flat_primals_out, _ = tree_flatten(primals_out)
+            flat_tangents_out, _ = tree_flatten(tangents_out)
+            return tuple(flat_primals_out + flat_tangents_out)
+
+    return wrapped, primals + tangents
 
 def is_inplace(op, variant):
     if hasattr(variant, "__wrapped__"):
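
Note on the calling convention above: get_jvp_variant_primals_tangents returns a wrapped function whose positional arguments are all of the primals followed by all of the tangents, so that vmap can batch over every input at once. The sketch below illustrates that pattern using only the public functorch jvp and vmap APIs; the function f, the tensor shapes, and the batch size are illustrative placeholders, not part of this diff.

    # A minimal sketch, assuming only the public functorch API.
    import torch
    from functorch import jvp, vmap

    def f(x, y):
        # Placeholder elementwise op standing in for an OpInfo sample.
        return x * y + y.sin()

    def jvp_of_f(x, y, tx, ty):
        # Primals first, tangents second -- the same flat signature the
        # wrapped helper exposes so vmap can treat all inputs uniformly.
        primal_out, tangent_out = jvp(f, (x, y), (tx, ty))
        return primal_out, tangent_out

    B = 3  # hypothetical batch size
    x, y = torch.randn(B, 5), torch.randn(B, 5)
    tx, ty = torch.randn(B, 5), torch.randn(B, 5)

    # Batch the forward-mode computation over dim 0 of every argument.
    primal_out, tangent_out = vmap(jvp_of_f)(x, y, tx, ty)

This vmap-over-jvp composition is what the new test_vmapjvpall in the next hunk exercises exhaustively across the OpInfo database.
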
@@ -596,6 +617,84 @@ def test_vmapjvp(self, device, dtype, op):
             for loop_out, batched_out in get_fallback_and_vmap_exhaustive(fn, args, {}, bdims=(0,)):
                 self.assertEqual(loop_out, batched_out, atol=1e-4, rtol=1e-4)
 
+    @ops(functorch_lagging_op_db, allowed_dtypes=(torch.float,))
+    @skipOps('TestOperators', 'test_vmapjvpall', {
+        skip('nn.functional.dropout'),  # randomness
+        skip('nn.functional.rrelu'),  # randomness
+
+        # Causing a CUDA assert, needs investigation
+        skip('div', 'floor_rounding', device_type='cuda'),
+        skip('div', 'no_rounding_mode', device_type='cuda'),
+        skip('div', 'trunc_rounding', device_type='cuda'),
+        skip('true_divide', device_type='cuda'),
+
+        # xfail list
+        xfail('linalg.inv'),
+        xfail('masked_fill'),
+        xfail('__rpow__'),
+        xfail('logit'),
+        xfail('linalg.tensorinv'),
+        xfail('nn.functional.pad', 'circular'),
+        xfail('linalg.matrix_power'),
+        xfail('cumprod'),
+        xfail('maximum'),
+        xfail('corrcoef'),
+        xfail('linalg.householder_product'),
+        xfail('tensor_split'),
+        xfail('nn.functional.gelu'),
+        xfail('quantile'),
+        xfail('var_mean'),
+        xfail('index_add'),
+        xfail('as_strided'),
+        xfail('linalg.eigvalsh'),
+        xfail('clamp', 'scalar'),
+        xfail('pow'),
+        xfail('fill_'),
+        xfail('linalg.cholesky'),
+        xfail('max', 'binary'),
+        xfail('nn.functional.gaussian_nll_loss'),
+        xfail('min', 'binary'),
+        xfail('index_fill'),
+        xfail('index_put'),
+        xfail('std_mean'),
+        xfail('double', 'channels_last'),
+        xfail('block_diag'),
+        xfail('float_power'),
+        xfail('diag_embed'),
+        xfail('fmin'),
+        xfail('minimum'),
+        xfail('scatter'),
+        xfail('fmax'),
+        xfail('matrix_exp'),
+        xfail('nanquantile'),
+        xfail('lu'),
+        xfail('nn.functional.linear'),
+        xfail('index_copy'),
+        xfail('masked_scatter'),
+        xfail('view_as_complex'),
+    })
+    # This is technically a superset of test_vmapjvp. We should either delete test_vmapjvp
+    # or figure out if we can split vmapjvpall. It's useful to keep test_vmapjvp intact
+    # because that corresponds to "batched forward-mode AD" testing in PyTorch core
+    def test_vmapjvpall(self, device, dtype, op):
+        if is_inplace(op, op.get_op()):
+            # TODO: test in-place
+            self.skipTest("Skipped! NYI: inplace-testing not supported.")
+            return
+
+        samples = op.sample_inputs(device, dtype, requires_grad=False)
+
+        if not op.supports_forward_ad:
+            self.skipTest("Skipped! Forward AD not supported.")
+            return
+
+        for sample in samples:
+            arg_values = [sample.input] + list(sample.args)
+            kwarg_values = sample.kwargs
+            args = tuple([*arg_values, *kwarg_values])
+            fn, args = get_jvp_variant_primals_tangents(op, sample)
+            for loop_out, batched_out in get_fallback_and_vmap_exhaustive(fn, args, {}):
+                self.assertEqual(loop_out, batched_out, atol=1e-4, rtol=1e-4)
 
     @ops(functorch_lagging_op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @skipOps('TestOperators', 'test_vmapvjp_has_batch_rule', vmapvjp_fail.union({
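
For readers unfamiliar with the harness: get_fallback_and_vmap_exhaustive is an internal functorch test helper and its exact behaviour is not shown in this diff. Conceptually, the loop in test_vmapjvpall compares running the op one batch element at a time against a single vmap call, and asserts the results match. The simplified stand-in below, which assumes every argument is batched along dim 0, only illustrates that comparison; it is not the real helper.

    # A minimal, hypothetical stand-in for the loop-vs-vmap comparison;
    # the real get_fallback_and_vmap_exhaustive tries many in_dims layouts.
    import torch
    from functorch import vmap

    def loop_vs_vmap_check(fn, batched_args, atol=1e-4, rtol=1e-4):
        batch_size = batched_args[0].shape[0]
        # Reference path: run fn one sample at a time and stack the outputs.
        loop_outs = [fn(*[arg[i] for arg in batched_args]) for i in range(batch_size)]
        loop_out = tuple(torch.stack(cols) for cols in zip(*loop_outs))
        # Candidate path: a single vmap call over the same function.
        batched_out = vmap(fn)(*batched_args)
        for expected, actual in zip(loop_out, batched_out):
            torch.testing.assert_close(actual, expected, atol=atol, rtol=rtol)

Here fn and its flat argument tuple would come from get_jvp_variant_primals_tangents(op, sample), matching the assertEqual check in the test body above.
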
@@ -839,6 +938,7 @@ class TestDecompositionOpInfo(TestCase):
         skip('tensor_split'),
         skip('mvlgamma'),
         skip('tanh', device_type='cuda'),  # cuda bfloat16 failure
+        skip('nn.functional.tanhshrink', device_type='cuda'),  # cuda bfloat16 failure
         skip('eig'),
         skip('nn.functional.dropout'),
         skip('_masked.softmin'),