# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

+import itertools
+
from torch.testing._internal.common_utils import TestCase, run_tests, is_iterable_of_tensors
import torch
from torch import Tensor
@@ -1102,6 +1104,27 @@ def test_vjpvmap(self, device, dtype, op):

            self.assertEqual(result_vjps, expected_vjps)

+    def _compare_jacobians_of_vjp(self, fn, cotangents_and_primals, argnums=None, atol_rtol=None):
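+        # Cross-check used by several tests below: the Jacobian of the op's VJP
+        # computed with forward-mode AD (jacfwd) should match the one computed
+        # with reverse-mode AD (jacrev), up to the given tolerances.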
+        if argnums is None:
+            argnums = tuple(range(len(cotangents_and_primals)))
+
+        def get_vjp(cotangents, *primals):
+            _, vjp_fn = vjp(fn, *primals)
+            return vjp_fn(cotangents)
+
+        jacobian_jvp = jacfwd(get_vjp, argnums)(*cotangents_and_primals)
+        jacobian_vjp = jacrev(get_vjp, argnums)(*cotangents_and_primals)
+
+        # For dtype-changing operations, the jacobians have different dtype.
+        jacobian_jvp = tree_map(lambda x: x.to(torch.float), jacobian_jvp)
+        jacobian_vjp = tree_map(lambda x: x.to(torch.float), jacobian_vjp)
+
+        if atol_rtol is not None:
+            (atol, rtol) = atol_rtol
+            self.assertEqual(jacobian_jvp, jacobian_vjp, atol=atol, rtol=rtol)
+        else:
+            self.assertEqual(jacobian_jvp, jacobian_vjp)
+
    @ops(functorch_lagging_op_db + additional_op_db, allowed_dtypes=(torch.float,))
    @skipOps('TestOperators', 'test_jvpvjp', vjp_fail.union({
        # These are weirdly non-deterministic
@@ -1223,24 +1246,6 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
                expected = (tree_unflatten(primals_out, spec), tree_unflatten(tangents_out, spec))
                return expected

-            def compare_jacobians(cotangents_and_primals, in_dims, atol_rtol):
-                def get_vjp(cotangents, *primals):
-                    _, vjp_fn = vjp(fn, *primals)
-                    return vjp_fn(cotangents)
-
-                jacobian_jvp = jacfwd(get_vjp, in_dims)(*cotangents_and_primals)
-                jacobian_vjp = jacrev(get_vjp, in_dims)(*cotangents_and_primals)
-
-                # For dtype-changing operations, the jacobians have different dtype.
-                jacobian_jvp = tree_map(lambda x: x.to(torch.float), jacobian_jvp)
-                jacobian_vjp = tree_map(lambda x: x.to(torch.float), jacobian_vjp)
-
-                if atol_rtol is not None:
-                    (atol, rtol) = atol_rtol
-                    self.assertEqual(jacobian_jvp, jacobian_vjp, atol=atol, rtol=rtol)
-                else:
-                    self.assertEqual(jacobian_jvp, jacobian_vjp)
-
            # HACK: obviously pytorch should also have the same coverage
            # For things that do have the same coverage, we test that jvp x vjp
            # are the same between PyTorch and functorch. For things that don't,
@@ -1261,16 +1266,172 @@ def is_differentiable(t):
                    return isinstance(t, torch.Tensor) and t.dtype == torch.float32
                args = (cotangents, *primals)
                if op.name == 'nn.functional.binary_cross_entropy':
-                    in_dims = (0, 1)  # targets is float32 but isn't differentiable
-                    atol_rtol = 1.5E-4, 1.3e-06
+                    argnums = (0, 1)  # targets is float32 but isn't differentiable
+                    atol_rtol = 1.5e-4, 1.3e-06
                else:
-                    in_dims = tuple(i for i in range(len(args)) if is_differentiable(args[i]))
+                    argnums = tuple(i for i in range(len(args)) if is_differentiable(args[i]))
                    atol_rtol = None
-                compare_jacobians(args, in_dims, atol_rtol)
+                self._compare_jacobians_of_vjp(fn, args, argnums, atol_rtol)
            else:
                expected = reference(primals, cotangents, primals_tangents, cotangents_tangents)
                self.assertEqual(result, expected)

+    def _make_extremal_inputs(self, shape, device):
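+        # Inputs filled with large-magnitude values and zeros, used to stress the
+        # numerical stability of the op's backward formulas.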
+        if shape is None:
+            return (None,)
+        return (
+            torch.full(shape, -1000., device=device),
+            torch.zeros(shape, device=device),
+            torch.full(shape, 1000., device=device),
+        )
+
+    def _arg_and_kwarg_options(self, args_options, kwargs_options):
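+        # Cartesian product of the per-argument option tuples with the kwargs options;
+        # each yielded tuple is (*chosen_args, chosen_kwargs).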
+        return itertools.product(*args_options, kwargs_options)
+
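+    # The tests below feed extreme-valued inputs (see _make_extremal_inputs) to ops
+    # with numerically sensitive backward formulas and cross-check jacfwd vs. jacrev
+    # of the VJP via _compare_jacobians_of_vjp.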
+    def test_extremal_numerics_nll_loss(self, device):
+        N, C = 3, 4
+        d1, d2, d3 = 5, 6, 7
+        shapes = (
+            ((N, C), (N,), (C,)),
+            ((N, C), (N,), None),
+            ((N, C, d1, d2, d3), (N, d1, d2, d3), (C,)),
+            ((N, C, d1, d2, d3), (N, d1, d2, d3), None),
+        )
+        kwargs_options = ({'ignore_index': 0, 'reduction': 'mean'}, {'reduction': 'sum'}, {'reduction': 'none'}, {})
+        for input_shape, target_shape, weight_shape in shapes:
+            input_options = self._make_extremal_inputs(input_shape, device)
+            for input, kwargs in self._arg_and_kwarg_options((input_options,), kwargs_options):
+                if weight_shape is None:
+                    weight = None
+                else:
+                    weight = torch.randn(weight_shape, device=device)
+                target = torch.randint(0, C, target_shape, device=device)
+                target[0] = 1  # since we're ignoring index 0, at least one element must be non-zero
+
+                fn = functools.partial(torch.nn.functional.nll_loss, target=target, weight=weight, **kwargs)
+                result = fn(input)
+                cotangents = torch.randn_like(result, device=device)
+                self._compare_jacobians_of_vjp(fn, (cotangents, input))
+
+    def test_extremal_numerics_l1_loss(self, device):
+        N, C, H, W = 3, 4, 5, 6
+        shapes = ((N, C), (N, C, H), (N, C, H, W))
+        kwargs_options = ({'reduction': 'sum'}, {'reduction': 'none'}, {})
+        for shape in shapes:
+            input_options = self._make_extremal_inputs(shape, device)
+            target_options = self._make_extremal_inputs(shape, device)
+            for input, target, kwargs in self._arg_and_kwarg_options((input_options, target_options), kwargs_options):
+                fn = functools.partial(torch.nn.functional.l1_loss, **kwargs)
+                result = fn(input, target)
+                cotangents = torch.randn_like(result, device=device)
+                self._compare_jacobians_of_vjp(fn, (cotangents, input, target))
+
+    def test_extremal_numerics_mse_loss(self, device):
+        N, C, H, W = 3, 4, 5, 6
+        shapes = ((N, C), (N, C, H), (N, C, H, W))
+        kwargs_options = ({'reduction': 'sum'}, {'reduction': 'none'}, {})
+        for shape in shapes:
+            input_options = self._make_extremal_inputs(shape, device)
+            target_options = self._make_extremal_inputs(shape, device)
+            for input, target, kwargs in self._arg_and_kwarg_options((input_options, target_options), kwargs_options):
+                fn = functools.partial(torch.nn.functional.mse_loss, **kwargs)
+                result = fn(input, target)
+                cotangents = torch.randn_like(result, device=device)
+                self._compare_jacobians_of_vjp(fn, (cotangents, input, target))
+
+    def test_extremal_numerics_softmax(self, device):
+        N, C, H, W = 3, 4, 5, 6
+        shapes = ((N, C), (N, C, H), (N, C, H, W))
+        kwargs_options = ({'dim': 1}, {})
+        for shape in shapes:
+            input_options = self._make_extremal_inputs(shape, device)
+            for input, kwargs in self._arg_and_kwarg_options((input_options,), kwargs_options):
+                fn = functools.partial(torch.nn.functional.softmax, **kwargs)
+                result = fn(input)
+                cotangents = torch.randn_like(result, device=device)
+                self._compare_jacobians_of_vjp(fn, (cotangents, input))
+
+    def test_extremal_numerics_log_softmax(self, device):
+        N, C, H, W = 3, 4, 5, 6
+        shapes = ((N, C), (N, C, H), (N, C, H, W))
+        kwargs_options = ({'dim': 1}, {})
+        for shape in shapes:
+            input_options = self._make_extremal_inputs(shape, device)
+            for input, kwargs in self._arg_and_kwarg_options((input_options,), kwargs_options):
+                fn = functools.partial(torch.nn.functional.log_softmax, **kwargs)
+                result = fn(input)
+                cotangents = torch.randn_like(result, device=device)
+                self._compare_jacobians_of_vjp(fn, (cotangents, input))
+
+    def test_extremal_numerics_cross_entropy(self, device):
+        N, C = 3, 4
+        d1, d2, d3 = 5, 6, 7
+        shapes = (
+            ((N, C), (N,), (C,)),
+            ((N, C), (N,), None),
+            ((N, C), (N, C), (C,)),
+            ((N, C), (N, C), None),
+            ((C,), (), (C,)),
+            ((C,), (), None),
+            ((C,), (C,), (C,)),
+            ((C,), (C,), None),
+            ((N, C, d1, d2, d3), (N, d1, d2, d3), (C,)),
+            ((N, C, d1, d2, d3), (N, d1, d2, d3), None),
+            ((N, C, d1, d2, d3), (N, C, d1, d2, d3), (C,)),
+            ((N, C, d1, d2, d3), (N, C, d1, d2, d3), None),
+        )
+        for input_shape, target_shape, weight_shape in shapes:
+            input_options = self._make_extremal_inputs(input_shape, device)
+            kwargs_options = [{'reduction': 'sum'}, {'reduction': 'none'}, {}]
+            if input_shape != target_shape:
+                kwargs_options.append({'ignore_index': 0, 'reduction': 'mean'})
+
+            for input, kwargs in self._arg_and_kwarg_options((input_options,), kwargs_options):
+                if weight_shape is None:
+                    weight = None
+                else:
+                    weight = torch.randn(weight_shape, device=device)
+
+                if input_shape == target_shape:
+                    target = torch.rand(target_shape, device=device)
+                elif len(target_shape) == 0:
+                    target = torch.tensor(1, device=device)  # must be non-zero since ignore_index may be 0
+                else:
+                    target = torch.randint(0, C, target_shape, device=device)
+
+                fn = functools.partial(torch.nn.functional.cross_entropy, target=target, weight=weight, **kwargs)
+                result = fn(input)
+                cotangents = torch.randn_like(result, device=device)
+                self._compare_jacobians_of_vjp(fn, (cotangents, input), atol_rtol=(1e-4, 1e-5))
+
+    def test_extremal_numerics_binary_cross_entropy(self, device):
+        N, C, H, W = 3, 4, 5, 6
+        shapes = ((N, C), (N, C, H), (N, C, H, W))
+        for shape in shapes:
+            weight_options = self._make_extremal_inputs(shape, device)
+            kwargs_options = [{'reduction': 'sum'}, {'reduction': 'none'}, {}]
+
+            for weight, kwargs in self._arg_and_kwarg_options((weight_options,), kwargs_options):
+                input = torch.rand(shape, device=device)
+                target = torch.rand(shape, device=device)
+                fn = functools.partial(torch.nn.functional.binary_cross_entropy, target=target, weight=weight, **kwargs)
+                result = fn(input)
+                cotangents = torch.randn_like(result, device=device)
+                self._compare_jacobians_of_vjp(fn, (cotangents, input), atol_rtol=(1e-4, 2e-5))
+
+    def test_extremal_numerics_layer_norm(self, device):
+        N, C, H, W = 3, 4, 5, 6
+        shapes = ((N, C), (N, C, H), (N, C, H, W))
+        for shape in shapes:
+            input_options = self._make_extremal_inputs(shape, device)
+            normalized_shape = shape[1:]
+            weight_options = self._make_extremal_inputs(normalized_shape, device)
+            bias_options = self._make_extremal_inputs(normalized_shape, device)
+
+            for input, bias, weight in itertools.product(input_options, bias_options, weight_options):
+                def fn(input, weight, bias):
+                    return torch.nn.functional.layer_norm(input, normalized_shape, weight=weight, bias=bias)
+                result = fn(input, weight, bias)
+                cotangents = torch.randn_like(result, device=device)
+                self._compare_jacobians_of_vjp(fn, (cotangents, input, weight, bias))
+
    @ops(filter(lambda op: op.name == "nn.functional.group_norm", functorch_lagging_op_db + additional_op_db),
         allowed_dtypes=(torch.float32, torch.double))  # TODO: generalize
    def test_group_norm_backward(self, device, dtype, op):