
Commit 3c07478

Samantha Andow authored and zou3519 committed
[functorch] add layer norm support, clean up some binary cross entropy support (pytorch/functorch#807)
* add layer norm support, clean up some binary cross entropy support
* zero returns have the same shape as their input
1 parent b33262d commit 3c07478

File tree

functorch/functorch/_src/decompositions.py
functorch/functorch/_src/eager_transforms.py
functorch/functorch/csrc/DynamicLayer.cpp
functorch/test/test_ops.py

4 files changed: +129, -14 lines

functorch/functorch/_src/decompositions.py

Lines changed: 94 additions & 1 deletion
@@ -1,7 +1,7 @@
 import torch
 from torch import Tensor
 import torch._decomp
-from typing import Tuple
+from typing import Tuple, List, Optional

 aten = torch.ops.aten

@@ -21,6 +21,16 @@ def decorator(f):
     return decorator


+# Functions where we need a special decomposition for jvp but there's another version that
+# should be used more generally (ex. for jvp we need to recompute the mean and variance for
+# the backwards of a normalization function. Without jvp, it should use the saved value)
+decomposition_table_for_jvp = {}
+
+
+def register_decomposition_for_jvp(fn):
+    return register_decomposition(fn, registry=decomposition_table_for_jvp)
+
+
 @maybe_register_decomposition(aten.trace.default)
 def trace(self: Tensor) -> Tensor:
     return torch.sum(torch.diag(self))
@@ -35,3 +45,86 @@ def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]:
     else:
         buffer = z
     return min - torch.log1p(z), buffer
+
+
+@register_decomposition_for_jvp(aten.native_layer_norm_backward)
+def native_layer_norm_backward(
+    grad_out: Tensor,
+    input: Tensor,
+    normalized_shape: List[int],
+    mean: Tensor,
+    rstd: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    output_mask: List[bool],
+) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
+    input_shape = input.shape
+    input_ndim = input.dim()
+
+    axis = input_ndim - len(normalized_shape)
+    inner_dims = input_shape[axis:]
+    outer_dims = input_shape[:axis]
+    inner_dim_indices = list(range(axis, input_ndim))
+    outer_dim_indices = list(range(0, axis))
+
+    N = 1
+    for i in inner_dims:
+        N *= i
+    M = 1
+    for i in outer_dims:
+        M *= i
+    if M <= 0 or N <= 0:
+        return (
+            input.new_zeros(input_shape),
+            input.new_zeros(input_shape[axis:]),
+            input.new_zeros(input_shape[axis:]),
+        )
+
+    # this is exactly the same as the other decomposition except for here. We recompute the mean and variance
+    # so that they track gradients through input
+    mean_ = torch.mean(input, dim=inner_dim_indices, keepdim=True)
+    var = torch.var(input, dim=inner_dim_indices, unbiased=False, keepdim=True)
+    eps = torch.pow(1 / rstd, 2) - var  # this makes me so sad inside
+    eps = eps.detach()
+    rstd_ = 1 / torch.sqrt(var + eps)
+
+    x_hat = (input - mean_) * rstd_
+    if weight is not None:
+        grad_x_hat = grad_out * weight
+    else:
+        grad_x_hat = grad_out
+    a = grad_x_hat * N
+    b = torch.sum(grad_x_hat, inner_dim_indices, True)
+    c1 = torch.mul(grad_x_hat, x_hat)
+    c2 = torch.sum(c1, inner_dim_indices, True)
+    c3 = torch.mul(x_hat, c2)
+    inner = a - b - c3
+
+    if output_mask[0]:
+        d_input: Optional[Tensor] = (rstd_ / N) * inner
+    else:
+        d_input = torch.zeros_like(input)  # should be None but doesn't work with vjp
+
+    if output_mask[1] and weight is not None:
+        if len(outer_dim_indices) > 0:
+            d_weight: Optional[Tensor] = torch.sum(
+                grad_out * x_hat, outer_dim_indices, False
+            )
+        else:
+            d_weight = grad_out * x_hat
+    elif weight is not None:
+        d_weight = torch.zeros_like(weight)  # should be None but doesn't work with vjp
+    else:
+        d_weight = torch.zeros(())  # should be None but doesn't work with vjp
+
+    if output_mask[2] and bias is not None:
+        if len(outer_dim_indices) > 0:
+            d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices, False)
+        else:
+            d_bias = grad_out
+    elif bias is not None:
+        d_bias = torch.zeros_like(bias)  # should be None but doesn't work with vjp
+    else:
+        d_bias = torch.zeros(())  # should be None but doesn't work with vjp
+
+    return (d_input, d_weight, d_bias)
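
For reference, the d_input branch above implements the standard layer-norm input gradient. Writing $\hat{x} = (x - \mu)\cdot\mathrm{rstd}$ for the normalized input, $g$ for grad_x_hat (grad_out times weight), and taking sums over the $N$ normalized elements, the decomposition computes

$$\frac{\partial L}{\partial x} \;=\; \frac{\mathrm{rstd}}{N}\Big(N\,g \;-\; \textstyle\sum g \;-\; \hat{x}\,\textstyle\sum g\,\hat{x}\Big).$$

The only difference from the generic decomposition is that mean and rstd are recomputed from input here, rather than read from the saved forward values, so that forward-mode AD can propagate tangents through them.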

functorch/functorch/_src/eager_transforms.py

Lines changed: 9 additions & 3 deletions
@@ -14,7 +14,7 @@
 import torch.autograd.forward_ad as fwAD

 from .vmap import vmap
-from .decompositions import decomposition_table
+from .decompositions import decomposition_table, decomposition_table_for_jvp


 from functorch._C import (
@@ -1276,8 +1276,13 @@ def wrapped(*args, **kwargs):


 def _register_jit_decomposition(decomp, use_python=False):
-    assert decomp in decomposition_table, f"could not find {decomp}"
-    decomp_fn = decomposition_table[decomp]
+    if decomp in decomposition_table_for_jvp:
+        decomposition_table_used = decomposition_table_for_jvp
+    elif decomp in decomposition_table:
+        decomposition_table_used = decomposition_table
+    else:
+        raise RuntimeError(f"could not find decomposition for {decomp}")
+    decomp_fn = decomposition_table_used[decomp]
     if use_python:
         decomp_fn = torch.jit.ignore(decomp_fn)
     sig = inspect.signature(decomp_fn)
@@ -1310,3 +1315,4 @@ def get_function_def(sig):
 _register_jit_decomposition(torch.ops.aten.log_sigmoid_forward.default)
 _register_jit_decomposition(torch.ops.aten.binary_cross_entropy_backward.default)
 _register_jit_decomposition(torch.ops.aten.binary_cross_entropy.default)
+_register_jit_decomposition(torch.ops.aten.native_layer_norm_backward.default)
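
With native_layer_norm_backward registered in decomposition_table_for_jvp and wired up above, forward-mode AD can now be taken through the reverse-mode backward of layer_norm. A minimal sketch of that composition (illustrative only, not part of the commit; the names pullback, x, cotangent, tangent and the shapes are made up):

import torch
import torch.nn.functional as F
from functorch import jvp, vjp

x = torch.randn(3, 4)
cotangent = torch.randn(3, 4)
tangent = torch.randn(3, 4)

def pullback(x):
    # vjp of layer_norm at x, applied to a fixed cotangent; differentiating
    # this function hits native_layer_norm_backward
    _, vjp_fn = vjp(lambda t: F.layer_norm(t, [4]), x)
    return vjp_fn(cotangent)

# jvp-of-vjp: the case that previously had no forward-mode rule for layer_norm
_, output_tangent = jvp(pullback, (x,), (tangent,))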

functorch/functorch/csrc/DynamicLayer.cpp

Lines changed: 1 addition & 0 deletions
@@ -481,6 +481,7 @@ TORCH_LIBRARY_IMPL(aten, FT_DYNAMIC_LAYER_FRONT_MODE_KEY, m) {
   JVP_DECOMP(log_sigmoid_forward);
   JVP_DECOMP(binary_cross_entropy);
   JVP_DECOMP(binary_cross_entropy_backward);
+  JVP_DECOMP(native_layer_norm_backward);
 }

functorch/test/test_ops.py

Lines changed: 25 additions & 10 deletions
@@ -1148,7 +1148,6 @@ def test_vjpvmap(self, device, dtype, op):
         xfail('nn.functional.hardswish', ''),
         xfail('nn.functional.huber_loss', ''),
         xfail('nn.functional.instance_norm', ''),
-        xfail('nn.functional.layer_norm', ''),
         xfail('nn.functional.logsigmoid', ''),
         xfail('nn.functional.pad', 'circular'),
         xfail('nn.functional.prelu', ''),
@@ -1199,6 +1198,11 @@ def test_jvpvjp(self, device, dtype, op):
             primals_tangents = tree_map(lambda x: torch.randn_like(x), primals)
             cotangents_tangents = tree_map(lambda x: torch.randn_like(x), cotangents)

+            if isinstance(primals[0], torch.Tensor) and primals[0].numel() == 0:
+                # typically the first primal arg is the input. If the input has no elements, we will typically run
+                # into an issue of "Expected Tensor but got None"
+                continue
+
             def push_vjp(primals, cotangents):
                 _, vjp_fn = vjp(fn, *primals)
                 return vjp_fn(cotangents)
@@ -1228,19 +1232,23 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
                 expected = (tree_unflatten(primals_out, spec), tree_unflatten(tangents_out, spec))
                 return expected

-            def compare_jacobians(primals, cotangents, in_dims=(0, 1)):
-                def get_vjp(primals, cotangents):
+            def compare_jacobians(cotangents_and_primals, in_dims, atol_rtol):
+                def get_vjp(cotangents, *primals):
                     _, vjp_fn = vjp(fn, *primals)
                     return vjp_fn(cotangents)

-                jacobian_jvp = jacfwd(get_vjp, in_dims)(primals, cotangents)
-                jacobian_vjp = jacrev(get_vjp, in_dims)(primals, cotangents)
+                jacobian_jvp = jacfwd(get_vjp, in_dims)(*cotangents_and_primals)
+                jacobian_vjp = jacrev(get_vjp, in_dims)(*cotangents_and_primals)

                 # For dtype changing operations, the jacobians have different dtype.
                 jacobian_jvp = tree_map(lambda x: x.to(torch.float), jacobian_jvp)
                 jacobian_vjp = tree_map(lambda x: x.to(torch.float), jacobian_vjp)

-                self.assertEqual(jacobian_jvp, jacobian_vjp)
+                if atol_rtol is not None:
+                    (atol, rtol) = atol_rtol
+                    self.assertEqual(jacobian_jvp, jacobian_vjp, atol=atol, rtol=rtol)
+                else:
+                    self.assertEqual(jacobian_jvp, jacobian_vjp)

             # HACK: obviously pytorch should also have the same coverage
             # For things that do have the same coverage, we test that jvp x vjp
@@ -1255,12 +1263,19 @@ def get_vjp(primals, cotangents):
                 'log_softmax',
                 'nn.functional.cross_entropy',
                 'nn.functional.binary_cross_entropy',
+                'nn.functional.layer_norm'
             }
             if op.name in FUNCTORCH_HAS_FORMULA_BUT_NOT_PYTORCH:
-                in_dims = (0, 1)
-                if op.name == 'nn.functional.binary_cross_entropy':  # reverse second derivative wrt target not defined
-                    in_dims = 1
-                compare_jacobians(primals, cotangents, in_dims)
+                def is_differentiable(t):
+                    return isinstance(t, torch.Tensor) and t.dtype == torch.float32
+                args = (cotangents, *primals)
+                if op.name == 'nn.functional.binary_cross_entropy':
+                    in_dims = (0, 1)  # targets is float32 but isn't differentiable
+                    atol_rtol = 1.5E-4, 1.3e-06
+                else:
+                    in_dims = tuple(i for i in range(len(args)) if is_differentiable(args[i]))
+                    atol_rtol = None
+                compare_jacobians(args, in_dims, atol_rtol)
             else:
                 expected = reference(primals, cotangents, primals_tangents, cotangents_tangents)
                 self.assertEqual(result, expected)
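
The compare_jacobians helper above checks that the forward-mode (jacfwd) and reverse-mode (jacrev) Jacobians of an op's vjp agree. A standalone sketch of that check for layer_norm (hypothetical shapes and names, not code from the test file):

import torch
import torch.nn.functional as F
from functorch import vjp, jacfwd, jacrev

x = torch.randn(2, 3)
cotangents = torch.randn(2, 3)

def get_vjp(cotangents, x):
    _, vjp_fn = vjp(lambda t: F.layer_norm(t, [3]), x)
    return vjp_fn(cotangents)

in_dims = (0, 1)  # differentiate w.r.t. both cotangents and x
jacobian_fwd = jacfwd(get_vjp, in_dims)(cotangents, x)
jacobian_rev = jacrev(get_vjp, in_dims)(cotangents, x)
torch.testing.assert_close(jacobian_fwd, jacobian_rev)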
