
Commit 3fa6ea9

Added support for output pytrees in AOTAutograd (#332)

* Added autominifier
* Added support for output pytrees
* Added some tests

1 parent: 7c4453d
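In user-facing terms, this change lets a function compiled through AOTAutograd return an arbitrary pytree (nested tuples, lists, and dicts of tensors) instead of only a flat tuple; the flat outputs of the compiled graph are reassembled into the original structure on the way out. Below is a minimal sketch of the behavior the new tests exercise. The `nop_compile` helper is an assumption standing in for the tests' `_nop_compile`; it simply returns the traced `fx.GraphModule` unchanged.

```python
import torch
from functorch.compile import compiled_function

def nop_compile(fx_module, example_inputs):
    # Hypothetical stand-in for the tests' _nop_compile: return the traced module as-is.
    return fx_module

def f(x, y):
    # The output is a dict, i.e. a pytree rather than a flat tuple of tensors.
    return {'a': x, 'b': x + y}

compiled_f = compiled_function(f, nop_compile, nop_compile)

x = torch.randn(3, requires_grad=True)
y = torch.randn(3)
out = compiled_f(x, y)       # out is a dict again; PytreeThunk.unflatten restores the structure
out['b'].sum().backward()    # gradients flow through the compiled autograd.Function
print(sorted(out.keys()), x.grad.shape)
```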

File tree (3 files changed: +87 -35 lines)

* functorch/_src/aot_autograd.py
* functorch/_src/fx_minifier.py
* test/test_pythonkey.py

functorch/_src/aot_autograd.py
38 additions & 13 deletions

@@ -212,7 +212,7 @@ def joint_forward_backward(primals, tangents):
         out = fn(*primals)
         primals = [p for p in pytree.tree_flatten(primals)[0] if p.requires_grad]
         backward_out = []
-        if primals:
+        if primals:  # todo(chilli): Make it support it if not all outputs have gradients
             backward_out = torch.autograd.grad(out, primals, grad_outputs=tangents, allow_unused=True)
         return out, backward_out
     return joint_forward_backward

@@ -275,8 +275,6 @@ def forward(ctx, *flat_args):
                 compiled_bw = bw_compiler(bw_module, bw_args)
             fw_outs = normalize_as_list(compiled_fw(*flat_args))
             ctx.save_for_backward(*fw_outs[num_outs:])
-            if num_outs == 1:
-                return fw_outs[0]
             return tuple(fw_outs[0:num_outs])

         @staticmethod

@@ -303,46 +301,73 @@ class _CompileCache(CompileCache):
 HAS_TREE = False
 compile_cache = None

+# Inspired by autodidax (thanks!)
+class PytreeThunk:
+    spec = None
+    # These are some kinda dumb microoptimizations that save about 3-4 us of overhead.
+    is_simple = None  # if the output spec is a tuple/list, we won't bother unflattening it.
+    is_really_simple = None  # if the output spec is a LeafSpec
+
+    def set(self, spec):
+        assert self.spec is None or self.spec == spec
+        self.spec = spec
+        if type(self.spec) in [tuple, list] and all([isinstance(i, pytree.LeafSpec) for i in spec.children_specs]):
+            self.is_simple = True
+        if isinstance(self.spec, pytree.LeafSpec):
+            self.is_really_simple = True
+
+    def unflatten(self, x):
+        if self.is_really_simple:
+            return x[0]
+        if self.is_simple:
+            return x
+        return pytree.tree_unflatten(x, self.spec)

 def compiled_function(
     fn, fw_compiler, bw_compiler, partition_fn=default_partition, decompose=False, hasher_type="StaticShapeHasher"
 ):
     global compile_cache
     if compile_cache is None:
         compile_cache = CompileCache()
-    cached_fn = None
+    cached_res = None

     fn_id = id(fn)

     def returned_function(*args, **kwargs):
         global compile_cache
-        nonlocal cached_fn
+        nonlocal cached_res
         if HAS_TREE:
             flattened_args = tree.flatten((args, kwargs))
         else:
             flattened_args, _ = pytree.tree_flatten((args, kwargs))
         num_args = len(flattened_args)
         # Check if the fn is already compiled
-        cached_fn = compile_cache.at(fn_id, num_args, hasher_type, *flattened_args)
+        cached_res = compile_cache.at(fn_id, num_args, hasher_type, *flattened_args)

         # Compile the function and save it in the cache
-        if cached_fn is None:
+        if cached_res is None:
             # Compile a new function
             flattened_args, args_spec = pytree.tree_flatten((args, kwargs))
+            out_spec = PytreeThunk()
             def flat_fn(*args):
+                nonlocal out_spec
                 args, kwargs = pytree.tree_unflatten(args, args_spec)
-                return fn(*args, **kwargs)
-
-            cached_fn = create_compiled_function(
+                tree_out = fn(*args, **kwargs)
+                flat_out = pytree.tree_flatten(tree_out)
+                out_spec.set(flat_out[1])
+                return flat_out[0]
+            compiled_fn = create_compiled_function(
                 flat_fn, fw_compiler, bw_compiler, partition_fn, decompose
             ).apply
-
+            cached_res = (compiled_fn, out_spec)
             # Save the compiled_fn in the cache
             compile_cache.insert(
-                fn_id, num_args, hasher_type, cached_fn, *flattened_args
+                fn_id, num_args, hasher_type, cached_res, *flattened_args
            )

-        return cached_fn(*flattened_args)
+        cached_fn, out_spec = cached_res
+        out = cached_fn(*flattened_args)
+        return out_spec.unflatten(out)

     return returned_function
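The PytreeThunk above is essentially a cached TreeSpec: on the first call, flat_fn flattens whatever pytree the user function returns and records its spec, and unflatten later rebuilds that structure from the flat list of tensors the compiled function produces (with fast paths for plain tuples/lists and single-leaf outputs). A standalone sketch of that round trip, using torch.utils._pytree (assumed here to be the `pytree` module this file references):

```python
import torch
import torch.utils._pytree as pytree

tree_out = {'a': torch.ones(2), 'b': (torch.zeros(3), torch.ones(1))}

# What flat_fn does on the first call: flatten the pytree output and remember its spec.
flat_out, spec = pytree.tree_flatten(tree_out)   # leaf tensors + TreeSpec describing the nesting

# What PytreeThunk.unflatten does with the compiled function's flat outputs.
rebuilt = pytree.tree_unflatten(flat_out, spec)
assert isinstance(rebuilt, dict) and set(rebuilt) == {'a', 'b'}

# The "really simple" fast path: a lone tensor flattens to a single leaf with a LeafSpec,
# so unflatten can just return x[0] without calling tree_unflatten.
leaves, leaf_spec = pytree.tree_flatten(torch.randn(4))
assert isinstance(leaf_spec, pytree.LeafSpec) and len(leaves) == 1
```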

functorch/_src/fx_minifier.py
2 additions & 2 deletions

@@ -193,8 +193,8 @@ def delta_debugging(cur_graph: fx.Graph, cur_inps):
     print([i.shape for i in inps])
     return failing_fx, inps

-import subprocess
 def check_nvfuser_subprocess(f, inps):
+    import subprocess
     f.to_folder("temp")
     with open("_temp.py", 'w') as fil:
         fil.write(f'''

@@ -213,4 +213,4 @@ def check_nvfuser_subprocess(f, inps):
     except Exception as e:
         print(e)
         return True
-    return False
+    return False

test/test_pythonkey.py
47 additions & 20 deletions

@@ -19,6 +19,7 @@
     skipCUDAIfNoMagma, onlyCPU
 import types
 from functools import partial, wraps
+import copy

 import functorch
 from functorch import (

@@ -27,7 +28,7 @@
 )
 from functorch.compile import (
     nnc_jit, compiled_function, compiled_module,
-    partition_with_recompute_fwd_in_bwd, pythonkey_decompose, decomposition_table
+    partition_with_recompute_fwd_in_bwd, pythonkey_decompose, decomposition_table, aot_function, aot_module
 )

 from torch.testing._internal.common_device_type import ops, onlyCPU

@@ -257,42 +258,68 @@ def _nop_compile(x, _):

 def _outs_and_grads(fn, inps):
     outs = fn(*inps)
-    [out.sum().backward(retain_graph=True) for out in outs]
-    grads = [inp.grad for inp in inps]
-    for inp in inps:
+    [out.sum().backward(retain_graph=True) for out in pytree.tree_flatten(outs)[0]]
+    grads = [inp.grad for inp in pytree.tree_flatten(inps)[0]]
+    for inp in pytree.tree_flatten(inps)[0]:
         inp.grad = None
     return outs, grads

-class TestEagerFusion(TestCase):
-    def test_single_output(self):
-        def f(a, b):
-            return a + b
-        compiled_f = compiled_function(f, _nop_compile, _nop_compile)
-        inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
+
+
+class TestAOTAutograd(TestCase):
+    def verify_aot_autograd(self, f, inp):
+        if isinstance(f, nn.Module):
+            compiled_f = aot_module(f, _nop_compile, _nop_compile)
+        else:
+            compiled_f = aot_function(f, _nop_compile, _nop_compile)
         ref_out, ref_grad = _outs_and_grads(f, inp)
         test_out, test_grad = _outs_and_grads(compiled_f, inp)
         self.assertEqual(ref_out, test_out)
         self.assertEqual(ref_grad, test_grad)

+    def test_single_output(self):
+        def f(a, b):
+            return a + b
+        inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
+        self.verify_aot_autograd(f, inp)
+
     def test_multi_output(self):
         def f(a, b):
             return a + b, a - b
-        compiled_f = compiled_function(f, _nop_compile, _nop_compile)
         inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
-        ref_out, ref_grad = _outs_and_grads(f, inp)
-        test_out, test_grad = _outs_and_grads(compiled_f, inp)
-        self.assertEqual(ref_out, test_out)
-        self.assertEqual(ref_grad, test_grad)
+        self.verify_aot_autograd(f, inp)

     def test_multi_output_list(self):
         def f(a, b):
             return [a + b, a - b]
-        compiled_f = compiled_function(f, _nop_compile, _nop_compile)
         inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
-        ref_out, ref_grad = _outs_and_grads(f, inp)
-        test_out, test_grad = _outs_and_grads(compiled_f, inp)
-        self.assertEqual(ref_out, test_out)
-        self.assertEqual(ref_grad, test_grad)
+        self.verify_aot_autograd(f, inp)
+
+    def test_multi_output_list(self):
+        def f(a, b):
+            return [a + b, a - b]
+        inp = [torch.randn(3, 3, requires_grad=True), torch.randn(3, 3)]
+        self.verify_aot_autograd(f, inp)
+
+    def test_output_dict(self):
+        def f(x):
+            return {'a': x, 'b': x}
+        inp = [torch.randn(3, 3, requires_grad=True)]
+        self.verify_aot_autograd(f, inp)
+
+        def f(x, y):
+            return {'a': x, 'b': y + x}
+        inp = [torch.randn(3, requires_grad=True), torch.randn(3)]
+        self.verify_aot_autograd(f, inp)
+
+        def f(x):
+            new_d = {}
+            for k in x:
+                new_d[k] = x[k] * 2
+            return new_d
+        inp = [{'a': torch.randn(3, requires_grad=True), 'b': torch.randn(3, requires_grad=True)}]
+        self.verify_aot_autograd(f, inp)
+

     def test_module(self):
         mod = nn.Sequential(nn.Linear(32, 32), nn.ReLU())
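Unrolled outside the test harness, verify_aot_autograd amounts to roughly the following check for the dict-returning case. This is a sketch under the same assumptions as above: `nop_compile` is a hypothetical stand-in for `_nop_compile`, and `aot_function` is the entry point the updated tests import from `functorch.compile`.

```python
import torch
import torch.utils._pytree as pytree
from functorch.compile import aot_function

def nop_compile(fx_module, example_inputs):
    # Hypothetical stand-in for _nop_compile: hand back the traced module unchanged.
    return fx_module

def f(x):
    return {k: v * 2 for k, v in x.items()}

inp = {'a': torch.randn(3, requires_grad=True), 'b': torch.randn(3, requires_grad=True)}

def outs_and_grads(fn):
    # Mirrors the updated _outs_and_grads: flatten pytree outputs/inputs before touching .grad.
    outs = fn(inp)
    for out in pytree.tree_flatten(outs)[0]:
        out.sum().backward(retain_graph=True)
    grads = [t.grad for t in pytree.tree_flatten(inp)[0]]
    for t in pytree.tree_flatten(inp)[0]:
        t.grad = None
    return outs, grads

ref_out, ref_grad = outs_and_grads(f)
test_out, test_grad = outs_and_grads(aot_function(f, nop_compile, nop_compile))

flat = lambda t: pytree.tree_flatten(t)[0]
assert all(torch.allclose(r, t) for r, t in zip(flat(ref_out), flat(test_out)))
assert all(torch.allclose(r, t) for r, t in zip(ref_grad, test_grad))
```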
