
Commit ab4a32f

[Partitioning] Recompute forward in the backward pass (#213)
Summary: Recomputing the fwd in the bwd pass can improve the performance of pointwise operators, where it helps us reduce memory bandwidth pressure at the expense of more computation. This PR adds a new partitioning function to enable this type of recomputation.
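For reference, a minimal usage sketch of the new partitioner, mirroring the test added in this commit (the identity "compilers" below simply return the traced fx module unchanged; a real backend would compile it):

import torch
from functorch import compiled_function, partition_with_recompute_fwd_in_bwd

def fn(a, b):
    return torch.sin(torch.sin(a)) + b

# Identity compiler: hand the traced fx.GraphModule back unchanged.
noop_compiler = lambda fx_module, _example_inputs: fx_module

recompute_fn = compiled_function(
    fn, noop_compiler, noop_compiler, partition_with_recompute_fwd_in_bwd
)

a = torch.rand(10, 10, requires_grad=True)
b = torch.rand(10, 10, requires_grad=True)
recompute_fn(a, b).sum().backward()
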
1 parent 157688a commit ab4a32f

File tree

functorch/__init__.py
functorch/_src/eager_compilation.py
test/test_pythonkey.py

3 files changed: +100 -2 lines changed

functorch/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 from ._src.make_functional import functional_init, functional_init_with_buffers
 from ._src.python_key import wrap_key, PythonTensor, pythonkey_trace, make_fx, nnc_jit, make_nnc
 from ._src.nnc_compile import nnc_compile, get_ops
-from ._src.eager_compilation import compiled_function, compiled_module, tvm_compile, draw_joint_graph, default_partition
+from ._src.eager_compilation import compiled_function, compiled_module, tvm_compile, draw_joint_graph, default_partition, partition_with_recompute_fwd_in_bwd
 from ._src.operator_authoring import pointwise_operator

functorch/_src/eager_compilation.py

Lines changed: 75 additions & 0 deletions
@@ -82,6 +82,81 @@ def add_saved(a):
     bw_module.graph.lint()
     return fw_module, bw_module
 
+def partition_with_recompute_fwd_in_bwd(joint_module: fx.GraphModule, _joint_inputs):
+    """
+    Partitions the joint graph such that the backward recomputes the forward.
+    Recomputing helps in trading off memory bandwidth with computation.
+
+    To create the fwd and bwd graphs, we copy the joint graph, manually set the
+    outputs to just the original forward or backward outputs, and then run the
+    resulting graphs through dead code elimination.
+    """
+
+    def _extract_graph_with_given_outputs(joint_graph, outputs, is_fwd=False):
+        """
+        Returns a copy of joint_graph with the given outputs.
+
+        If it is the forward graph, we need extra bookkeeping:
+        1) Remove the tangent nodes from the inputs.
+        2) Pass the inputs directly to the output. These will be saved in the
+           backward ctx.
+        """
+        # Set up val_map to be used later for copying the graph
+        val_map = {}
+        saved_nodes = []
+        if is_fwd:
+            # Remove the tangent placeholder nodes from the graph by pre-mapping
+            # them, so graph_copy skips them
+            def _tangent_finder(node):
+                return node.op == "placeholder" and "tangents" in node.target
+            tangent_nodes = filter(_tangent_finder, joint_graph.nodes)
+            for tangent_node in tangent_nodes:
+                val_map[tangent_node] = 1
+
+            # Find the saved tensor nodes that will be used by ctx later
+            def _placeholder_finder(node):
+                return node.op == "placeholder" and "tangents" not in node.target
+            saved_nodes = list(filter(_placeholder_finder, joint_graph.nodes))
+
+        # Make a copy of the joint graph
+        graph = fx.Graph()
+        graph.graph_copy(joint_graph, val_map)
+
+        # Set the outputs
+        outputs = outputs + saved_nodes
+        if len(outputs) == 1:
+            graph.output(val_map[outputs[0]])
+        else:
+            graph.output([val_map[out] for out in outputs])
+
+        # Run dead code elimination to remove unnecessary nodes
+        graph.eliminate_dead_code()
+        graph.lint()
+        return graph
+
+    # Find the output node
+    output_node = None
+    for n in reversed(joint_module.graph.nodes):
+        if n.op == "output":
+            output_node = n
+            break
+
+    # Get the forward and backward output nodes
+    num_fwd_outputs = joint_module._out_spec.children_specs[0].num_leaves
+    fwd_outputs = output_node.args[0][0:num_fwd_outputs]
+    bwd_outputs = output_node.args[0][num_fwd_outputs:]
+
+    # Construct the forward module
+    fwd_graph = _extract_graph_with_given_outputs(
+        joint_module.graph, fwd_outputs, is_fwd=True
+    )
+    fwd_module = fx.GraphModule(joint_module, fwd_graph)
+
+    # Construct the backward module
+    bwd_graph = _extract_graph_with_given_outputs(joint_module.graph, bwd_outputs)
+    bwd_module = fx.GraphModule(joint_module, bwd_graph)
+
+    return fwd_module, bwd_module
+
 def create_joint_forward_backward(fn):
     def joint_forward_backward(primals, tangents):
         out = fn(*primals)
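
To make concrete what this partitioner aims to produce, here is a hand-written, illustrative equivalent of the forward/backward split for a simple pointwise chain fn(a, b) = sin(sin(a)) + b (a sketch of the intended behavior, not the actual traced fx graphs): the forward returns its output plus only the primal inputs as saved tensors, and the backward recomputes the sin intermediates instead of loading them from memory.

import torch

def fwd(a, b):
    # Forward output, plus the tensors saved for backward: just the primal inputs.
    out = torch.sin(torch.sin(a)) + b
    return out, (a, b)

def bwd(saved, grad_out):
    a, b = saved
    # Recompute the forward intermediate instead of having saved it:
    # d/da sin(sin(a)) = cos(sin(a)) * cos(a)
    sin_a = torch.sin(a)
    grad_a = grad_out * torch.cos(sin_a) * torch.cos(a)
    grad_b = grad_out  # d/db (sin(sin(a)) + b) = 1
    return grad_a, grad_b

A default partition would typically save the sin intermediates for the backward; recomputing them instead trades extra FLOPs for less memory traffic, which is the win described in the summary for bandwidth-bound pointwise operators.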

test/test_pythonkey.py

Lines changed: 24 additions & 1 deletion
@@ -23,7 +23,8 @@
 import functorch
 from functorch import (
     grad, vjp, vmap, jacrev, grad_and_value,
-    make_functional_deprecated_v1, make_functional_with_buffers_deprecated_v1, make_fx, nnc_jit, compiled_function, compiled_module
+    make_functional_deprecated_v1, make_functional_with_buffers_deprecated_v1, make_fx, nnc_jit, compiled_function, compiled_module,
+    partition_with_recompute_fwd_in_bwd
 )
 
 from torch.testing._internal.common_device_type import ops, onlyCPU
@@ -365,6 +366,28 @@ def create_new_arg(x):
         self.assertEqual(orig_grad, compiled_grad)
 
 
+class TestPartitioning(TestCase):
+    def test_recompute_partitioning(self):
+        def fn(a, b):
+            return torch.sin(torch.sin(a)) + b
+
+        # Reference calculation
+        ref_a = torch.rand(10, 10, requires_grad=True)
+        ref_b = torch.rand(10, 10, requires_grad=True)
+        ref = fn(ref_a, ref_b)
+        ref.sum().backward()
+
+        # Compiled function calculation
+        res_a = ref_a.clone().detach().requires_grad_(True)
+        res_b = ref_b.clone().detach().requires_grad_(True)
+        compile_fn = lambda x, _: x
+        compiled_fn = compiled_function(fn, compile_fn, compile_fn, partition_with_recompute_fwd_in_bwd)
+        res = compiled_fn(res_a, res_b)
+        res.sum().backward()
+        assert torch.allclose(ref, res, atol=1e-3, rtol=1e-3)
+        assert torch.allclose(ref_a.grad, res_a.grad, atol=1e-3, rtol=1e-3)
+        assert torch.allclose(ref_b.grad, res_b.grad, atol=1e-3, rtol=1e-3)
+
 
 only_for = ("cpu")
 instantiate_device_type_tests(
