[functorch] Implement a Common Subexpression Elimination (CSE) pass in AOTAutograd (pytorch/functorch#852)

yushangdi · zou3519 · commit b705f178e6de · 2022-07-20T15:45:05.000-07:00
Implement a Common Subexpression Elimination (CSE) pass in AOTAutograd

The test is in test/test_memory_efficient_fusion.py.
diff --git a/functorch/benchmarks/cse.py b/functorch/benchmarks/cse.py
@@ -0,0 +1,103 @@
+import torch
+import torch.fx as fx
+from functorch import make_fx
+from torch.profiler import profile, ProfilerActivity
+
+from functorch._src.cse import fx_graph_cse
+
+def profile_it(f, inp):
+    """Return the CUDA time per call of ``f(inp)``, averaged over 5 profiled calls."""
+    # untimed warm-up calls before the profiler starts
+    for _ in range(5):
+        f(inp)
+
+    num_runs = 5
+    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
+        for _ in range(num_runs):
+            f(inp)
+
+    # add up cuda_time_total over every aggregated profiler event
+    total = sum(event.cuda_time_total for event in prof.key_averages())
+    return total / num_runs
+
+def profile_function(name, f, inp):
+    """Trace ``f`` with make_fx, apply fx_graph_cse, and print one comma-separated result row.
+
+    Printed fields: name, average CUDA time of the traced module, average CUDA time of
+    the CSE'd module, number of nodes removed by CSE, node count of the traced graph.
+    """
+    fx_g = make_fx(f)(inp)
+    cse_graph = fx_graph_cse(fx_g.graph)
+    new_g = fx.GraphModule(fx_g, cse_graph)
+
+    # do not benchmark against the scripted version because script already does some CSE
+    avg_cuda_time_f, avg_cuda_time_g = profile_it(fx_g, inp), profile_it(new_g, inp)
+    num_node_decrease = len(fx_g.graph.nodes) - len(new_g.graph.nodes)
+
+    print(f"{name}, {avg_cuda_time_f}, {avg_cuda_time_g}, {num_node_decrease}, {len(fx_g.graph.nodes)}")
+
+g_gpu = torch.Generator(device='cuda')  # dedicated CUDA generator for the benchmark input
+g_gpu.manual_seed(2147483647)  # fixed seed
+inp = torch.randn(2**20, device='cuda', generator=g_gpu)  # input shared by every case below: 2**20 values on the GPU
+
+def f1(x):  # two chained cos calls with different inputs: no repeated subexpression
+    return x.cos().cos()
+
+profile_function("f1", f1, inp)
+
+def fsum(x):  # four identical x.sum() calls, added together
+    a = x.sum()
+    b = x.sum()
+    c = x.sum()
+    d = x.sum()
+    return a + b + c + d
+
+profile_function("fsum", fsum, inp)
+
+def fconcat(x):  # the same torch.cat((x, x)) computed twice; its tensor args sit inside a tuple
+    a = torch.cat((x, x))
+    b = torch.cat((x, x))
+    return a + b
+profile_function("fconcat", fconcat, inp)
+
+def fsum2(x):  # 31 identical x.sum() calls (1 + 30 loop iterations) accumulated by addition
+    a = x.sum()
+    for _ in range(30):
+        a = a + x.sum()
+    return a
+
+profile_function("fsum2", fsum2, inp)
+
+def fsummulti(x):  # 3 iterations, each adding and then multiplying by x.sum(): 6 identical x.sum() calls
+    a = 0
+    for _ in range(3):
+        a = a + x.sum()
+        a = a * x.sum()
+    return a
+
+profile_function("fsummulti", fsummulti, inp)
+
+def fsummulti2(x):  # 30 iterations, each adding and then multiplying by x.sum(): 60 identical x.sum() calls
+    a = 0
+    for _ in range(30):
+        a = a + x.sum()
+        a = a * x.sum()
+    return a
+
+profile_function("fsummulti2", fsummulti2, inp)
+
+def fcos(x):  # x.cos() computed 3 times and accumulated by addition
+    a = 0
+    for _ in range(3):
+        a = a + x.cos()
+    return a
+
+profile_function("fcos", fcos, inp)
+
+def fcos2(x):  # x.cos() computed 30 times and accumulated by addition
+    a = 0
+    for _ in range(30):
+        a = a + x.cos()
+    return a
+
+profile_function("fcos2", fcos2, inp)
diff --git a/functorch/functorch/_src/cse.py b/functorch/functorch/_src/cse.py
@@ -0,0 +1,58 @@
+import torch
+import torch.fx as fx
+from torch.utils._pytree import tree_flatten
+
+aten = torch.ops.aten  # shorthand for the aten operator namespace
+rand_ops = [aten.dropout, aten._fused_dropout, aten._standard_gamma,  # random ops: fx_graph_cse never merges these
+            aten.bernoulli, aten.multinomial, aten.native_dropout,
+            aten.normal, aten.poisson, aten.binomial, aten.rrelu,
+            aten.rand_like, aten.rand, aten.randint, aten.randn, aten.randperm]
+
+
+def fx_graph_cse(fx_g: torch.fx.graph.Graph):
+    """Return a new torch.fx.graph.Graph with CSE applied to ``fx_g``; ``fx_g`` is not modified.
+
+    Call nodes with the same target and the same (already substituted) args/kwargs are
+    merged. placeholder, output, get_attr and random-op nodes are always copied as-is.
+    """
+    new_graph = fx.Graph()
+    env = {}  # map from node in the old graph to node in the new graph
+    hash_env = {}  # map from hash to a node in the new graph
+    token_map = {}  # map from hash to token
+
+    # map Node leaves through env; the spec records the nesting removed by tree_flatten
+    def substitute(arg_list):
+        leaves, spec = tree_flatten(arg_list)
+        return tuple(env.get(v, v) if isinstance(v, torch.fx.node.Node) else v for v in leaves), spec
+
+    for n in fx_g.nodes:
+        # a traced target may be an OpOverload (e.g. aten.rand_like.default), which does not
+        # compare equal to the packets listed in rand_ops, so test its overload packet instead
+        op_packet = getattr(n.target, "overloadpacket", n.target)
+        # placeholder, output, get_attr and random ops are copied to the new graph without change
+        if n.op in ('placeholder', 'output', 'get_attr') or op_packet in rand_ops:
+            env[n] = new_graph.node_copy(n, lambda x: env[x])
+            continue
+
+        # n.op == 'call_function', should never see n.op == 'call_module' or 'call_method'
+        args, args_spec = substitute(n.args)
+        kwargs, kwargs_spec = substitute(n.kwargs)
+
+        # each token corresponds to a unique node; nodes with the same token can be substituted
+        token = {"target": n.target, "args": args, "args_spec": args_spec,
+                 "kwargs": kwargs, "kwargs_spec": kwargs_spec}
+        # hash substituted args to a number, do not hash specs because specs are not hashable
+        hash_val = (n.target, hash((args, kwargs)))
+
+        # the hash only selects a candidate; the full token comparison guards against collisions
+        hash_val_in_hash_env = hash_val in hash_env
+        if hash_val_in_hash_env and token_map[hash_val] == token:
+            env[n] = hash_env[hash_val]
+            continue
+
+        new_node = env[n] = new_graph.node_copy(n, lambda x: env[x])
+        if not hash_val_in_hash_env:
+            hash_env[hash_val] = new_node
+            token_map[hash_val] = token
+
+    return new_graph
diff --git a/functorch/functorch/_src/partitioners.py b/functorch/functorch/_src/partitioners.py
@@ -7,6 +7,7 @@
 import os
 from torch.fx.passes import graph_drawer
 from typing import Tuple
+from .cse import fx_graph_cse
 
 
 class InvalidNodeBase(object):
@@ -226,6 +227,10 @@ def min_cut_rematerialization_partition(
     except ImportError:
         raise RuntimeError("Need networkx installed to perform smart recomputation heuristics")
 
+    #  add the CSE pass
+    fx_g = joint_module.graph
+    cse_graph = fx_graph_cse(fx_g)
+    joint_module.graph = cse_graph
     full_bw_graph = joint_module.graph
 
     name_to_node = {}
diff --git a/functorch/test/test_memory_efficient_fusion.py b/functorch/test/test_memory_efficient_fusion.py
@@ -1,9 +1,13 @@
 import torch
 import torch.nn as nn
+import torch.fx as fx
+from functorch import make_fx
 from torch.nn import functional as F
 from functorch.compile import memory_efficient_fusion
+from functorch._src.cse import fx_graph_cse
 from torch.testing._internal.common_utils import TestCase, run_tests
 import inspect
+import random
 from typing import Callable
 import unittest
 
@@ -179,5 +183,189 @@ def forward(self, hidden_states):
     #     run_and_compare_activation(hard_mish, 1024)
 
 
+def check(f, t, delta, check_val=True, graph_input=False):
+    """Apply fx_graph_cse to ``f`` and assert that it removes the expected number of nodes.
+
+    ``f`` is traced with make_fx on ``t``, or used directly as a GraphModule when
+    ``graph_input`` is True. ``delta`` is an integer >= -1: the exact number of nodes CSE
+    must remove, or -1 to only require that the node count does not increase. A second
+    CSE pass must not remove anything more. With ``check_val`` the original and the
+    CSE'd modules must also return equal results on ``t``.
+    """
+    fx_g = f if graph_input else make_fx(f)(t)
+    new_graph = fx_graph_cse(fx_g.graph)
+    cse_module = fx.GraphModule(fx_g, new_graph)
+
+    # the number of nodes decreases or stays the same
+    old_num_nodes, new_num_nodes = len(fx_g.graph.nodes), len(new_graph.nodes)
+    if delta != -1:
+        assert old_num_nodes == new_num_nodes + delta, (
+            f"number of nodes not the same {old_num_nodes - delta}, {new_num_nodes}\n {fx_g.graph} \n {new_graph}")
+    else:
+        assert old_num_nodes >= new_num_nodes, (
+            f"number of nodes increased {old_num_nodes}, {new_num_nodes}")
+
+    # a second pass should not reduce more nodes
+    pass_2_graph = fx_graph_cse(new_graph)
+    pass_2_num_nodes = len(pass_2_graph.nodes)
+    assert pass_2_num_nodes == new_num_nodes, (
+        f"second pass graph has less node {pass_2_num_nodes}, {new_num_nodes}\n {new_graph} \n {pass_2_graph}")
+
+    if not check_val:
+        return
+    # check correctness: both modules must agree on the input t
+    true_result, our_result = fx_g(t), cse_module(t)
+    if true_result is None:  # both return None
+        assert our_result is None, f"true result is None, CSE result is {our_result}"
+    else:  # results returned are the same
+        assert torch.all(true_result == our_result), f"results are different {true_result}, {our_result}"
+
+
+class NoChangeTestCase(TestCase):  # graphs from which fx_graph_cse must not remove any node (delta == 0)
+
+    def test_nochange(self):  # every add has a different pair of operands, so nothing can be shared
+        def f(x):
+            a = x + 1
+            b = x + a
+            a = x
+            d = x + a
+            return b + d
+        t = torch.randn(2, 2)
+        check(f, t, 0)
+
+    def test_empty(self):  # f performs no operation and returns None
+        def f(x):
+            pass
+        t = torch.randn(2, 2)
+        check(f, t, 0)
+
+    def test_rand_like(self):  # both rand_like calls must be kept
+        def f(x):
+            a = torch.rand_like(x)
+            b = torch.rand_like(x)
+            return a + b
+        t = torch.randn(2, 2)
+        check(f, t, 0, check_val=False)  # outputs are random, so only node counts are checked
+
+    def test_rand_n(self):  # two randn calls drawing from the same seeded generator must both be kept
+        def f(x):
+            g_cpu = torch.Generator()
+            g_cpu.manual_seed(2147483647)
+            a = torch.randn(4, generator=g_cpu)
+            b = torch.randn(4, generator=g_cpu)
+            return a + b
+        t = torch.randn(2, 2)
+        check(f, t, 0)
+
+
+class ReduceTestCase(TestCase):  # graphs with repeated work; check() is given the exact number of nodes to remove
+
+    def test_immutable_list_type(self):  # sum(dim=1) twice and sum() twice: one of each pair is removed
+        def f(x):
+            a = x.sum(dim=1)
+            b = x.sum(dim=1)
+            c = x.sum()
+            d = x.sum()
+            return a + b + c + d
+        t = torch.randn(2, 2)
+        check(f, t, 2)
+
+    def test_immutable_list_multiple_entries(self):  # same as above with a multi-entry dim list
+        def f(x):
+            a = x.sum(dim=[0, 1])
+            b = x.sum(dim=[0, 1])
+            c = x.sum(dim=1)
+            d = x.sum(dim=1)
+            return a + b + c + d
+        t = torch.randn(2, 2)
+        check(f, t, 2)
+
+    def test_simple(self):  # the second cos is removed, which makes b + b identical to a + a
+        def f(x):
+            a = x.cos()
+            b = x.cos()
+            c = a + a
+            d = b + b
+            return c + d
+        t = torch.randn(2, 2)
+        check(f, t, 2)
+
+    def test_simple_2(self):  # duplicated cos -> sin chain plus the duplicated add: 3 nodes removed
+        def f(x):
+            a = x.cos().sin()
+            b = x.cos().sin()
+            c = a + a
+            d = b + b
+            return c + d
+        t = torch.randn(1)
+        check(f, t, 3)
+
+    def test_two_args_default(self):  # expects 3 removals: explicit keepdim=False counts as the same call as omitting it
+        def f(x):
+            a = x.sum(dim=1)
+            b = x.sum(dim=1, keepdim=False)
+            c = x.sum(dim=1, keepdim=False)
+            d = x.sum(dim=1)
+            return a + b + c + d
+        t = torch.randn(2, 2)
+        check(f, t, 3)
+
+    def test_two_args(self):  # keepdim=True differs from the default: two distinct sums, each computed twice
+        def f(x):
+            a = x.sum(dim=1)
+            b = x.sum(dim=1, keepdim=True)
+            c = x.sum(dim=1, keepdim=True)
+            d = x.sum(dim=1)
+            return a + b + c + d
+        t = torch.randn(2, 2)
+        check(f, t, 2)
+
+    def test_simple_multiple_same_ops(self):  # four identical x.sum() calls: three are removed
+        def f(x):
+            a = x.sum()
+            b = x.sum()
+            c = x.sum()
+            d = x.sum()
+            return a + b + c + d
+        t = torch.randn(2, 2)
+        check(f, t, 3)
+
+    def test_nested_immutable_list_type(self):  # tensor arguments nested inside a tuple
+        def f(x):
+            a = torch.cat((x, x))
+            b = torch.cat((x, x))
+            return a + b
+        t = torch.randn(2, 2)
+        check(f, t, 1)
+
+    def test_kwarg(self):  # two identical ones_like(x) calls: one is removed
+        def f(x):
+            a = torch.ones_like(x)
+            b = torch.ones_like(x)
+            return a + b
+        t = torch.randn(2, 2)
+        check(f, t, 1)
+
+
+class RandomOpTestCase(TestCase):
+    def test_random(self):
+        """CSE must never add nodes or change results on randomly generated chains of ops."""
+        def f(x):
+            vals = [x]
+            ops = [torch.clone, torch.cos, torch.tanh, torch.nn.functional.gelu]
+            for _ in range(100):
+                vals.append(random.choice(ops)(random.choice(vals)))
+            return vals[-1]
+
+        t = torch.randn(2, 2)
+        for _ in range(30):
+            # random.choice only runs while tracing, so re-trace to get a different graph each round
+            fx_g = fx.symbolic_trace(f)
+            fx_g.graph.eliminate_dead_code()
+            fx_g.recompile()
+            check(fx_g, t, -1, graph_input=True)
+
+
+
 if __name__ == "__main__":
     run_tests()