
Commit ed6787a

add option for meta tensor tracing (#349)

* add option for meta tensor tracing
* Added meta tensor flag

1 parent 3fa6ea9 commit ed6787a
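
The change adds a pythonkey_meta() context manager (exported from functorch.compile below) that makes Python-key tracing hold its wrapped tensors on the 'meta' device, so graphs can be captured without running real kernels. A minimal usage sketch, assuming compiled_function(fn, fw_compiler, bw_compiler) traces on first call; the toy function and input shape are illustrative, not from the commit:

import torch
from functorch.compile import compiled_function, nop, pythonkey_meta

def f(x):
    return torch.sin(x).cos()

# While the flag is set, each PythonTensor stores its backing .elem on the
# 'meta' device, so tracing records ops without touching real data.
with pythonkey_meta():
    g = compiled_function(f, nop, nop)
    out = g(torch.randn(4, requires_grad=True))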

File tree

4 files changed, +153 −62 lines changed:

functorch/_src/aot_autograd.py
functorch/_src/compilers.py
functorch/_src/python_key.py
functorch/compile/__init__.py


functorch/_src/aot_autograd.py

Lines changed: 0 additions & 48 deletions

@@ -385,54 +385,6 @@ def clear_compile_cache():
     compile_cache.clear()
     compile_cache = None
 
-def tvm_compile(fx_module, example_inputs, name=None):
-    import tvm
-    from tvm import relay, auto_scheduler
-    from tvm.contrib import graph_executor
-    import os
-
-    jit_mod = torch.jit.script(fx_module)
-    # jit_mod = torch.jit.trace(fx_module, example_inputs)
-
-    shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
-    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)
-    target = tvm.target.Target("llvm -mcpu=core-avx2")
-    tasks, task_weights = auto_scheduler.extract_tasks(mod['main'], params, target)
-    for task in tasks:
-        print(task.compute_dag)
-    if name is None:
-        log_file = f'{time.time()}.json'
-    else:
-        log_file = f'{name}.json'
-    if len(tasks) != 0:
-        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
-        if not os.path.exists(log_file):
-            tune_option = auto_scheduler.TuningOptions(
-                num_measure_trials=10000,  # change this to 20000 to achieve the best performance
-                measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
-                # early_stopping=1000,
-                # verbose=2,
-            )
-            tuner.tune(tune_option)
-
-    dev = tvm.cpu(0)
-    with auto_scheduler.ApplyHistoryBest(log_file):
-        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
-            lib = relay.build(mod, target=target, params=params)
-    dtype = "float32"
-    m = graph_executor.GraphModule(lib["default"](dev))
-    def exec_tvm(*args):
-        for idx, arg in enumerate(args, 0):
-            if arg.dim() != 0:
-
-                m.set_input(f"inp_{idx}", tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(arg)))
-        m.run()
-        outs = [torch.utils.dlpack.from_dlpack(m.get_output(i).to_dlpack()) for i in range(m.get_num_outputs())]
-        return outs
-    return exec_tvm
-
-def tvm_function(fn, name):
-    return compiled_function(fn, partial(tvm_compile, name=f'fw_{name}'), partial(tvm_compile, name=f'bw_{name}'))
 
 def compiled_module(mod, *args, **kwargs):
     func_mod, params, buffers = make_functional_with_buffers(mod)

functorch/_src/compilers.py

Lines changed: 105 additions & 0 deletions

@@ -0,0 +1,105 @@
+import torch
+from functools import partial
+from .aot_autograd import draw_graph
+import time
+
+def ts_compile(fx_g, _):
+    for node in fx_g.graph.nodes:
+        if node.target == torch.ops.aten.new_zeros:
+            if node.args[1] == []:
+                args = list(node.args)
+                args[1] = [1]
+                node.args = tuple(args)
+    fx_g.graph.lint()
+    # Works around this NVFuser issue: https://github.com/csarofeen/pytorch/issues/1311
+    for i in range(1000):
+        attr = f'_tensor_constant{i}'
+        if hasattr(fx_g, attr):
+            setattr(fx_g, attr, getattr(fx_g, attr).cuda())
+        else:
+            break
+
+    fx_g.recompile()
+    f = torch.jit.script(fx_g)
+
+    # Works around alias analysis issues in TS
+    graph = f.graph
+    outputs = list(graph.outputs())
+    output = outputs[0]
+    graph.eraseOutput(0)
+    outputs = list(output.node().inputs())
+    for inp in output.node().inputs():
+        graph.registerOutput(inp)
+    output.node().destroy()
+    torch._C._jit_pass_remove_mutation(graph)
+    for i in range(len(list(graph.outputs()))):
+        graph.eraseOutput(0)
+    node = graph.create("prim::ListConstruct", outputs)
+    graph.appendNode(node)
+    node.output().setType(torch._C.ListType.ofTensors())
+    graph.registerOutput(node.output())
+    torch._C._jit_pass_remove_mutation(f.graph)
+
+    f = torch.jit.freeze(f.eval())
+    f = torch.jit.optimize_for_inference(f)
+    return f
+
+def _draw_graph_compile(fx_g, _, name):
+    draw_graph(fx_g, name)
+    return fx_g
+
+def draw_graph_compile(name):
+    return partial(_draw_graph_compile, name=name)
+
+def _tvm_compile(fx_module, example_inputs, name=None):
+    import tvm
+    from tvm import relay, auto_scheduler
+    from tvm.contrib import graph_executor
+    import os
+
+    jit_mod = torch.jit.script(fx_module)
+    # jit_mod = torch.jit.trace(fx_module, example_inputs)
+
+    shape_list = [(f"inp_{idx}", i.shape) for idx, i in enumerate(example_inputs)]
+    mod, params = relay.frontend.from_pytorch(jit_mod, shape_list)
+    target = tvm.target.Target("llvm -mcpu=core-avx2")
+    tasks, task_weights = auto_scheduler.extract_tasks(mod['main'], params, target)
+    for task in tasks:
+        print(task.compute_dag)
+    if name is None:
+        log_file = f'{time.time()}.json'
+    else:
+        log_file = f'{name}.json'
+    if len(tasks) != 0:
+        tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
+        if not os.path.exists(log_file):
+            tune_option = auto_scheduler.TuningOptions(
+                num_measure_trials=10000,  # change this to 20000 to achieve the best performance
+                measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
+                # early_stopping=1000,
+                # verbose=2,
+            )
+            tuner.tune(tune_option)
+
+    dev = tvm.cpu(0)
+    with auto_scheduler.ApplyHistoryBest(log_file):
+        with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
+            lib = relay.build(mod, target=target, params=params)
+    dtype = "float32"
+    m = graph_executor.GraphModule(lib["default"](dev))
+    def exec_tvm(*args):
+        for idx, arg in enumerate(args, 0):
+            if arg.dim() != 0:
+
+                m.set_input(f"inp_{idx}", tvm.nd.from_dlpack(torch.utils.dlpack.to_dlpack(arg)))
+        m.run()
+        outs = [torch.utils.dlpack.from_dlpack(m.get_output(i).to_dlpack()) for i in range(m.get_num_outputs())]
+        return outs
+    return exec_tvm
+
+def tvm_compile(name):
+    return partial(_tvm_compile, name=name)
+
+def nop(f, _):
+    print(f.code)
+    return f

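The new module collects reusable backends for the AOT pipeline: ts_compile (TorchScript with NVFuser workarounds), draw_graph_compile (dump the FX graph), _tvm_compile/tvm_compile (moved here from aot_autograd.py), and nop. A sketch of how they plug into compiled_function, whose (fn, fw_compiler, bw_compiler) signature is taken from the removed tvm_function helper; note that ts_compile moves tensor constants to .cuda(), so it assumes a CUDA build:

import torch
from functorch.compile import compiled_function, ts_compile, draw_graph_compile

def f(x):
    return (x * x).sum()

# TorchScript-based backend for both the forward and backward graphs.
g = compiled_function(f, ts_compile, ts_compile)
g(torch.randn(8, device='cuda', requires_grad=True)).backward()

# Or dump the captured forward/backward graphs under the given names
# instead of compiling them.
h = compiled_function(f, draw_graph_compile('fw_f'), draw_graph_compile('bw_f'))
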
functorch/_src/python_key.py

Lines changed: 46 additions & 12 deletions

@@ -22,6 +22,7 @@
 
 
 USE_DECOMPOSE = False
+USE_META = False
 
 @contextmanager
 def pythonkey_decompose():
@@ -32,22 +33,38 @@ def pythonkey_decompose():
     finally:
         USE_DECOMPOSE = False
 
+
+@contextmanager
+def pythonkey_meta():
+    global USE_META
+    USE_META = True
+    try:
+        yield USE_META
+    finally:
+        USE_META = False
+
 class PythonTensor(torch.Tensor):
     elem: torch.Tensor
 
     __slots__ = ['elem', 'proxy']
 
     @staticmethod
-    def __new__(cls, elem, proxy):
+    def __new__(cls, elem, proxy, device=None):
         # The wrapping tensor (PythonTensor) is just a meta tensor, so it
         # doesn't hold any memory (meta tensor is generally the preferred type
         # of tensor you want to make a subclass from)...
-        meta = elem.new_empty((0,))
-        meta.set_(meta.storage(), 0, elem.size(), elem.stride())
-        r = torch.Tensor._make_subclass(cls, meta, elem.requires_grad)
+
+        r = torch.Tensor._make_wrapper_subclass(
+            cls, elem.size(),
+            strides=elem.stride(), storage_offset=elem.storage_offset(),
+            dtype=elem.dtype, layout=elem.layout, requires_grad=elem.requires_grad,
+            device=(elem.device if device is None else device),
+        )
 
         # ...the real tensor is held as an element on the tensor.
         r.elem = elem
+        if USE_META:
+            r.elem = r.elem.to('meta')
         r.proxy = proxy
         return r
 
@@ -59,28 +76,45 @@ def __repr__(self):
     def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
         if func in decomposition_table and USE_DECOMPOSE:
             return decomposition_table[func](*args, **kwargs)
+
         def unwrap_proxy(e):
             return e.proxy if isinstance(e, PythonTensor) else e
 
         def unwrap_tensor(e):
             return e.elem if isinstance(e, PythonTensor) else e
+
+        # Used to infer the output device
+        input_devices = list(set([i.device for i in pytree.tree_flatten(args)[0] + pytree.tree_flatten(kwargs)[0] if isinstance(i, PythonTensor)]))
+        assert len(input_devices) == 1
+        output_device = input_devices[0]
         proxy_args = pytree.tree_map(unwrap_proxy, args)
         proxy_kwargs = pytree.tree_map(unwrap_proxy, kwargs)
         proxy_out = func(*proxy_args, **proxy_kwargs)
-        real_out = func(*pytree.tree_map(unwrap_tensor, args), **pytree.tree_map(unwrap_tensor, kwargs))
-
-        def wrap_with_proxy(e, idx):
+        args = pytree.tree_map(unwrap_tensor, args)
+        kwargs = pytree.tree_map(unwrap_tensor, kwargs)
+        try:
+            real_out = func(*args, **kwargs)
+        except NotImplementedError as e:
+            args = pytree.tree_map(lambda x: torch.ones_like(x, device=output_device) if isinstance(x, torch.Tensor) else x, args)
+            kwargs = pytree.tree_map(lambda x: torch.ones_like(x, device=output_device) if isinstance(x, torch.Tensor) else x, kwargs)
+            real_out = func(*args, **kwargs)
+
+        def wrap_with_proxy(e, proxy):
             # Some ops (like native_batch_norm_backward) return undefined tensors that get converted into None in python.
             # As the function signature expects tensors, if we directly return these None tensors back to C++, we'll error.
             if e is None:
-                return PythonTensor(torch.empty(()), proxy_out[idx])
-            return PythonTensor(e, proxy_out[idx]) if type(e) == torch.Tensor else e
+                e = torch.empty(())
+            # Currently assuming that all inputs to an op are the same device - not totally sure that's true
+            if type(e) == torch.Tensor:
+                return PythonTensor(e, proxy, output_device)
+            else:
+                return e
         if isinstance(real_out, tuple):
-            return tuple([wrap_with_proxy(e, idx) for idx, e in enumerate(real_out)])
+            return tuple([wrap_with_proxy(e, proxy_out[idx]) for idx, e in enumerate(real_out)])
         elif isinstance(real_out, list):
-            return list([wrap_with_proxy(e, idx) for idx, e in enumerate(real_out)])
+            return list([wrap_with_proxy(e, proxy_out[idx]) for idx, e in enumerate(real_out)])
         elif isinstance(real_out, torch.Tensor):
-            return PythonTensor(real_out, proxy_out)
+            return PythonTensor(real_out, proxy_out, output_device)
         else:
             return real_out
 
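The device bookkeeping above is needed because, under pythonkey_meta(), the "real" computation runs on meta tensors, which carry shape, dtype, and stride metadata but no storage; when an op has no meta implementation, the NotImplementedError fallback re-runs it on torch.ones_like dummies placed on the inferred output device. A standalone illustration of the meta-device behavior (plain PyTorch, not commit code):

import torch

# Ops on meta tensors only propagate metadata; no arithmetic happens and
# no memory is allocated for the data.
a = torch.ones(4, 8, device='meta')
b = (a + a).reshape(8, 4)
print(b.shape, b.device)  # torch.Size([8, 4]) meta
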
functorch/compile/__init__.py

Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 from .._src.operator_authoring import pointwise_operator
 from .._src.memory_efficient_op_authoring import memory_efficient_pointwise_fusion, torchscript_nvfuser_compile
-from .._src.python_key import nnc_jit, make_nnc, pythonkey_decompose
+from .._src.python_key import nnc_jit, make_nnc, pythonkey_decompose, pythonkey_meta
 from .._src.decompositions import register_decomposition, decomposition_table
 from .._src.nnc_compile import nnc_compile, get_ops
 from .._src.fx_minifier import minimizer
@@ -9,11 +9,11 @@
     aot_module,
     compiled_function,
     compiled_module,
-    tvm_compile,
     draw_joint_graph,
     default_partition,
     partition_with_recompute_fwd_in_bwd,
     num_of_recompilations,
     clear_compile_cache,
     draw_graph,
 )
+from .._src.compilers import ts_compile, tvm_compile, draw_graph_compile, nop
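
With these exports, tvm_compile is now a factory that binds the tuning-log name rather than a compiler itself. A hedged sketch of the updated call pattern; running it requires TVM to be installed:

import torch
from functorch.compile import compiled_function, tvm_compile

def f(x):
    return x.tanh()

# tvm_compile(name) returns a partial over _tvm_compile with the log-file
# name bound, so it is called with the name rather than passed bare.
g = compiled_function(f, tvm_compile('fw_f'), tvm_compile('bw_f'))
out = g(torch.randn(4))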
