[acc_shape_prop] Introduce AccShapeProp and use it in acc_tracer to support fp16 sample inputs (#73)

jfix71 · Wei Wei · commit b503d47d6a87 · 2022-06-03T17:54:13.000-07:00
Summary: Pull Request resolved: pytorch/fx2trt#73 Tries to support shape prop for fp16 ops that don’t have pytorch CPU support. Does so by first attempting to use standard shape_prop, and if it fails then upconverts fp16 inputs to fp32 to re-run. This should make things much cleaner for acc_tracer, as the user can provide fp16 sample inputs directly instead of fp32 and then hacking things after the fact. Reviewed By: alexbeloi Differential Revision: D36305442 fbshipit-source-id: 2ecdc88a072d914cb26785d29fd7e409c51954fb
diff --git a/test/tracer/test_acc_shape_prop.py b/test/tracer/test_acc_shape_prop.py
@@ -0,0 +1,97 @@
+# Owner(s): ["oncall: fx"]
+
+import operator
+import unittest
+
+import fx2trt_oss.tracer.acc_tracer.acc_shape_prop as acc_shape_prop
+import fx2trt_oss.tracer.acc_tracer.acc_tracer as acc_tracer
+import torch
+from parameterized import parameterized, param
+
+torch.manual_seed(0)
+
+
+class AccShapePropTest(unittest.TestCase):
+    """
+    Checks that AccShapeProp records the expected dtype in node.meta["tensor_meta"]
+    for graphs produced by acc_tracer.rewriter_base_trace, with fp32 and fp16 inputs.
+    """
+
+    @parameterized.expand(
+        [
+            param("fp32", dtype=torch.float32),
+            param("fp16", dtype=torch.float16),
+        ]
+    )
+    def test_basic(self, _, dtype):
+        """Every node of a single-dtype module reports that dtype."""
+
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.attr = torch.nn.Parameter(torch.randn(3, 4))
+                self.submod = torch.nn.Linear(4, 4)
+
+            def forward(self, x):
+                return torch.neg(self.submod(x.relu() + self.attr))
+
+        m = TestModule()
+        if dtype == torch.float16:
+            # Convert parameters too, so the whole graph runs in fp16.
+            m.half()
+        gm = acc_tracer.rewriter_base_trace(m, None, None)
+        inp = torch.rand(3, 4, dtype=dtype)
+        acc_shape_prop.AccShapeProp(gm).propagate(inp)
+
+        for node in gm.graph.nodes:
+            self.assertEqual(node.meta["tensor_meta"].dtype, dtype)
+
+    def test_multi_dtype(self):
+        """An fp32 branch and an fp16 branch in one graph keep their own dtypes."""
+
+        class TestModule(torch.nn.Module):
+            def forward(self, x, y):
+                return torch.relu(x * 2), torch.sigmoid(y + y)
+
+        m = TestModule()
+        gm = acc_tracer.rewriter_base_trace(m, None, None)
+        # Note: One input is fp32, the other fp16.
+        x, y = torch.rand(3, 4), torch.rand(3, 4, dtype=torch.float16)
+        acc_shape_prop.AccShapeProp(gm).propagate(x, y)
+
+        for node in gm.graph.nodes:
+            # The x placeholder and the ops applied to it form the fp32 branch.
+            if (node.op == "placeholder" and node.target == "x") or (
+                node.op == "call_function" and node.target in {operator.mul, torch.relu}
+            ):
+                self.assertEqual(node.meta["tensor_meta"].dtype, torch.float32)
+            elif node.op != "output":
+                self.assertEqual(node.meta["tensor_meta"].dtype, torch.float16)
+            else:
+                # The output node carries one metadata entry per returned tensor.
+                self.assertEqual(node.meta["tensor_meta"][0].dtype, torch.float32)
+                self.assertEqual(node.meta["tensor_meta"][1].dtype, torch.float16)
+
+    def test_to_dtype(self):
+        """Explicit .to(dtype=...) conversions are reflected node by node."""
+
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                return x.to(dtype=torch.float32).to(dtype=torch.float16)
+
+        m = TestModule()
+        gm = acc_tracer.rewriter_base_trace(m, None, None)
+        x = torch.rand(3, 4, dtype=torch.float16)
+        acc_shape_prop.AccShapeProp(gm).propagate(x)
+        ph = None
+        for node in gm.graph.nodes:
+            if node.op == "placeholder":
+                ph = node
+                self.assertEqual(node.meta["tensor_meta"].dtype, torch.float16)
+            elif node.all_input_nodes == [ph]:
+                # The node fed directly by the placeholder is the fp32 conversion.
+                self.assertEqual(node.meta["tensor_meta"].dtype, torch.float32)
+            else:
+                self.assertEqual(node.meta["tensor_meta"].dtype, torch.float16)
+
+    def test_split(self):
+        """Tuple-valued nodes (tensor_split, output) carry per-element fp16 metadata."""
+
+        class TestModule(torch.nn.Module):
+            def forward(self, x):
+                s = torch.tensor_split(x, 2)
+                return s[0].relu(), s[1].sigmoid()
+
+        m = TestModule()
+        gm = acc_tracer.rewriter_base_trace(m, None, None)
+        x = torch.rand(2, 4, dtype=torch.float16)
+        acc_shape_prop.AccShapeProp(gm).propagate(x)
+        for node in gm.graph.nodes:
+            if node.target == torch.tensor_split or node.op == "output":
+                self.assertEqual(node.meta["tensor_meta"][0].dtype, torch.float16)
+                self.assertEqual(node.meta["tensor_meta"][1].dtype, torch.float16)
+            else:
+                self.assertEqual(node.meta["tensor_meta"].dtype, torch.float16)
diff --git a/test/tracer/test_acc_tracer.py b/test/tracer/test_acc_tracer.py
@@ -1947,15 +1947,20 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
 
         torch.testing.assert_allclose(m(input), traced(input))
 
-    def test_addmm(self):
+    @parameterized.expand([(torch.float,), (torch.float16,)])
+    def test_addmm(self, dtype):
         class TestModule(torch.nn.Module):
             def forward(
                 self, input: torch.Tensor, a: torch.Tensor, b: torch.Tensor
             ) -> torch.Tensor:
                 return torch.addmm(input, a, b)
 
         m = TestModule()
-        input, a, b = torch.randn(2, 2), torch.randn(2, 2), torch.randn(2, 2)
+        input, a, b = (
+            torch.randn(2, 2, dtype=dtype),
+            torch.randn(2, 2, dtype=dtype),
+            torch.randn(2, 2, dtype=dtype),
+        )
         traced = acc_tracer.trace(m, [input, a, b])
 
         ph_in = ph_a = ph_b = mm = add = None
@@ -1983,7 +1988,11 @@ def forward(
             else:
                 self.fail(f"Unexpected node: {node.format_node()}")
 
-        self.assertTrue(torch.equal(m(input, a, b), traced(input, a, b)))
+        for node in [ph_in, ph_a, ph_b, mm, add]:
+            self.assertEqual(acc_utils.get_tensor_meta(node).dtype, dtype)
+
+        if dtype == torch.float:
+            self.assertTrue(torch.equal(m(input, a, b), traced(input, a, b)))
 
     def test_gelu(self):
         return self._make_acc_op_function_test(acc_ops.gelu, torch.nn.functional.gelu)
diff --git a/tracer/acc_tracer/acc_shape_prop.py b/tracer/acc_tracer/acc_shape_prop.py
@@ -0,0 +1,63 @@
+import os
+import sys
+from typing import Any
+
+import torch.fx
+from torch.fx.passes import shape_prop
+
+
+class SuppressStderrPrints:
+    """
+    Context manager that silences writes to ``sys.stderr`` inside its body.
+
+    On entry ``sys.stderr`` is rebound to a writable handle on ``os.devnull``. On
+    exit the stream that was installed at entry is put back and the devnull handle
+    is closed. Only the Python-level ``sys.stderr`` object is swapped. Exceptions
+    raised in the body are not suppressed.
+    """
+
+    def __enter__(self):
+        self._original_stderr = sys.stderr
+        # Keep our own reference to the devnull handle so __exit__ closes exactly
+        # this file, even if the body rebinds sys.stderr to some other stream.
+        self._devnull = open(os.devnull, "w")
+        sys.stderr = self._devnull
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Restore first so stderr is usable again even if close() were to raise.
+        sys.stderr = self._original_stderr
+        self._devnull.close()
+
+
+class AccShapeProp(shape_prop.ShapeProp):
+    """
+    Similar to standard shape prop, but if any node that is run with standard shape prop
+    fails then it tries to upconvert any fp16 inputs to fp32, rerun shape prop, and then
+    downconvert fp32 results back to fp16.
+
+    Note that we currently mostly only look for/support up/down conversion for nodes
+    with tensor outputs, but this is likely fine for most cases. Additionally the base
+    shape_prop works for many ops with fp16, such as tensor.cat, tensor slice, tensor.to
+    dtype conversion, etc.
+
+    """
+
+    def run_node(self, n: torch.fx.Node) -> Any:
+        """
+        Run ``n`` through the base shape prop, retrying with fp32 inputs on failure.
+
+        Args:
+            n: The node to execute and record tensor metadata for.
+
+        Returns:
+            The value produced for ``n``. When the fp32 retry was needed and produced
+            an fp32 Tensor, that Tensor is converted to fp16 before being returned.
+
+        Raises:
+            Whatever the base ``run_node`` raises if the retry fails as well.
+        """
+        # First try running shape_prop with the original inputs.
+        with SuppressStderrPrints():
+            try:
+                return super().run_node(n)
+            except Exception:
+                # Deliberately swallowed: any failure falls through to the fp32
+                # retry below, where a second failure propagates to the caller.
+                pass
+
+        # Base shape_prop failed, so temporarily upconvert the node's fp16 inputs in env
+        # and retry. Only inputs whose env value is itself a Tensor are upconverted.
+        orig_dtype_env = []
+        for in_node in n.all_input_nodes:
+            in_ten = self.env[in_node]
+            if isinstance(in_ten, torch.Tensor) and in_ten.dtype == torch.float16:
+                orig_dtype_env.append((in_node, in_ten))
+                # A dtype-changing .to() already returns a new tensor, so the
+                # original fp16 tensor is left untouched without an extra clone.
+                self.env[in_node] = in_ten.to(dtype=torch.float)
+
+        # Now try running again with upconverted fp32 input tensor in env.
+        try:
+            result = super().run_node(n)
+        finally:
+            # Restore the original env back to fp16 for any upconverted tensors,
+            # whether or not the retry succeeded.
+            for in_node, in_ten in orig_dtype_env:
+                self.env[in_node] = in_ten
+
+        # Now that we succeeded, assume it's thanks to upconverting. Therefore we
+        # downconvert fp32 tensor results to fp16. Skip this when nothing was
+        # upconverted, since then the fp32 result cannot be due to our conversion.
+        if (
+            orig_dtype_env
+            and isinstance(result, torch.Tensor)
+            and result.dtype == torch.float
+        ):
+            result = result.to(dtype=torch.float16)
+            self.env[n] = result
+            n.meta["tensor_meta"] = n.meta["tensor_meta"]._replace(dtype=torch.float16)
+
+        return result
diff --git a/tracer/acc_tracer/acc_tracer.py b/tracer/acc_tracer/acc_tracer.py
@@ -10,6 +10,7 @@
 
 import fx2trt_oss.tracer.acc_tracer.acc_normalizer as acc_normalizer
 import fx2trt_oss.tracer.acc_tracer.acc_ops  # noqa: F401
+import fx2trt_oss.tracer.acc_tracer.acc_shape_prop as acc_shape_prop
 import fx2trt_oss.tracer.acc_tracer.acc_utils as acc_utils
 import torch
 import torch.jit as jit
@@ -384,6 +385,19 @@ def _replace_tensor_meta_with_rank(gm: torch.fx.GraphModule):
             del node.meta["tensor_meta"]
 
 
+def rewriter_base_trace(mod, ast_rewriter_allow_list, leaf_module_list):
+    """
+    Trace ``mod`` with AccRewritingTracer and wrap the result in a GraphModule.
+
+    ``ast_rewriter_allow_list`` and ``leaf_module_list`` are forwarded unchanged as
+    keyword arguments to ``AccRewritingTracer().trace``.
+    """
+    tracer = AccRewritingTracer()
+    graph, root = tracer.trace(
+        mod,
+        ast_rewriter_allow_list=ast_rewriter_allow_list,
+        leaf_module_list=leaf_module_list,
+    )
+    assert isinstance(root, nn.Module)
+
+    # The rewritten module, not the caller's ``mod``, must be the GraphModule root:
+    # RewrittenModule includes a new module for the ConditionalExceptionWrapper.
+    return torch.fx.GraphModule(root, graph)
+
+
 def trace(
     mod: nn.Module,
     sample_inputs: Sequence[Any],
@@ -443,18 +457,10 @@ def trace(
         )
         mod.eval()
 
-    # Rewrite the module to make it symbolic traceable, and then trace it.
-    rewritten_graph, rewritten_mod = AccRewritingTracer().trace(
-        mod,
-        ast_rewriter_allow_list=ast_rewriter_allow_list,
-        leaf_module_list=leaf_module_list,
-    )
-
-    assert isinstance(rewritten_mod, nn.Module)
     assert isinstance(sample_inputs, (list, tuple))
-    # Note: use the rewritten_mod here as the root. This is necessary because
-    # RewrittenModule includes a new module for the ConditionalExceptionWrapper.
-    traced = torch.fx.GraphModule(rewritten_mod, rewritten_graph)
+
+    # Rewrite the module to make it symbolic traceable, and then trace it.
+    traced = rewriter_base_trace(mod, ast_rewriter_allow_list, leaf_module_list)
 
     # Now remove all assertions and exceptions if requested.
     if remove_assertions:
@@ -467,7 +473,7 @@ def trace(
     traced.graph.eliminate_dead_code()
 
     # Run shape prop to add node.meta["type"] to nodes, needed for NormalizeArgs.
-    shape_prop.ShapeProp(traced).propagate(*sample_inputs)
+    acc_shape_prop.AccShapeProp(traced).propagate(*sample_inputs)
     # Swap out tensor_meta for tensor_rank, because we don't actually want to rely on
     # tensor_meta yet for normalization/lowering, though rank shouldn't change.
     _replace_tensor_meta_with_rank(traced)
@@ -483,6 +489,6 @@ def trace(
     traced.recompile()
 
     # Run shape prop to again to populate tensor_meta after normalize.
-    shape_prop.ShapeProp(traced).propagate(*sample_inputs)
+    acc_shape_prop.AccShapeProp(traced).propagate(*sample_inputs)
 
     return traced
diff --git a/tracer/acc_tracer/acc_utils.py b/tracer/acc_tracer/acc_utils.py
@@ -189,3 +189,14 @@ def map_tensor_metadata(a: Any, fn: Callable):
         a, list
     ), f"Only supporting tuple/list/TensorMetadata, but found {type(a)}"
     return immutable_list(map_tensor_metadata(elem, fn) for elem in a)
+
+
+def get_tensor_meta(node: torch.fx.Node) -> TensorMetadata:
+    """
+    Return the value stored under ``node.meta["tensor_meta"]``.
+
+    Raises:
+        RuntimeError: If the entry is missing or falsy; the message includes
+            ``node.format_node()``.
+    """
+    found = node.meta.get("tensor_meta")
+    if found:
+        return found
+
+    raise RuntimeError(
+        f"Node has no tensor metadata associated with it! "
+        f"Check that shape propagation has run. {node.format_node()}"
+    )
+    return tensor_meta