[fx2trt] support for new_ones, new_empty, as_strided, einsum (#80)

Wei Wei · Wei Wei · commit 4086fdc2ad44 · 2022-06-03T17:54:13.000-07:00
Summary: Pull Request resolved: https://github.com/pytorch/fx2trt/pull/80 1. Add support for new_ones and new_empty. In PyTorch, new_empty returns uninitialized data, so there is no test for it. 2. Add support for as_strided and einsum. 3. Add some print information for interpreter.run and inference time. 4. Add support for einsum. 5. Add some print information in testing to show compile time and run time. 6. Fix a bug in where. 7. acc_tracer needs to recompile after DCE. Reviewed By: yinghai Differential Revision: D36460857 fbshipit-source-id: f74a7df8d5b11c1a9478cb9840a7c47577c3bdc0
diff --git a/fx/converters/acc_ops_converters.py b/fx/converters/acc_ops_converters.py
@@ -2483,7 +2483,10 @@ def acc_ops_where(
 
     if type(x_t) != TRTTensor:
         if x_shape != output_shape:
-            x_t.expand(output_shape)
+            # special case: x_t is a 0-dim tensor, so add a dimension before expanding
+            if len(x_t.shape) == 0:
+                x_t = x_t.unsqueeze(0)
+            x_t = x_t.expand(output_shape)
         x_val = get_trt_tensor(network, x_t, f"{name}_x")
     else:
         x_val = x_t
@@ -2498,7 +2501,10 @@ def acc_ops_where(
 
     if type(y_t) != TRTTensor:
         if y_shape != output_shape:
-            y_t.expand(output_shape)
+            # special case: y_t is a 0-dim tensor, so add a dimension before expanding
+            if len(y_t.shape) == 0:
+                y_t = y_t.unsqueeze(0)
+            y_t = y_t.expand(output_shape)
         y_val = get_trt_tensor(network, y_t, f"{name}_y")
     else:
         y_val = y_t
@@ -2912,16 +2918,20 @@ def acc_ops_cat(
     name: str,
 ) -> Union[TRTTensor, Sequence[TRTTensor]]:
     tensors = kwargs["tensors"]
+    dim = kwargs["dim"]
 
     if any(not isinstance(t, TRTTensor) for t in tensors):  # type: ignore[union-attr]
         raise RuntimeError(
             f"cat received inputs {tensors} that is not part " "of the TensorRT region!"
         )
-
     layer = network.add_concatenation(inputs=tensors)
-    layer.axis = cast(int, kwargs["dim"]) - (
-        1 if network.has_implicit_batch_dimension else 0
-    )
+    if dim < 0:
+        if network.has_implicit_batch_dimension:
+            dim = len(tensors[0].shape) + 1 + dim
+        else:
+            dim = len(tensors[0].shape) + dim
+
+    layer.axis = dim - (1 if network.has_implicit_batch_dimension else 0)
     set_layer_name(layer, target, name)
     return layer.get_output(0)
 
@@ -3477,3 +3487,129 @@ def acc_ops_interpolate(
 
     set_layer_name(layer, target, name)
     return layer.get_output(0)
+
+
+@tensorrt_converter(acc_ops.new_ones)
+def acc_ops_new_ones(
+    network: TRTNetwork,
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    input_val = kwargs["input"]
+    size_val = kwargs["size"]
+    dtype_val = kwargs.get("dtype")
+    if dtype_val is None:
+        dtype_val = input_val.dtype
+        dtype_val = torch_dtype_from_trt(dtype_val)
+
+    device_val = kwargs.get("device")
+    assert (
+        device_val == "cuda" or device_val == None
+    ), f"device is not `cuda` but {device_val}"
+
+    weight = torch.ones(size_val, dtype=dtype_val)
+    return get_trt_tensor(network, weight, f"{name}_weight")
+
+
+@tensorrt_converter(acc_ops.new_empty)
+def acc_ops_new_empty(
+    network: TRTNetwork,
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    input_val = kwargs["input"]
+    size_val = kwargs["size"]
+    dtype_val = kwargs.get("dtype")
+    if dtype_val is None:
+        dtype_val = input_val.dtype
+        dtype_val = torch_dtype_from_trt(dtype_val)
+
+    device_val = kwargs.get("device")
+    assert (
+        device_val == "cuda" or device_val == None
+    ), f"device is not `cuda` but {device_val}"
+
+    weight = torch.zeros(size_val, dtype=dtype_val)
+    return get_trt_tensor(network, weight, f"{name}_weight")
+
+
+@tensorrt_converter(acc_ops.einsum)
+def acc_ops_einsum(
+    network: TRTNetwork,
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    input_val = list(kwargs["operands"])
+    equation = kwargs["equation"]
+    assert type(equation) is str, "equation type is not str"
+    const_flag = False
+    for i, input_source in enumerate(input_val):
+        if type(input_source) == torch.Tensor:
+            # A constant converted to a TRT tensor always reports dtype FLOAT, even when the stored data is of another type,
+            # so we cast it to float first. The other inputs then need to be the same float type.
+            input_source = input_source.to(torch.float)
+            const_flag = True
+        input_val[i] = get_trt_tensor(network, input_source, name + f"_input_source{i}")
+
+    if const_flag:
+        for i, input_source in enumerate(input_val):
+            if input_source.dtype != trt.float32:
+                input_val[i] = type_cast(
+                    network, target, f"{name}_input_cast{i}", input_source, trt.float32
+                )
+    einsum_layer = network.add_einsum(inputs=input_val, equation=equation)
+    return einsum_layer.get_output(0)
+
+
+@tensorrt_converter(acc_ops.as_strided)
+def acc_ops_as_strided(
+    network: TRTNetwork,
+    target: Target,
+    args: Tuple[Argument, ...],
+    kwargs: Dict[str, Argument],
+    name: str,
+) -> Union[TRTTensor, Sequence[TRTTensor]]:
+    input_val = kwargs["input"]
+    size = kwargs["size"]
+    stride = kwargs["stride"]
+    offset = kwargs.get("storage_offset")
+    if offset == None:
+        offset = 0
+
+    # convert to 1d vector
+    new_kwargs = {}
+    new_kwargs["input"] = kwargs["input"]
+    new_kwargs["start_dim"] = 0
+    new_kwargs["end_dim"] = -1
+    flatten_output = acc_ops_flatten(network, target, [], new_kwargs, name + "_flatten")
+    # use gather to collect output from 1d flatten_output
+    rank = len(size)
+    assert len(size) == len(stride), "size and stride shapes are not the same"
+
+    def nested(rank, size, stride, current, dim, indices):
+        if dim == rank:
+            indices.append(current)
+            return
+        for i in range(size[dim]):
+            current = current + stride[dim] * i
+            nested(rank, size, stride, current, dim + 1, indices)
+            current = current - stride[dim] * i
+
+    indices = []
+    nested(rank, size, stride, 0, 0, indices)
+    indices = torch.tensor(indices, dtype=torch.int)
+    indices = indices + offset
+    indices_tensor = get_trt_tensor(network, indices, name + "_indices_tensor")
+    gather_layer = network.add_gather(flatten_output, indices_tensor, axis=0)
+    # resize the output to match size
+    shuffle_layer = network.add_shuffle(gather_layer.get_output(0))
+    set_layer_name(shuffle_layer, target, name + "_shuffle")
+    shuffle_layer.reshape_dims = tuple(size)
+
+    return shuffle_layer.get_output(0)
diff --git a/fx/passes/lower_basic_pass.py b/fx/passes/lower_basic_pass.py
@@ -356,7 +356,7 @@ def transform_setitem(gm: torch.fx.GraphModule, input: Input):
             inp = node.args[2]
 
             inp_flag = False
-            if inp.target == operator.getitem:
+            if type(inp) == torch.fx.node.Node and inp.target == operator.getitem:
                 new_args = list(copy.deepcopy(inp.args[1]))
                 for ind, val in enumerate(new_args):
                     if type(val) == int:
diff --git a/fx/tools/timing_cache_utils.py b/fx/tools/timing_cache_utils.py
@@ -12,17 +12,20 @@ def __init__(self, timing_cache_prefix: str = "", save_timing_cache=False):
         if not timing_cache_prefix and tc:
             timing_cache_prefix_name = tc
 
-        self.timing_cache_prefix_name = timing_cache_prefix
+        self.timing_cache_prefix_name = timing_cache_prefix_name
         self.save_timing_cache = save_timing_cache
 
     def get_file_full_name(self, name: str):
         return f"{self.timing_cache_prefix_name}_{name}.npy"
 
     def get_timing_cache_trt(self, timing_cache_file: str) -> bytearray:
         timing_cache_file = self.get_file_full_name(timing_cache_file)
-        with open(timing_cache_file, "rb") as raw_cache:
-            cache_data = raw_cache.read()
-        return bytearray(cache_data)
+        try:
+            with open(timing_cache_file, "rb") as raw_cache:
+                cache_data = raw_cache.read()
+            return bytearray(cache_data)
+        except Exception:
+            return None
 
     def update_timing_cache(
         self, timing_cache_file: str, serilized_cache: bytearray
diff --git a/test/converters/acc_op/test_as_strided.py b/test/converters/acc_op/test_as_strided.py
@@ -0,0 +1,35 @@
+import fx2trt_oss.tracer.acc_tracer.acc_ops as acc_ops
+import torch
+import torch.nn as nn
+from parameterized import parameterized
+from torch.testing._internal.common_fx2trt import AccTestCase
+from torch.testing._internal.common_utils import run_tests
+
+
+class TestConverter(AccTestCase):
+    @parameterized.expand(
+        [
+            ("2d_dim_v1", (5, 5), (2, 3), (1, 2), 0),
+            ("2d_dim_v2", (5, 5), (2, 3), (2, 2), 1),
+            ("3d_dim_v1", (20, 20), (2, 3, 2), (2, 2, 2), 0),
+            # These cases take a long time on large dimensions; we do not have a better implementation yet.
+            # ("4d_dim_v1", (200, 200, 200, 200), (9, 9, 3, 2), (2, 2, 2, 3), 0),
+            # ("4d_dim_v2", (200, 200, 200, 200), (1, 15, 512, 1), (4096, 256, 1, 1), 0),
+        ]
+    )
+    def test_as_strided(self, _, x_size, size, stride, offset):
+        class Stride(nn.Module):
+            def forward(self, x):
+                return torch.as_strided(x, size, stride, offset)
+
+        inputs = [torch.randn(*x_size)]
+        self.run_test(
+            Stride(),
+            inputs,
+            expected_ops={acc_ops.as_strided},
+            test_implicit_batch_dim=False,
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/converters/acc_op/test_cat.py b/test/converters/acc_op/test_cat.py
@@ -14,6 +14,14 @@ def forward(self, x, y, z):
         inputs = [torch.randn(1, 2, 3), torch.randn(1, 1, 3), torch.randn(1, 3, 3)]
         self.run_test(Cat(), inputs, expected_ops={acc_ops.cat})
 
+    def test_cat_neg(self):
+        class Cat(nn.Module):
+            def forward(self, x, y, z):
+                return torch.cat((x, y, z), -1)
+
+        inputs = [torch.randn(1, 2, 3), torch.randn(1, 2, 3), torch.randn(1, 2, 2)]
+        self.run_test(Cat(), inputs, expected_ops={acc_ops.cat})
+
     def test_cat_with_dynamic_shape(self):
         class Cat(nn.Module):
             def forward(self, x, y):
diff --git a/test/converters/acc_op/test_einsum.py b/test/converters/acc_op/test_einsum.py
@@ -0,0 +1,35 @@
+import fx2trt_oss.tracer.acc_tracer.acc_ops as acc_ops
+import torch
+import torch.nn as nn
+from parameterized import parameterized
+from torch.testing._internal.common_fx2trt import AccTestCase
+from torch.testing._internal.common_utils import run_tests
+
+
+class TestConverter(AccTestCase):
+    @parameterized.expand(
+        [
+            ("2d_dim", "ij,jk->ik", (2, 3), (3, 4)),
+            ("2d_dim_ext", "ij,kj->ik", (2, 3), (4, 3)),
+            ("3d_dim", "cxd,cyd->cxy", (3, 4, 5), (3, 6, 5)),
+            ("4d_dim", "bcwd,bcdh->bcwh", (2, 3, 4, 5), (2, 3, 5, 6)),
+            ("4d_dim_ext", "bcxd,bcyd->bcxy", (2, 3, 4, 5), (2, 3, 6, 5)),
+            # TRT does not support ellipsis or diagonal operations
+        ]
+    )
+    def test_einsum(self, _, equation, x_size, y_size):
+        class Einsum(nn.Module):
+            def forward(self, x, y):
+                return torch.einsum(equation, x, y)
+
+        inputs = [torch.randn(*x_size), torch.randn(*y_size)]
+        self.run_test(
+            Einsum(),
+            inputs,
+            expected_ops={acc_ops.einsum},
+            test_implicit_batch_dim=False,
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/converters/acc_op/test_new_ones.py b/test/converters/acc_op/test_new_ones.py
@@ -0,0 +1,50 @@
+import fx2trt_oss.tracer.acc_tracer.acc_ops as acc_ops
+import torch
+import torch.nn as nn
+from torch.testing._internal.common_fx2trt import AccTestCase, InputTensorSpec
+from torch.testing._internal.common_utils import run_tests
+
+
+class TestNewOnesConverter(AccTestCase):
+    def test_newone(self):
+        class TestModule(nn.Module):
+            def forward(self, x):
+                return x.new_ones((3, 5), dtype=torch.float16)
+
+        inputs = [torch.randn(1, 10)]
+        self.run_test(
+            TestModule(),
+            inputs,
+            expected_ops={acc_ops.new_ones},
+            test_implicit_batch_dim=False,
+        )
+
+    def test_newone_no_dtype(self):
+        class TestModule(nn.Module):
+            def forward(self, x):
+                return x.new_ones((3, 5))
+
+        inputs = [torch.randn(1, 10)]
+        self.run_test(
+            TestModule(),
+            inputs,
+            expected_ops={acc_ops.new_ones},
+            test_implicit_batch_dim=False,
+        )
+
+    def test_newone_device(self):
+        class TestModule(nn.Module):
+            def forward(self, x):
+                return x.new_ones((3, 5), device="cuda")
+
+        inputs = [torch.randn(1, 10)]
+        self.run_test(
+            TestModule(),
+            inputs,
+            expected_ops={acc_ops.new_ones},
+            test_implicit_batch_dim=False,
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/tracer/test_acc_tracer.py b/test/tracer/test_acc_tracer.py
@@ -2500,5 +2500,9 @@ def test_all_acc_ops_registered(self):
                 acc_ops.isinf,
                 acc_ops.any,
                 acc_ops.tensor_split,
+                acc_ops.new_empty,
+                acc_ops.new_ones,
+                acc_ops.einsum,
+                acc_ops.as_strided,
             },
         )
diff --git a/tracer/acc_tracer/acc_ops.py b/tracer/acc_tracer/acc_ops.py
diff --git a/tracer/acc_tracer/acc_tracer.py b/tracer/acc_tracer/acc_tracer.py