
Commit 8a999e2

nvidia-modelopt 0.15.1 examples release
1 parent 2a3f7cf commit 8a999e2

File tree: 4 files changed, +135 -15 lines
Lines changed: 103 additions & 0 deletions

@@ -0,0 +1,103 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+import argparse
+import time
+from pathlib import Path
+
+import torch
+from cache_diffusion import cachify
+from cache_diffusion.utils import SDXL_DEFAULT_CONFIG
+from diffusers import DiffusionPipeline
+from pipeline.deploy import compile, teardown
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--batch-size", type=int, default=2)
+    parser.add_argument("--num-inference-steps", type=int, default=30)
+    parser.add_argument("--num-iter", type=int, default=8)
+    args = parser.parse_args()
+    for key, value in vars(args).items():
+        if value is not None:
+            print("Parsed args -- {}: {}".format(key, value))
+    return args
+
+
+def main(args):
+    pipe = DiffusionPipeline.from_pretrained(
+        "stabilityai/stable-diffusion-xl-base-1.0",
+        torch_dtype=torch.float16,
+        variant="fp16",
+        use_safetensors=True,
+    )
+    pipe = pipe.to("cuda")
+
+    prompt = "A random person with a head that is made of flowers, photo by James C. Leyendecker, \
+        Afrofuturism, studio portrait, dynamic pose, national geographic photo, retrofuturism, \
+        biomorphicy"
+
+    compile(
+        pipe.unet,
+        onnx_path=Path("./onnx"),
+        engine_path=Path("./engine"),
+        batch_size=args.batch_size,
+    )
+
+    cachify.prepare(pipe, args.num_inference_steps, SDXL_DEFAULT_CONFIG)
+
+    generator = torch.Generator(device="cuda").manual_seed(2946901)
+    total_time = 0
+    cachify.disable(pipe)
+    for _ in range(args.num_iter):
+        start_time = time.time()
+        _ = pipe(
+            prompt=[prompt] * args.batch_size,
+            num_inference_steps=args.num_inference_steps,
+            generator=generator,
+        )
+        end_time = time.time()
+        total_time += end_time - start_time
+    total_time = total_time / args.num_iter
+    latency = total_time / args.batch_size
+    print(f"TRT Disabled Cache: {latency}")
+
+    generator = torch.Generator(device="cuda").manual_seed(2946901)
+    total_time = 0
+    cachify.enable(pipe)
+    for _ in range(args.num_iter):
+        start_time = time.time()
+        _ = pipe(
+            prompt=[prompt] * args.batch_size,
+            num_inference_steps=args.num_inference_steps,
+            generator=generator,
+        )
+        end_time = time.time()
+        total_time += end_time - start_time
+    total_time = total_time / args.num_iter
+    latency = total_time / args.batch_size
+    print(f"TRT Enabled Cache: {latency}")
+    teardown(pipe.unet)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
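The new script benchmarks SDXL end to end under TensorRT, running the same seeded pipeline once with block caching disabled and once enabled, so both passes denoise identical latents and the printed per-image latencies are directly comparable. A minimal sketch of the timing pattern the script uses twice, factored into a helper (not part of the commit; it reuses the script's torch, time, pipe, prompt, and args, and the torch.cuda.synchronize() calls are an added precaution against measuring queued-but-unfinished GPU work):

def benchmark(pipe, prompt, args):
    # Re-seed so cached and uncached runs follow the same noise trajectory.
    generator = torch.Generator(device="cuda").manual_seed(2946901)
    total_time = 0.0
    for _ in range(args.num_iter):
        torch.cuda.synchronize()
        start_time = time.time()
        _ = pipe(
            prompt=[prompt] * args.batch_size,
            num_inference_steps=args.num_inference_steps,
            generator=generator,
        )
        torch.cuda.synchronize()
        total_time += time.time() - start_time
    # Mean wall-clock seconds per iteration, then per image in the batch.
    return total_time / args.num_iter / args.batch_size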

diffusers/cache_diffusion/cache_diffusion/cachify.py

Lines changed: 10 additions & 0 deletions

@@ -87,13 +87,23 @@ def cachify(model, num_inference_steps, config_list, modules):
 
 def disable(pipe):
     model = get_model(pipe)
+    if hasattr(model, "use_trt_infer") and model.use_trt_infer:
+        for _, module in model.engines.items():
+            if isinstance(module, CachedModule):
+                module.disable_cache()
+        return
     for _, module in model.named_modules():
         if isinstance(module, CachedModule):
             module.disable_cache()
 
 
 def enable(pipe):
     model = get_model(pipe)
+    if hasattr(model, "use_trt_infer") and model.use_trt_infer:
+        for _, module in model.engines.items():
+            if isinstance(module, CachedModule):
+                module.enable_cache()
+        return
     for _, module in model.named_modules():
         if isinstance(module, CachedModule):
             module.enable_cache()
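Both toggles gained the same early-exit branch: once compile() (in deploy.py below) has replaced the UNet blocks with TensorRT engines and set use_trt_infer, the cached blocks live in the model.engines dict rather than in the nn.Module graph, so named_modules() would silently miss them. A hypothetical refactor sketch that makes the shared pattern explicit (_iter_cached_modules is an invented name, not in the commit; it assumes cachify.py's existing get_model and CachedModule):

def _iter_cached_modules(model):
    # Yield candidate blocks whether the UNet runs through TensorRT
    # engines or as plain PyTorch modules.
    if getattr(model, "use_trt_infer", False):
        yield from model.engines.values()
    else:
        for _, module in model.named_modules():
            yield module


def disable(pipe):
    for module in _iter_cached_modules(get_model(pipe)):
        if isinstance(module, CachedModule):
            module.disable_cache()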

diffusers/cache_diffusion/pipeline/deploy.py

Lines changed: 20 additions & 14 deletions

@@ -54,14 +54,15 @@ def replace_new_forward(unet):
         upsample_block.forward = types.MethodType(cacheupblock2d_forward, upsample_block)
 
 
-def get_input_info(dummy_dict, info=None):
+def get_input_info(dummy_dict, info: str = None, batch_size: int = 1):
     return_val = [] if info == "profile_shapes" or info == "input_names" else {}
 
     def collect_leaf_keys(d):
         for key, value in d.items():
             if isinstance(value, dict):
                 collect_leaf_keys(value)
             else:
+                value = (value[0] * batch_size,) + value[1:]
                 if info == "profile_shapes":
                     return_val.append((key, value))  # type: ignore
                 elif info == "profile_shapes_dict":

@@ -75,7 +76,7 @@ def collect_leaf_keys(d):
     return return_val
 
 
-def complie2trt(onnx_path: Path, engine_path: Path):
+def complie2trt(onnx_path: Path, engine_path: Path, batch_size: int = 1):
     subdirs = [f for f in onnx_path.iterdir() if f.is_dir()]
     for subdir in subdirs:
         if subdir.name not in SDXL_ONNX_CONFIG.keys():

@@ -86,15 +87,17 @@ def complie2trt(onnx_path: Path, engine_path: Path):
             print(f"Building {str(model_path)}")
             build_profile = Profile()
             profile_shapes = get_input_info(
-                SDXL_ONNX_CONFIG[subdir.name]["dummy_input"], "profile_shapes"
+                SDXL_ONNX_CONFIG[subdir.name]["dummy_input"], "profile_shapes", batch_size
             )
             for input_name, input_shape in profile_shapes:
-                build_profile.add(input_name, input_shape, input_shape, input_shape)
+                min_input_shape = (2,) + input_shape[1:]
+                build_profile.add(input_name, min_input_shape, input_shape, input_shape)
             block_network = network_from_onnx_path(
-                str(model_path), flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]
+                str(model_path), flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM], strongly_typed=True
             )
             build_config = CreateConfig(
-                fp16=True,
+                builder_optimization_level=4,
+                tf32=True,
                 profiles=[build_profile],
             )
             engine = engine_from_network(

@@ -113,7 +116,7 @@ def get_total_device_memory(unet):
     return max_device_memory
 
 
-def load_engines(unet, engine_path: Path):
+def load_engines(unet, engine_path: Path, batch_size: int = 1):
    unet.engines = {}
    for f in engine_path.iterdir():
        if f.is_file():

@@ -127,9 +130,10 @@ def load_engines(unet, engine_path: Path):
     for block_name in unet.engines.keys():
         unet.engines[block_name].allocate_buffers(
             shape_dict=get_input_info(
-                SDXL_ONNX_CONFIG[block_name]["dummy_input"], "profile_shapes_dict"
+                SDXL_ONNX_CONFIG[block_name]["dummy_input"], "profile_shapes_dict", batch_size
             ),
             device=unet.device,
+            batch_size=batch_size,
         )
     # TODO: Free and clean up the origin pytorch cuda memory
 

@@ -216,10 +220,12 @@ def export_onnx(unet, onnx_path: Path):
             print(f"{str(_onnx_file)} alread exists!")
 
 
-def warm_up(unet):
+def warm_up(unet, batch_size: int = 1):
     print("Warming-up TensorRT engines...")
     for name, engine in unet.engines.items():
-        dummy_input = get_input_info(SDXL_ONNX_CONFIG[name]["dummy_input"], "dummy_input")
+        dummy_input = get_input_info(
+            SDXL_ONNX_CONFIG[name]["dummy_input"], "dummy_input", batch_size
+        )
         _ = engine(dummy_input, unet.cuda_stream)
 
 

@@ -231,13 +237,13 @@ def teardown(unet):
     del unet.cuda_stream
 
 
-def compile(unet, onnx_path: Path, engine_path: Path):
+def compile(unet, onnx_path: Path, engine_path: Path, batch_size: int = 1):
     onnx_path.mkdir(parents=True, exist_ok=True)
     engine_path.mkdir(parents=True, exist_ok=True)
 
     replace_new_forward(unet)
     export_onnx(unet, onnx_path)
-    complie2trt(onnx_path, engine_path)
-    load_engines(unet, engine_path)
-    warm_up(unet)
+    complie2trt(onnx_path, engine_path, batch_size)
+    load_engines(unet, engine_path, batch_size)
+    warm_up(unet, batch_size)
     unet.use_trt_infer = True
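The thread running through these changes is plumbing batch_size from compile() down to every place a tensor shape is computed. The dummy-input shapes in SDXL_ONNX_CONFIG evidently carry a leading dimension of 2 (one image plus its classifier-free-guidance copy, consistent with the batch_size * 2 factor in utils.py below), so get_input_info scales that dimension by batch_size for the opt/max profile shapes while complie2trt pins the profile minimum at 2; separately, the builder config moves from fp16=True to a strongly typed network built with tf32 and builder optimization level 4. A worked sketch of the resulting optimization profile, using an illustrative latent shape rather than a real SDXL_ONNX_CONFIG entry:

# Illustrative only: how the profile bounds come out for --batch-size 2.
batch_size = 2
dummy_shape = (2, 4, 128, 128)  # leading dim 2 = one image + its CFG copy

max_shape = (dummy_shape[0] * batch_size,) + dummy_shape[1:]  # (4, 4, 128, 128)
min_shape = (2,) + max_shape[1:]                              # (2, 4, 128, 128)

# complie2trt then registers min/opt/max as:
#   build_profile.add(name, min_shape, max_shape, max_shape)
# i.e. the engine accepts anything from a single CFG-doubled image up to
# the full requested batch, and is tuned for the full batch.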

diffusers/cache_diffusion/pipeline/utils.py

Lines changed: 2 additions & 1 deletion

@@ -71,13 +71,14 @@ def activate(self, reuse_device_memory=None):
         else:
             self.context = self.engine.create_execution_context()  # type: ignore
 
-    def allocate_buffers(self, shape_dict=None, device="cuda"):
+    def allocate_buffers(self, shape_dict=None, device="cuda", batch_size=1):
         for binding in range(self.engine.num_io_tensors):  # type: ignore
             name = self.engine.get_tensor_name(binding)  # type: ignore
             if shape_dict and name in shape_dict:
                 shape = shape_dict[name]
             else:
                 shape = self.engine.get_tensor_shape(name)  # type: ignore
+                shape = (batch_size * 2,) + shape[1:]
             dtype = trt.nptype(self.engine.get_tensor_dtype(name))  # type: ignore
             if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:  # type: ignore
                 self.context.set_input_shape(name, shape)  # type: ignore
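This fallback fires for tensors absent from shape_dict, where engine.get_tensor_shape() reports the engine's stored shape with -1 in any dynamic dimension; the new line pins the leading dimension to batch_size * 2 (again the CFG-doubled batch) so a concrete buffer can be sized. A hedged sketch of the effect (the shape and dtype are illustrative, not a real engine binding):

import torch

batch_size = 2
shape = (-1, 4, 128, 128)              # dynamic batch as reported by the engine
shape = (batch_size * 2,) + shape[1:]  # (4, 4, 128, 128): batch doubled for CFG
buffer = torch.empty(shape, dtype=torch.float16, device="cuda")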
