ready for review

cehongwang · cehongwang · commit 503f3208d8e9 · 2025-09-23T00:01:57.000Z
diff --git a/examples/apps/flux_demo.py b/examples/apps/flux_demo.py
@@ -62,10 +62,6 @@ def compile_model(
         torch_dtype=torch.float16,
     ).to(torch.float16)
 
-    # pipe.transformer = FluxTransformer2DModel(
-    #     num_layers=28, num_single_layers=12, guidance_embeds=True
-    # ).to(torch.float16)
-
     if args.low_vram_mode:
         pipe.enable_model_cpu_offload()
     else:
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -694,7 +694,7 @@ def compile(
     # Move the weights in the state_dict to CPU
     if offload_module_to_cpu:
         deallocate_module(gm, delete_module=False)
-        # deallocate_module(exported_program.module(), delete_module=False)
+        deallocate_module(exported_program.module(), delete_module=False)
         logger.info(
             "The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False"
         )
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -591,13 +591,11 @@ def _save_weight_mapping(self) -> None:
         torch.cuda.empty_cache()
 
     @needs_refit  # type: ignore[misc]
-    def _insert_engine_to_cache(self, hash_val: str, serialized_engine: bytes) -> None:
+    def _insert_engine_to_cache(self, hash_val: str, engine: bytes) -> None:
+        serialized_engine = engine.serialize()
         # TODO: @Evan is waiting for TRT's feature to cache the weight-stripped engine
         # if not self.compilation_settings.strip_engine_weights:
         #     # set EXCLUDE_WEIGHTS flag to strip weights
-        #     runtime = trt.Runtime(TRT_LOGGER)
-        #     engine = runtime.deserialize_cuda_engine(serialized_engine)
-
         #     serialization_config = engine.create_serialization_config()
         #     serialization_config.set_flag(trt.SerializationFlag.EXCLUDE_WEIGHTS)
         #     serialized_engine = engine.serialize_with_config(
@@ -731,10 +729,6 @@ def run(
                     if interpreter_result is not None:  # hit the cache
                         return interpreter_result  # type: ignore[no-any-return]
 
-        import psutil
-
-        print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
-        # breakpoint()
         self._construct_trt_network_def()
 
         if not self.compilation_settings.immutable_weights:
@@ -753,14 +747,11 @@ def run(
         self._create_timing_cache(
             builder_config, self.compilation_settings.timing_cache_path
         )
-        import psutil
-
-        print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
-        # breakpoint()
 
         cuda_engine = self.builder.build_engine_with_config(
             self.ctx.net, builder_config
         )
+        assert cuda_engine
 
         _LOGGER.info(
             f"Build TRT engine elapsed time: {datetime.now() - build_engine_start_time}"
@@ -772,17 +763,13 @@ def run(
         )
 
         # Engine caching only for refittable engines
-        # if (
-        #     not self.compilation_settings.immutable_weights
-        #     and self.compilation_settings.cache_built_engines
-        #     and self.engine_cache is not None
-        # ):
-        #     self._insert_engine_to_cache(hash_val, serialized_engine)
-
-        print("After build_engine_with_config")
-        print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
-        # breakpoint()
-        assert cuda_engine
+        if (
+            not self.compilation_settings.immutable_weights
+            and self.compilation_settings.cache_built_engines
+            and self.engine_cache is not None
+        ):
+            self._insert_engine_to_cache(hash_val, cuda_engine)
+
         if self.compilation_settings.use_python_runtime:
             return TRTInterpreterResult(
                 cuda_engine,
@@ -792,16 +779,13 @@ def run(
                 self.ctx.requires_output_allocator,
             )
         else:
-            print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
-            # breakpoint()
             serialized_engine = cuda_engine.serialize()
             _LOGGER.info(f"TRT Engine uses: {serialized_engine.nbytes} bytes of Memory")
 
             with io.BytesIO() as engine_bytes:
                 engine_bytes.write(serialized_engine)
                 engine_str = engine_bytes.getvalue()
-            print(psutil.Process().memory_info().rss / 1024 / 1024, "MB")
-            # breakpoint()
+
             return TRTInterpreterResult(
                 engine_str,
                 self._input_names,

Original file line number	Diff line number	Diff line change
`@@ -694,7 +694,7 @@ def compile(`
`694`	`694`	`# Move the weights in the state_dict to CPU`
`695`	`695`	`if offload_module_to_cpu:`
`696`	`696`	`deallocate_module(gm, delete_module=False)`
`697`		`- # deallocate_module(exported_program.module(), delete_module=False)`
	`697`	`+ deallocate_module(exported_program.module(), delete_module=False)`
`698`	`698`	`logger.info(`
`699`	`699`	`"The PyTorch model was moved to the CPU to allocate all GPU memory to TensorRT. To retain the model on the GPU, set offload_module_to_cpu=False"`
`700`	`700`	`)`