Remove all unnecessary torch.cuda.empty_cache() and gc.collect() calls (hao-ai-lab#606)

Edenzzzz · web-flow · commit 8f1c5bc2d5cf · 2025-07-09T16:45:20.000-05:00
diff --git a/fastvideo/v1/distributed/parallel_state.py b/fastvideo/v1/distributed/parallel_state.py
@@ -23,7 +23,6 @@
  you can skip the model parallel initialization and destruction steps.
 """
 import contextlib
-import gc
 import os
 import pickle
 import weakref
@@ -1016,15 +1015,6 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
     if shutdown_ray:
         import ray  # Lazy import Ray
         ray.shutdown()
-    gc.collect()
-    from fastvideo.v1.platforms import current_platform
-    if not current_platform.is_cpu():
-        torch.cuda.empty_cache()
-    try:
-        torch._C._host_emptyCache()
-    except AttributeError:
-        logger.warning(
-            "torch._C._host_emptyCache() only available in Pytorch >=2.5")
 
 
 def in_the_same_node_as(pg: ProcessGroup | StatelessProcessGroup,
diff --git a/fastvideo/v1/entrypoints/video_generator.py b/fastvideo/v1/entrypoints/video_generator.py
@@ -6,7 +6,6 @@
 diffusion models.
 """
 
-import gc
 import math
 import os
 import time
@@ -277,5 +276,3 @@ def shutdown(self):
         """
         self.executor.shutdown()
         del self.executor
-        gc.collect()
-        torch.cuda.empty_cache()
diff --git a/fastvideo/v1/models/vaes/common.py b/fastvideo/v1/models/vaes/common.py
@@ -239,7 +239,6 @@ def parallel_tiled_decode(self, z: torch.FloatTensor) -> torch.FloatTensor:
 
         results = torch.cat(local_results, dim=0).contiguous()
         del local_results
-        torch.cuda.empty_cache()
         # first gather size to pad the results
         local_size = torch.tensor([results.size(0)],
                                   device=results.device,
@@ -253,7 +252,7 @@ def parallel_tiled_decode(self, z: torch.FloatTensor) -> torch.FloatTensor:
         padded_results = torch.zeros(max_size, device=results.device)
         padded_results[:results.size(0)] = results
         del results
-        torch.cuda.empty_cache()
+
         # Gather all results
         gathered_dim_metadata = [None] * world_size
         gathered_results = torch.zeros_like(padded_results).repeat(
diff --git a/fastvideo/v1/pipelines/stages/encoding.py b/fastvideo/v1/pipelines/stages/encoding.py
@@ -136,7 +136,6 @@ def forward(
             self.maybe_free_model_hooks()
 
         self.vae.to("cpu")
-        torch.cuda.empty_cache()
 
         return batch
 
diff --git a/fastvideo/v1/pipelines/stages/image_encoding.py b/fastvideo/v1/pipelines/stages/image_encoding.py
@@ -5,8 +5,6 @@
 This module contains implementations of image encoding stages for diffusion pipelines.
 """
 
-import torch
-
 from fastvideo.v1.distributed import get_local_torch_device
 from fastvideo.v1.fastvideo_args import FastVideoArgs
 from fastvideo.v1.forward_context import set_forward_context
@@ -68,7 +66,6 @@ def forward(
 
         if fastvideo_args.use_cpu_offload:
             self.image_encoder.to('cpu')
-            torch.cuda.empty_cache()
 
         return batch
 
diff --git a/fastvideo/v1/training/training_pipeline.py b/fastvideo/v1/training/training_pipeline.py
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-import gc
 import math
 import os
 import time
@@ -706,5 +705,3 @@ def _log_validation(self, transformer, training_args, global_step) -> None:
         # Re-enable gradients for training
         training_args.inference_mode = False
         transformer.train()
-        gc.collect()
-        torch.cuda.empty_cache()
diff --git a/fastvideo/v1/worker/gpu_worker.py b/fastvideo/v1/worker/gpu_worker.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import contextlib
 import faulthandler
-import gc
 import multiprocessing as mp
 import os
 import signal
@@ -69,8 +68,6 @@ def init_device(self) -> None:
         torch.cuda.set_device(self.device)
 
         # _check_if_gpu_supports_dtype(self.model_config.dtype)
-        gc.collect()
-        torch.cuda.empty_cache()
         self.init_gpu_memory = torch.cuda.mem_get_info()[0]
 
         os.environ["MASTER_ADDR"] = "localhost"
@@ -102,9 +99,6 @@ def shutdown(self) -> dict[str, Any]:
         if hasattr(self, 'pipeline') and self.pipeline is not None:
             # Clean up pipeline resources if needed
             pass
-        # Release CUDA resources
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
 
         # Destroy the distributed environment
         cleanup_dist_env_and_memory(shutdown_ray=False)
@@ -133,8 +127,6 @@ def event_loop(self) -> None:
 
                 # Handle regular RPC calls
                 if method_name == 'execute_forward':
-                    gc.collect()
-                    torch.cuda.empty_cache()
                     forward_batch = recv_rpc['kwargs']['forward_batch']
                     fastvideo_args = recv_rpc['kwargs']['fastvideo_args']
                     output_batch = self.execute_forward(forward_batch,