@@ -41,6 +41,7 @@ def __init__(
         self.offload_mode = None
         self.model_names = []
         self._offload_param_dict = {}
+        self.offload_to_disk = False
 
     @classmethod
     def from_pretrained(cls, model_path_or_config: str | BaseConfig) -> "BasePipeline":
@@ -228,19 +229,23 @@ def eval(self):
             model.eval()
         return self
 
-    def enable_cpu_offload(self, offload_mode: str):
-        valid_offload_mode = ("cpu_offload", "sequential_cpu_offload")
+    def enable_cpu_offload(self, offload_mode: str | None, offload_to_disk: bool = False):
+        valid_offload_mode = ("cpu_offload", "sequential_cpu_offload", "disable", None)
         if offload_mode not in valid_offload_mode:
             raise ValueError(f"offload_mode must be one of {valid_offload_mode}, but got {offload_mode}")
         if self.device == "cpu" or self.device == "mps":
             logger.warning("must set a non-CPU device for the pipeline before calling enable_cpu_offload")
             return
-        if offload_mode == "cpu_offload":
+        if offload_mode is None or offload_mode == "disable":
+            self._disable_offload()
+        elif offload_mode == "cpu_offload":
             self._enable_model_cpu_offload()
         elif offload_mode == "sequential_cpu_offload":
             self._enable_sequential_cpu_offload()
+        self.offload_to_disk = offload_to_disk
 
-    def _enable_model_cpu_offload(self):
+
+    def _enable_model_cpu_offload(self):
         for model_name in self.model_names:
             model = getattr(self, model_name)
             if model is not None:
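To make the new control surface concrete, here is a hedged usage sketch of the three modes as this diff defines them (pipeline construction and device handling are placeholders, not the project's confirmed API):

```python
pipe = BasePipeline.from_pretrained("path/to/model_or_config")
pipe.device = "cuda"  # offload needs a non-CPU device; how the device is set may differ

pipe.enable_cpu_offload("cpu_offload")  # whole models hop between CPU and GPU per use
pipe.enable_cpu_offload("sequential_cpu_offload", offload_to_disk=True)  # finer-grained, with the new disk-spill flag
pipe.enable_cpu_offload("disable")  # or None: undo offloading and move models back to the device
```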
@@ -253,13 +258,23 @@ def _enable_sequential_cpu_offload(self):
             if model is not None:
                 enable_sequential_cpu_offload(model, self.device)
         self.offload_mode = "sequential_cpu_offload"
+
+    def _disable_offload(self):
+        self.offload_mode = None
+        self._offload_param_dict = {}
+        for model_name in self.model_names:
+            model = getattr(self, model_name)
+            if model is not None:
+                model.to(self.device)
+
 
     def enable_fp8_autocast(
         self, model_names: List[str], compute_dtype: torch.dtype = torch.bfloat16, use_fp8_linear: bool = False
     ):
         for model_name in model_names:
             model = getattr(self, model_name)
             if model is not None:
+                model.to(device=self.device, dtype=torch.float8_e4m3fn)
                 enable_fp8_autocast(model, compute_dtype, use_fp8_linear)
         self.fp8_autocast_enabled = True
@@ -282,10 +297,26 @@ def load_models_to_device(self, load_model_names: List[str] | None = None):
         # load the needed models to device
         for model_name in load_model_names:
             model = getattr(self, model_name)
+            if model is None:
+                raise ValueError(f"model {model_name} is not loaded; it may have been destroyed by model_lifecycle_finish with offload_to_disk=True")
             if model is not None and (p := next(model.parameters(), None)) is not None and p.device.type != self.device:
                 model.to(self.device)
         # refresh the cuda cache
         empty_cache()
 
+    def model_lifecycle_finish(self, model_names: List[str] | None = None):
+        # With disk offload enabled, finished models can be reloaded from disk later, so drop them from memory now.
+        if not self.offload_to_disk or self.offload_mode is None:
+            return
+        for model_name in model_names or []:
+            model = getattr(self, model_name)
+            del model
+            if model_name in self._offload_param_dict:
+                del self._offload_param_dict[model_name]
+            setattr(self, model_name, None)
+            logger.info(f"model {model_name} has been deleted from memory")
+        empty_cache()
+
+
     def compile(self):
         raise NotImplementedError(f"{self.__class__.__name__} does not support compile")
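The intended call pattern, as I read this diff, is to release each model after its last use in a run; once a model is finished, `load_models_to_device` will raise for it. A hedged sketch with hypothetical stage and model names:

```python
pipe.enable_cpu_offload("sequential_cpu_offload", offload_to_disk=True)

pipe.load_models_to_device(["text_encoder"])
prompt_emb = ...  # encode the prompt (details elided)
pipe.model_lifecycle_finish(["text_encoder"])  # text encoder no longer needed this run

pipe.load_models_to_device(["dit"])  # would raise if "dit" had already been finished
latents = ...  # run denoising (details elided)
pipe.model_lifecycle_finish(["dit"])
```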