
Commit f5d44fd

Merge pull request #2200 from kohya-ss/feat-faster-safetensors-load
feat: Speeding up loading .safetensors files
2 parents: 419a9c4 + 4568631

18 files changed: +463 −237 lines

README.md

Lines changed: 5 additions & 3 deletions
@@ -13,11 +13,13 @@ For RTX 50 series GPUs, PyTorch 2.8.0 with CUDA 12.8/9 should be used. `requirem
 
 If you are using DeepSpeed, please install DeepSpeed with `pip install deepspeed` (appropriate version is not confirmed yet).
 
-- [FLUX.1 training](#flux1-training)
-- [SD3 training](#sd3-training)
-
 ### Recent Updates
 
+Sep 13, 2025:
+- The loading speed of `.safetensors` files has been improved for SD3, FLUX.1 and Lumina. See [PR #2200](https://github.com/kohya-ss/sd-scripts/pull/2200) for more details.
+  - Model loading can be up to 1.5 times faster.
+  - This is a wide-ranging update, so there may be bugs. Please let us know if you encounter any issues.
+
 Sep 4, 2025:
 - The information about FLUX.1 and SD3/SD3.5 training that was described in the README has been organized and divided into the following documents:
   - [LoRA Training Overview](./docs/train_network.md)
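To put the "up to 1.5 times faster" figure in context, model loading can be timed with the loaders touched by this commit. The sketch below is illustrative only and not part of the repository; the checkpoint path and the model_type value are placeholders, and the actual speedup depends on storage speed and available RAM.

# Rough timing sketch (not part of the repository). Path and model_type are placeholders.
import time
import torch
from library import flux_utils

ckpt = "/path/to/flux1-dev.safetensors"  # placeholder path

start = time.perf_counter()
_, model = flux_utils.load_flow_model(
    ckpt, None, torch.device("cpu"), disable_mmap=True, model_type="flux"
)
print(f"load_flow_model with disable_mmap=True took {time.perf_counter() - start:.1f}s")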

flux_minimal_inference.py

Lines changed: 5 additions & 3 deletions
@@ -456,13 +456,13 @@ def is_fp8(dt):
     # load clip_l (skip for chroma model)
     if args.model_type == "flux":
         logger.info(f"Loading clip_l from {args.clip_l}...")
-        clip_l = flux_utils.load_clip_l(args.clip_l, clip_l_dtype, loading_device)
+        clip_l = flux_utils.load_clip_l(args.clip_l, clip_l_dtype, loading_device, disable_mmap=True)
         clip_l.eval()
     else:
         clip_l = None
 
     logger.info(f"Loading t5xxl from {args.t5xxl}...")
-    t5xxl = flux_utils.load_t5xxl(args.t5xxl, t5xxl_dtype, loading_device)
+    t5xxl = flux_utils.load_t5xxl(args.t5xxl, t5xxl_dtype, loading_device, disable_mmap=True)
     t5xxl.eval()
 
     # if is_fp8(clip_l_dtype):
@@ -471,7 +471,9 @@ def is_fp8(dt):
     # t5xxl = accelerator.prepare(t5xxl)
 
     # DiT
-    is_schnell, model = flux_utils.load_flow_model(args.ckpt_path, None, loading_device, model_type=args.model_type)
+    is_schnell, model = flux_utils.load_flow_model(
+        args.ckpt_path, None, loading_device, disable_mmap=True, model_type=args.model_type
+    )
     model.eval()
     logger.info(f"Casting model to {flux_dtype}")
     model.to(flux_dtype)  # make sure model is dtype
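In the hunks above, `disable_mmap=True` is passed through to the loaders, which forward it to `load_safetensors` in the new `library/safetensors_utils.py` module (its implementation is not shown in this commit view). As a hedged sketch of what a disable_mmap code path generally looks like, assuming only the standard `safetensors` Python API, one might write:

# Illustrative sketch only -- not the repository's actual load_safetensors implementation.
import torch
from safetensors.torch import load, load_file


def load_safetensors_sketch(path: str, device, dtype=None, disable_mmap: bool = False):
    if disable_mmap:
        # Read the whole file into RAM in one sequential pass, then deserialize.
        # This avoids per-tensor page faults at the cost of peak memory.
        with open(path, "rb") as f:
            state_dict = load(f.read())  # returns CPU tensors
        for key, value in state_dict.items():
            state_dict[key] = value.to(device, dtype=dtype)
        return state_dict
    # Default path: memory-mapped load directly onto the target device.
    state_dict = load_file(path, device=str(device))
    if dtype is not None:
        for key, value in state_dict.items():
            state_dict[key] = value.to(dtype)
    return state_dict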

library/custom_offloading_utils.py

Lines changed: 28 additions & 9 deletions
@@ -1,13 +1,28 @@
 from concurrent.futures import ThreadPoolExecutor
+import gc
 import time
 from typing import Optional, Union, Callable, Tuple
 import torch
 import torch.nn as nn
 
-from library.device_utils import clean_memory_on_device
 
+# Keep these functions here for portability, and private to avoid confusion with the ones in device_utils.py
+def _clean_memory_on_device(device: torch.device):
+    r"""
+    Clean memory on the specified device, will be called from training scripts.
+    """
+    gc.collect()
+
+    # device may "cuda" or "cuda:0", so we need to check the type of device
+    if device.type == "cuda":
+        torch.cuda.empty_cache()
+    if device.type == "xpu":
+        torch.xpu.empty_cache()
+    if device.type == "mps":
+        torch.mps.empty_cache()
 
-def synchronize_device(device: torch.device):
+
+def _synchronize_device(device: torch.device):
     if device.type == "cuda":
         torch.cuda.synchronize()
     elif device.type == "xpu":
@@ -71,19 +86,18 @@ def swap_weight_devices_no_cuda(device: torch.device, layer_to_cpu: nn.Module, l
         if hasattr(module_to_cpu, "weight") and module_to_cpu.weight is not None:
             weight_swap_jobs.append((module_to_cpu, module_to_cuda, module_to_cpu.weight.data, module_to_cuda.weight.data))
 
-
     # device to cpu
     for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
         module_to_cpu.weight.data = cuda_data_view.data.to("cpu", non_blocking=True)
 
-    synchronize_device(device)
+    _synchronize_device(device)
 
     # cpu to device
     for module_to_cpu, module_to_cuda, cuda_data_view, cpu_data_view in weight_swap_jobs:
         cuda_data_view.copy_(module_to_cuda.weight.data, non_blocking=True)
         module_to_cuda.weight.data = cuda_data_view
 
-    synchronize_device(device)
+    _synchronize_device(device)
 
 
 def weighs_to_device(layer: nn.Module, device: torch.device):
@@ -152,12 +166,15 @@ def _wait_blocks_move(self, block_idx):
 # Gradient tensors
 _grad_t = Union[tuple[torch.Tensor, ...], torch.Tensor]
 
+
 class ModelOffloader(Offloader):
     """
     supports forward offloading
     """
 
-    def __init__(self, blocks: Union[list[nn.Module], nn.ModuleList], blocks_to_swap: int, device: torch.device, debug: bool = False):
+    def __init__(
+        self, blocks: Union[list[nn.Module], nn.ModuleList], blocks_to_swap: int, device: torch.device, debug: bool = False
+    ):
         super().__init__(len(blocks), blocks_to_swap, device, debug)
 
         # register backward hooks
@@ -172,7 +189,9 @@ def __del__(self):
         for handle in self.remove_handles:
             handle.remove()
 
-    def create_backward_hook(self, blocks: Union[list[nn.Module], nn.ModuleList], block_index: int) -> Optional[Callable[[nn.Module, _grad_t, _grad_t], Union[None, _grad_t]]]:
+    def create_backward_hook(
+        self, blocks: Union[list[nn.Module], nn.ModuleList], block_index: int
+    ) -> Optional[Callable[[nn.Module, _grad_t, _grad_t], Union[None, _grad_t]]]:
         # -1 for 0-based index
         num_blocks_propagated = self.num_blocks - block_index - 1
         swapping = num_blocks_propagated > 0 and num_blocks_propagated <= self.blocks_to_swap
@@ -213,8 +232,8 @@ def prepare_block_devices_before_forward(self, blocks: Union[list[nn.Module], nn
             b.to(self.device)  # move block to device first
             weighs_to_device(b, torch.device("cpu"))  # make sure weights are on cpu
 
-        synchronize_device(self.device)
-        clean_memory_on_device(self.device)
+        _synchronize_device(self.device)
+        _clean_memory_on_device(self.device)
 
     def wait_for_block(self, block_idx: int):
         if self.blocks_to_swap is None or self.blocks_to_swap == 0:
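The changes above mainly give this module private copies of the device helpers (`_clean_memory_on_device`, `_synchronize_device`) so it no longer depends on `library.device_utils`, and reflow a few long signatures. For orientation, a hedged usage sketch of the offloader API visible in this diff follows; the stand-in blocks, the sizes, and the omitted scheduling of finished blocks back to CPU are illustrative assumptions, not the repository's training loop.

# Hedged usage sketch; stand-in blocks and values, not the actual training loop.
import torch
import torch.nn as nn
from library.custom_offloading_utils import ModelOffloader

device = torch.device("cuda")
blocks = nn.ModuleList([nn.Linear(1024, 1024) for _ in range(8)])  # stand-in transformer blocks

offloader = ModelOffloader(blocks, blocks_to_swap=4, device=device, debug=False)
offloader.prepare_block_devices_before_forward(blocks)  # place weights before the forward pass

x = torch.randn(1, 1024, device=device)
for idx, block in enumerate(blocks):
    offloader.wait_for_block(idx)  # wait until this block's weights are on the device
    x = block(x)
    # (the real loop also submits the finished block to be swapped back to CPU; omitted here)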

library/device_utils.py

Lines changed: 20 additions & 2 deletions
@@ -1,7 +1,9 @@
 import functools
 import gc
+from typing import Optional, Union
 
 import torch
+
 try:
     # intel gpu support for pytorch older than 2.5
     # ipex is not needed after pytorch 2.5
@@ -36,12 +38,15 @@ def clean_memory():
         torch.mps.empty_cache()
 
 
-def clean_memory_on_device(device: torch.device):
+def clean_memory_on_device(device: Optional[Union[str, torch.device]]):
     r"""
     Clean memory on the specified device, will be called from training scripts.
     """
     gc.collect()
-
+    if device is None:
+        return
+    if isinstance(device, str):
+        device = torch.device(device)
     # device may "cuda" or "cuda:0", so we need to check the type of device
     if device.type == "cuda":
         torch.cuda.empty_cache()
@@ -51,6 +56,19 @@ def clean_memory_on_device(device: torch.device):
         torch.mps.empty_cache()
 
 
+def synchronize_device(device: Optional[Union[str, torch.device]]):
+    if device is None:
+        return
+    if isinstance(device, str):
+        device = torch.device(device)
+    if device.type == "cuda":
+        torch.cuda.synchronize()
+    elif device.type == "xpu":
+        torch.xpu.synchronize()
+    elif device.type == "mps":
+        torch.mps.synchronize()
+
+
 @functools.lru_cache(maxsize=None)
 def get_preferred_device() -> torch.device:
     r"""

library/flux_train_utils.py

Lines changed: 2 additions & 1 deletion
@@ -16,10 +16,11 @@
 
 from library import flux_models, flux_utils, strategy_base, train_util
 from library.device_utils import init_ipex, clean_memory_on_device
+from library.safetensors_utils import mem_eff_save_file
 
 init_ipex()
 
-from .utils import setup_logging, mem_eff_save_file
+from .utils import setup_logging
 
 setup_logging()
 import logging

library/flux_utils.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 logger = logging.getLogger(__name__)
 
 from library import flux_models
-from library.utils import load_safetensors
+from library.safetensors_utils import load_safetensors
 
 MODEL_VERSION_FLUX_V1 = "flux1"
 MODEL_NAME_DEV = "dev"
@@ -124,7 +124,7 @@ def load_flow_model(
     logger.info(f"Loading state dict from {ckpt_path}")
     sd = {}
     for ckpt_path in ckpt_paths:
-        sd.update(load_safetensors(ckpt_path, device=str(device), disable_mmap=disable_mmap, dtype=dtype))
+        sd.update(load_safetensors(ckpt_path, device=device, disable_mmap=disable_mmap, dtype=dtype))
 
     # convert Diffusers to BFL
     if is_diffusers:

library/lumina_train_util.py

Lines changed: 2 additions & 1 deletion
@@ -18,10 +18,11 @@
 from library.flux_models import AutoEncoder
 from library.device_utils import init_ipex, clean_memory_on_device
 from library.sd3_train_utils import FlowMatchEulerDiscreteScheduler
+from library.safetensors_utils import mem_eff_save_file
 
 init_ipex()
 
-from .utils import setup_logging, mem_eff_save_file
+from .utils import setup_logging
 
 setup_logging()
 import logging

library/lumina_util.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 
 from library.utils import setup_logging
 from library import lumina_models, flux_models
-from library.utils import load_safetensors
+from library.safetensors_utils import load_safetensors
 import logging
 
 setup_logging()
