|
@@ -42,6 +42,7 @@
 from ..quantizers.quantization_config import QuantizationMethod
 from ..utils import (
     CONFIG_NAME,
+    FLASHPACK_WEIGHTS_NAME,
     FLAX_WEIGHTS_NAME,
     HF_ENABLE_PARALLEL_LOADING,
     SAFE_WEIGHTS_INDEX_NAME,
|
@@ -55,6 +56,7 @@
     is_accelerate_available,
     is_bitsandbytes_available,
     is_bitsandbytes_version,
+    is_flashpack_available,
     is_peft_available,
     is_torch_version,
     logging,
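Note for reviewers: `FLASHPACK_WEIGHTS_NAME` is imported from `..utils` above, but its definition sits outside this diff. In diffusers, weight-file names live next to `WEIGHTS_NAME` and `SAFETENSORS_WEIGHTS_NAME` in `utils/constants.py`; a hypothetical sketch of the missing counterpart (the exact filename value is an assumption, not shown in the PR):

```python
# Hypothetical addition to src/diffusers/utils/constants.py.
# The constant name comes from this diff; the filename value is a guess.
FLASHPACK_WEIGHTS_NAME = "diffusion_pytorch_model.flashpack"
```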
@@ -913,6 +915,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P |
             disable_mmap (`bool`, *optional*, defaults to `False`):
                 Whether to disable mmap when loading a Safetensors model. This option can perform better when the model
                 is on a network mount or hard drive, which may not handle the seeky-ness of mmap very well.
+            use_flashpack (`bool`, *optional*, defaults to `False`):
+                If set to `True`, the model is loaded from `flashpack` weights.

         > [!TIP]
         > To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in
         > with `hf auth login`. You can also activate the special
         > ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a
         > firewalled environment.
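A minimal usage sketch of the new flag, assuming a repo that ships a FlashPack weights file (the model class and checkpoint id below are placeholders, not part of this PR):

```python
from diffusers import AutoencoderKL

# Placeholder checkpoint id: the repo must contain the file named by
# FLASHPACK_WEIGHTS_NAME for the new resolution branch to find it.
vae = AutoencoderKL.from_pretrained(
    "user/my-vae-with-flashpack-weights",  # hypothetical repo id
    use_flashpack=True,                    # new kwarg added in this PR
)
```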
@@ -957,6 +961,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P |
         dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None)
         disable_mmap = kwargs.pop("disable_mmap", False)
         parallel_config: Optional[Union[ParallelConfig, ContextParallelConfig]] = kwargs.pop("parallel_config", None)
+        use_flashpack = kwargs.pop("use_flashpack", False)

         is_parallel_loading_enabled = HF_ENABLE_PARALLEL_LOADING
         if is_parallel_loading_enabled and not low_cpu_mem_usage:
@@ -1185,6 +1190,30 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P |
                 subfolder=subfolder or "",
                 dduf_entries=dduf_entries,
             )
+        elif use_flashpack:
+            try:
+                resolved_model_file = _get_model_file(
+                    pretrained_model_name_or_path,
+                    weights_name=FLASHPACK_WEIGHTS_NAME,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    local_files_only=local_files_only,
+                    token=token,
+                    revision=revision,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                    commit_hash=commit_hash,
+                    dduf_entries=dduf_entries,
+                )
+            except IOError as e:
+                logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}")
+                if not allow_pickle:
+                    raise
+                logger.warning(
+                    "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
+                )
+                # Fall through to the default weights loading below, mirroring the
+                # error-handling pattern of the safetensors branch.
         elif use_safetensors:
             try:
                 resolved_model_file = _get_model_file(
@@ -1248,6 +1277,32 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P |
         with ContextManagers(init_contexts):
             model = cls.from_config(config, **unused_kwargs)

+        if use_flashpack:
+            if not is_flashpack_available():
+                # Fail loudly instead of silently returning a model without weights.
+                raise ImportError(
+                    "`use_flashpack=True` requires the `flashpack` package, but it could not be imported."
+                )
+            import flashpack
+
+            flashpack.mixin.assign_from_file(
+                model=model,
+                path=resolved_model_file[0],
+                device=None if device_map is None else device_map[""],
+                # The remaining `assign_from_file` options (silent/strict flags,
+                # ignore_names/prefixes/suffixes, num_streams, chunk_bytes,
+                # distributed-loading rank/world_size, coerce_dtype, ...) are
+                # left at their library defaults for now.
+            )
+            return model
+
         if dtype_orig is not None:
             torch.set_default_dtype(dtype_orig)
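`is_flashpack_available()` is likewise imported but not defined in this diff. diffusers gates optional backends through a shared `_is_package_available` helper in `utils/import_utils.py`; a self-contained sketch of that pattern (illustrative, not the PR's actual code):

```python
import importlib.util
from importlib.metadata import PackageNotFoundError, version


def is_flashpack_available() -> bool:
    """Return True only if `flashpack` is importable and has installed metadata."""
    if importlib.util.find_spec("flashpack") is None:
        return False
    try:
        version("flashpack")  # raises if the distribution is not installed
    except PackageNotFoundError:
        return False
    return True
```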
|
|