huggingface · Wauplin · Dec 13, 2024 · Dec 4, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md
@@ -6,6 +6,58 @@ rendered properly in your Markdown viewer.
 
 `huggingface_hub` contains helpers to help ML libraries serialize models weights in a standardized way. This part of the lib is still under development and will be improved in future releases. The goal is to harmonize how weights are serialized on the Hub, both to remove code duplication across libraries and to foster conventions on the Hub.
 
+## DDUF file format
+
+DDUF is a file format designed for diffusers models. It allows saving all the information to run a model in a single file. This work is inspired by the GGUF format. `huggingface_hub` provides helpers to save and load DDUF files, ensuring the file format is respected.
+
+<Tip warning={true}>
+
+This is a very early version of the parser. The API and implementation can evolve in the near future.
+
+The parser currently does very little validation. For more details about the file format, check out https://github.com/huggingface/huggingface.js/tree/main/packages/dduf.
+
+</Tip>
+
+### How to write a DDUF file?
+
+```python
+>>> from huggingface_hub import write_dduf_file
+>>> write_dduf_file("FLUX.1-dev.dduf", diffuser_path="path/to/FLUX.1-dev")
+```
+
+### How to read a DDUF file?
+
+```python
+>>> import json
+>>> import safetensors.torch
+>>> from huggingface_hub import read_dduf_file
+
+# Read DDUF metadata
+>>> dduf_entries = read_dduf_file("FLUX.1-dev.dduf")
+
+# Returns a mapping filename <> DDUFEntry
+>>> dduf_entries["model_index.json"]
+DDUFEntry(filename='model_index.json', length=587, offset=66)
+
+# Load model index as JSON
+>>> json.loads(dduf_entries["model_index.json"].read_text())
+{'_class_name': 'FluxPipeline', '_diffusers_version': '0.32.0.dev0', '_name_or_path': 'black-forest-labs/FLUX.1-dev', 'scheduler': ['diffusers', 'FlowMatchEulerDiscreteScheduler'], 'text_encoder': ['transformers', 'CLIPTextModel'], 'text_encoder_2': ['transformers', 'T5EncoderModel'], 'tokenizer': ['transformers', 'CLIPTokenizer'], 'tokenizer_2': ['transformers', 'T5TokenizerFast'], 'transformer': ['diffusers', 'FluxTransformer2DModel'], 'vae': ['diffusers', 'AutoencoderKL']}
+
+# Load VAE weights using safetensors
+>>> with dduf_entries["vae/diffusion_pytorch_model.safetensors"].as_mmap() as mm:
+...     state_dict = safetensors.torch.load(mm)
+```
+
+### Helpers
+
+[[autodoc]] huggingface_hub.write_dduf_file
+
+[[autodoc]] huggingface_hub.read_dduf_file
+
+[[autodoc]] huggingface_hub.DDUFEntry
+
+[[autodoc]] huggingface_hub.errors.DDUFCorruptedFileError
+
 ## Save torch state dict
 
 The main helper of the `serialization` module takes a torch `nn.Module` as input and saves it to disk. It handles the logic to save shared tensors (see [safetensors explanation](https://huggingface.co/docs/safetensors/torch_shared_tensors)) as well as logic to split the state dictionary into shards, using [`split_torch_state_dict_into_shards`] under the hood. At the moment, only `torch` framework is supported.

diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py
@@ -468,6 +468,11 @@
         "split_tf_state_dict_into_shards",
         "split_torch_state_dict_into_shards",
     ],
+    "serialization._dduf": [
+        "DDUFEntry",
+        "read_dduf_file",
+        "write_dduf_file",
+    ],
     "utils": [
         "CacheNotFound",
         "CachedFileInfo",
@@ -995,6 +1000,11 @@ def __dir__():
         split_tf_state_dict_into_shards,  # noqa: F401
         split_torch_state_dict_into_shards,  # noqa: F401
     )
+    from .serialization._dduf import (
+        DDUFEntry,  # noqa: F401
+        read_dduf_file,  # noqa: F401
+        write_dduf_file,  # noqa: F401
+    )
     from .utils import (
         CachedFileInfo,  # noqa: F401
         CachedRepoInfo,  # noqa: F401

diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py
@@ -308,3 +308,10 @@ class BadRequestError(HfHubHTTPError, ValueError):
     huggingface_hub.utils._errors.BadRequestError: Bad request for check endpoint: {details} (Request ID: XXX)
     ```
     """
+
+
+# DDUF file format ERROR
+
+
+class DDUFCorruptedFileError(Exception):
+    """Raised when a DDUF file is corrupted (i.e. doesn't follow the DDUF format)."""
diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py
@@ -0,0 +1,142 @@
+import logging
+import mmap
+import shutil
+import zipfile
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, Generator, Union
+
+from ..errors import DDUFCorruptedFileError
+
+
+logger = logging.getLogger(__name__)
+
+# File extensions (compared against `Path.suffix`) that `write_dduf_file` accepts into a DDUF archive.
+DDUF_ALLOWED_ENTRIES = {".json", ".gguf", ".txt", ".safetensors"}
+
+
+@dataclass
+class DDUFEntry:
+    """A single file stored inside a DDUF archive.
+
+    An entry only records where the file's bytes live inside the archive; the bytes themselves are
+    read on demand. See [`read_dduf_file`] for how to read a DDUF file.
+
+    Attributes:
+        filename (str):
+            The name of the file in the DDUF archive.
+        length (int):
+            The length of the file in the DDUF archive.
+        offset (int):
+            The offset of the file in the DDUF archive.
+        dduf_path (Path):
+            The path to the DDUF archive (for internal use). Excluded from the repr.
+    """
+
+    filename: str
+    length: int
+    offset: int
+
+    dduf_path: Path = field(repr=False)
+
+    @contextmanager
+    def as_mmap(self) -> Generator[bytes, None, None]:
+        """Yield the entry's bytes, read through a memory map of the archive.
+
+        The whole archive is mapped read-only and the `[offset, offset + length)` range is sliced
+        out of the mapping. Useful to load safetensors directly from the file.
+
+        Example:
+            ```py
+            >>> import safetensors.torch
+            >>> with entry.as_mmap() as mm:
+            ...     tensors = safetensors.torch.load(mm)
+            ```
+        """
+        start = self.offset
+        stop = start + self.length
+        with self.dduf_path.open("rb") as archive:
+            # length=0 maps the entire file; the mapping is closed when the caller leaves the `with`.
+            mapped = mmap.mmap(archive.fileno(), length=0, access=mmap.ACCESS_READ)
+            with mapped:
+                yield mapped[start:stop]
+
+    def read_text(self, encoding="utf-8") -> str:
+        """Decode the entry's bytes with `encoding` and return them as a string.
+
+        Useful for '.txt' and '.json' entries.
+        """
+        with self.dduf_path.open("rb") as archive:
+            archive.seek(self.offset)
+            raw = archive.read(self.length)
+        return raw.decode(encoding=encoding)
+
+
+def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]:
+    """
+    Read a DDUF file and return a dictionary of entries.
+
+    Only the metadata is read, the data is not loaded in memory.
+
+    Args:
+        dduf_path (`str` or `Path`):
+            The path to the DDUF file to read.
+
+    Returns:
+        `Dict[str, DDUFEntry]`:
+            A dictionary of [`DDUFEntry`] indexed by filename.
+
+    Raises:
+        - [`DDUFCorruptedFileError`]: If the DDUF file is corrupted (i.e. doesn't follow the DDUF format).
+    """
+    entries: Dict[str, DDUFEntry] = {}
+    dduf_path = Path(dduf_path)
+    logger.info("Reading DDUF file %s", dduf_path)
+    with zipfile.ZipFile(str(dduf_path), "r") as zf:
+        for info in zf.infolist():
+            logger.debug("Reading entry %s", info.filename)
+            if info.compress_type != zipfile.ZIP_STORED:
+                raise DDUFCorruptedFileError("Data must not be compressed in DDUF file.")
+
+            # Locate the data by parsing the entry's local file header rather than relying on the
+            # private `ZipInfo._end_offset` attribute, which is not available on every Python release
+            # and points past a trailing data descriptor when one is present.
+            offset = _get_data_offset(zf, info)
+
+            entries[info.filename] = DDUFEntry(
+                filename=info.filename, offset=offset, length=info.file_size, dduf_path=dduf_path
+            )
+    logger.info("Done reading DDUF file %s. Found %d entries", dduf_path, len(entries))
+    return entries
+
+
+def _get_data_offset(zf: zipfile.ZipFile, info: zipfile.ZipInfo) -> int:
+    """Return the absolute offset in the archive at which the data of `info` starts.
+
+    The data immediately follows the entry's local file header, which is made of 30 fixed bytes
+    followed by the filename and the extra field (where the ZIP64 information lives). Both variable
+    lengths are stored in the fixed part, so they must be read from the file: the values recorded in
+    the central directory may differ from the local ones.
+
+    Raises:
+        - [`DDUFCorruptedFileError`]: If the archive is not open or the local file header is truncated or invalid.
+    """
+    if zf.fp is None:
+        raise DDUFCorruptedFileError("ZipFile object must be opened in read mode.")
+
+    header_offset = info.header_offset
+    zf.fp.seek(header_offset)
+    local_header = zf.fp.read(30)
+    # A local file header always starts with the signature "PK\x03\x04".
+    if len(local_header) < 30 or local_header[:4] != b"PK\x03\x04":
+        raise DDUFCorruptedFileError(f"Invalid local file header for entry '{info.filename}'.")
+
+    # Bytes 26-27: filename length. Bytes 28-29: extra field length. Both little-endian.
+    filename_len = int.from_bytes(local_header[26:28], "little")
+    extra_field_len = int.from_bytes(local_header[28:30], "little")
+    return header_offset + 30 + filename_len + extra_field_len
+
+
+def write_dduf_file(dduf_path: Union[str, Path], diffuser_path: Union[str, Path]) -> None:
+    """
+    Write a DDUF file from a diffusers folder.
+
+    A DDUF file is simply a ZIP archive with a few constraints (force ZIP64, no compression, only certain files).
+
+    Files are silently skipped (with a debug log) when their extension is not listed in
+    `DDUF_ALLOWED_ENTRIES` or when they are nested deeper than one directory below `diffuser_path`.
+
+    Args:
+        dduf_path (`str` or `Path`):
+            The path to the DDUF file to write.
+        diffuser_path (`str` or `Path`):
+            The path to the folder containing the diffusers model.
+    """
+    # TODO: update method signature.
+    #       DDUF filename should be inferred as much as possible from high-level info (precision, model, etc.) to ensure consistency.
+    #       Example: "stable-diffusion-3.5-Q4-BNB.dduf"
+    #       See https://github.com/huggingface/diffusers/pull/10037#discussion_r1862275730.
+    logger.info("Writing DDUF file %s from folder %s", dduf_path, diffuser_path)
+    diffuser_path = Path(diffuser_path)
+    with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive:
+        for path in diffuser_path.glob("**/*"):
+            if path.is_dir():
+                logger.debug("Skipping directory %s", path)
+                continue
+            if path.suffix not in DDUF_ALLOWED_ENTRIES:
+                logger.debug("Skipping file %s (file type not allowed)", path)
+                continue
+            path_in_archive = path.relative_to(diffuser_path)
+            # At most one directory level is allowed: "file.json" (1 part) and "dir/file.json" (2 parts)
+            # are valid, "dir/sub/file.json" (3 parts) is a nested directory and must be skipped.
+            if len(path_in_archive.parts) > 2:
+                logger.debug("Skipping file %s (nested directories not allowed)", path)
+                continue
+            logger.debug("Adding file %s", path)
+            # force_zip64 keeps the header layout the same whatever the file size is.
+            with archive.open(str(path_in_archive), "w", force_zip64=True) as f:
+                with path.open("rb") as src:
+                    # taken from zipfile source code
+                    # TODO: optimize this for large files
+                    shutil.copyfileobj(src, f, 1024 * 8)  # type: ignore[misc]
+    logger.info("Done writing DDUF file %s", dduf_path)