diff --git a/docs/source/en/package_reference/serialization.md b/docs/source/en/package_reference/serialization.md index 3bf0c6ba50..f45ad58cd8 100644 --- a/docs/source/en/package_reference/serialization.md +++ b/docs/source/en/package_reference/serialization.md @@ -6,7 +6,117 @@ rendered properly in your Markdown viewer. `huggingface_hub` provides helpers to save and load ML model weights in a standardized way. This part of the library is still under development and will be improved in future releases. The goal is to harmonize how weights are saved and loaded across the Hub, both to remove code duplication across libraries and to establish consistent conventions. -## Saving +## DDUF file format + +DDUF is a file format designed for diffusion models. It allows saving all the information to run a model in a single file. This work is inspired by the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) format. `huggingface_hub` provides helpers to save and load DDUF files, ensuring the file format is respected. + + + +This is a very early version of the parser. The API and implementation can evolve in the near future. + +The parser currently does very little validation. For more details about the file format, check out https://github.com/huggingface/huggingface.js/tree/main/packages/dduf. + + + +### How to write a DDUF file? + +Here is how to export a folder containing different parts of a diffusion model using [`export_folder_as_dduf`]: + +```python +# Export a folder as a DDUF file +>>> from huggingface_hub import export_folder_as_dduf +>>> export_folder_as_dduf("FLUX.1-dev.dduf", folder_path="path/to/FLUX.1-dev") +``` + +For more flexibility, you can use [`export_entries_as_dduf`] and pass a list of files to include in the final DDUF file: + +```python +# Export specific files from the local disk. +>>> from huggingface_hub import export_entries_as_dduf +>>> export_entries_as_dduf( +... dduf_path="stable-diffusion-v1-4-FP16.dduf", +... entries=[ # List entries to add to the DDUF file (here, only FP16 weights) +... ("model_index.json", "path/to/model_index.json"), +... ("vae/config.json", "path/to/vae/config.json"), +... ("vae/diffusion_pytorch_model.fp16.safetensors", "path/to/vae/diffusion_pytorch_model.fp16.safetensors"), +... ("text_encoder/config.json", "path/to/text_encoder/config.json"), +... ("text_encoder/model.fp16.safetensors", "path/to/text_encoder/model.fp16.safetensors"), +... # ... add more entries here +... ] +... ) +``` + +The `entries` parameter also supports passing an iterable of paths or bytes. This can prove useful if you have a loaded model and want to serialize it directly into a DDUF file instead of having to serialize each component to disk first and then as a DDUF file. Here is an example of how a `StableDiffusionPipeline` can be serialized as DDUF: + + +```python +# Export state_dicts one by one from a loaded pipeline +>>> from diffusers import DiffusionPipeline +>>> from typing import Generator, Tuple +>>> import safetensors.torch +>>> from huggingface_hub import export_entries_as_dduf +>>> pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") +... # ... do some work with the pipeline + +>>> def as_entries(pipe: DiffusionPipeline) -> Generator[Tuple[str, bytes], None, None]: +... # Build a generator that yields the entries to add to the DDUF file. +... # The first element of the tuple is the filename in the DDUF archive (must use UNIX separator!). The second element is the content of the file. +... 
# Entries will be evaluated lazily when the DDUF file is created (only 1 entry is loaded in memory at a time) +... yield "vae/config.json", pipe.vae.to_json_string().encode() +... yield "vae/diffusion_pytorch_model.safetensors", safetensors.torch.save(pipe.vae.state_dict()) +... yield "text_encoder/config.json", pipe.text_encoder.config.to_json_string().encode() +... yield "text_encoder/model.safetensors", safetensors.torch.save(pipe.text_encoder.state_dict()) +... # ... add more entries here + +>>> export_entries_as_dduf(dduf_path="stable-diffusion-v1-4.dduf", entries=as_entries(pipe)) +``` + +**Note:** in practice, `diffusers` provides a method to directly serialize a pipeline in a DDUF file. The snippet above is only meant as an example. + +### How to read a DDUF file? + +```python +>>> import json +>>> import safetensors.torch +>>> from huggingface_hub import read_dduf_file + +# Read DDUF metadata +>>> dduf_entries = read_dduf_file("FLUX.1-dev.dduf") + +# Returns a mapping filename <> DDUFEntry +>>> dduf_entries["model_index.json"] +DDUFEntry(filename='model_index.json', offset=66, length=587) + +# Load model index as JSON +>>> json.loads(dduf_entries["model_index.json"].read_text()) +{'_class_name': 'FluxPipeline', '_diffusers_version': '0.32.0.dev0', '_name_or_path': 'black-forest-labs/FLUX.1-dev', 'scheduler': ['diffusers', 'FlowMatchEulerDiscreteScheduler'], 'text_encoder': ['transformers', 'CLIPTextModel'], 'text_encoder_2': ['transformers', 'T5EncoderModel'], 'tokenizer': ['transformers', 'CLIPTokenizer'], 'tokenizer_2': ['transformers', 'T5TokenizerFast'], 'transformer': ['diffusers', 'FluxTransformer2DModel'], 'vae': ['diffusers', 'AutoencoderKL']} + +# Load VAE weights using safetensors +>>> with dduf_entries["vae/diffusion_pytorch_model.safetensors"].as_mmap() as mm: +... state_dict = safetensors.torch.load(mm) +``` + +### Helpers + +[[autodoc]] huggingface_hub.export_entries_as_dduf + +[[autodoc]] huggingface_hub.export_folder_as_dduf + +[[autodoc]] huggingface_hub.read_dduf_file + +[[autodoc]] huggingface_hub.DDUFEntry + +### Errors + +[[autodoc]] huggingface_hub.errors.DDUFError + +[[autodoc]] huggingface_hub.errors.DDUFCorruptedFileError + +[[autodoc]] huggingface_hub.errors.DDUFExportError + +[[autodoc]] huggingface_hub.errors.DDUFInvalidEntryNameError + +## Saving tensors The main helper of the `serialization` module takes a torch `nn.Module` as input and saves it to disk. It handles the logic to save shared tensors (see [safetensors explanation](https://huggingface.co/docs/safetensors/torch_shared_tensors)) as well as logic to split the state dictionary into shards, using [`split_torch_state_dict_into_shards`] under the hood. At the moment, only `torch` framework is supported. @@ -37,7 +147,7 @@ This is the underlying factory from which each framework-specific helper is deri [[autodoc]] huggingface_hub.split_state_dict_into_shards_factory -## Loading +## Loading tensors The loading helpers support both single-file and sharded checkpoints in either safetensors or pickle format. [`load_torch_model`] takes a `nn.Module` and a checkpoint path (either a single file or a directory) as input and load the weights into the model. 
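+
+As a rough sketch of the intended usage (assuming the saving counterpart is [`save_torch_model`]; keyword arguments are omitted, see the API reference below for full signatures):
+
+```python
+>>> from huggingface_hub import load_torch_model, save_torch_model
+>>> model = ...  # a torch.nn.Module, already instantiated
+# Save the model to a directory (sharded automatically if needed)
+>>> save_torch_model(model, "path/to/checkpoint")
+# Later, load the weights back into a freshly initialized model
+>>> load_torch_model(model, "path/to/checkpoint")
+```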
@@ -50,7 +160,7 @@ The loading helpers support both single-file and sharded checkpoints in either s [[autodoc]] huggingface_hub.load_state_dict_from_file -## Helpers +## Tensors helpers ### get_torch_storage_id diff --git a/src/huggingface_hub/__init__.py b/src/huggingface_hub/__init__.py index 733b6568f5..b71fa582c6 100644 --- a/src/huggingface_hub/__init__.py +++ b/src/huggingface_hub/__init__.py @@ -469,6 +469,12 @@ "split_tf_state_dict_into_shards", "split_torch_state_dict_into_shards", ], + "serialization._dduf": [ + "DDUFEntry", + "export_entries_as_dduf", + "export_folder_as_dduf", + "read_dduf_file", + ], "utils": [ "CacheNotFound", "CachedFileInfo", @@ -997,6 +1003,12 @@ def __dir__(): split_tf_state_dict_into_shards, # noqa: F401 split_torch_state_dict_into_shards, # noqa: F401 ) + from .serialization._dduf import ( + DDUFEntry, # noqa: F401 + export_entries_as_dduf, # noqa: F401 + export_folder_as_dduf, # noqa: F401 + read_dduf_file, # noqa: F401 + ) from .utils import ( CachedFileInfo, # noqa: F401 CachedRepoInfo, # noqa: F401 diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py index 1dae6ddf97..226c8bb400 100644 --- a/src/huggingface_hub/errors.py +++ b/src/huggingface_hub/errors.py @@ -308,3 +308,22 @@ class BadRequestError(HfHubHTTPError, ValueError): huggingface_hub.utils._errors.BadRequestError: Bad request for check endpoint: {details} (Request ID: XXX) ``` """ + + +# DDUF file format ERROR + + +class DDUFError(Exception): + """Base exception for errors related to the DDUF format.""" + + +class DDUFCorruptedFileError(DDUFError): + """Exception thrown when the DDUF file is corrupted.""" + + +class DDUFExportError(DDUFError): + """Base exception for errors during DDUF export.""" + + +class DDUFInvalidEntryNameError(DDUFExportError): + """Exception thrown when the entry name is invalid.""" diff --git a/src/huggingface_hub/serialization/_dduf.py b/src/huggingface_hub/serialization/_dduf.py new file mode 100644 index 0000000000..a1debadb3a --- /dev/null +++ b/src/huggingface_hub/serialization/_dduf.py @@ -0,0 +1,387 @@ +import json +import logging +import mmap +import os +import shutil +import zipfile +from contextlib import contextmanager +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any, Dict, Generator, Iterable, Tuple, Union + +from ..errors import DDUFCorruptedFileError, DDUFExportError, DDUFInvalidEntryNameError + + +logger = logging.getLogger(__name__) + +DDUF_ALLOWED_ENTRIES = { + # Allowed file extensions in a DDUF file + ".json", + ".model", + ".safetensors", + ".txt", +} + +DDUF_FOLDER_REQUIRED_ENTRIES = { + # Each folder must contain at least one of these entries + "config.json", + "tokenizer_config.json", + "preprocessor_config.json", + "scheduler_config.json", +} + + +@dataclass +class DDUFEntry: + """Object representing a file entry in a DDUF file. + + See [`read_dduf_file`] for how to read a DDUF file. + + Attributes: + filename (str): + The name of the file in the DDUF archive. + offset (int): + The offset of the file in the DDUF archive. + length (int): + The length of the file in the DDUF archive. + dduf_path (str): + The path to the DDUF archive (for internal use). + """ + + filename: str + length: int + offset: int + + dduf_path: Path = field(repr=False) + + @contextmanager + def as_mmap(self) -> Generator[bytes, None, None]: + """Open the file as a memory-mapped file. + + Useful to load safetensors directly from the file. 
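+        Entries are stored uncompressed in the DDUF archive (`ZIP_STORED`), so the bytes
+        yielded here are the exact content of the entry on disk.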
+ + Example: + ```py + >>> import safetensors.torch + >>> with entry.as_mmap() as mm: + ... tensors = safetensors.torch.load(mm) + ``` + """ + with self.dduf_path.open("rb") as f: + with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm: + yield mm[self.offset : self.offset + self.length] + + def read_text(self, encoding: str = "utf-8") -> str: + """Read the file as text. + + Useful for '.txt' and '.json' entries. + + Example: + ```py + >>> import json + >>> index = json.loads(entry.read_text()) + ``` + """ + with self.dduf_path.open("rb") as f: + f.seek(self.offset) + return f.read(self.length).decode(encoding=encoding) + + +def read_dduf_file(dduf_path: Union[os.PathLike, str]) -> Dict[str, DDUFEntry]: + """ + Read a DDUF file and return a dictionary of entries. + + Only the metadata is read, the data is not loaded in memory. + + Args: + dduf_path (`str` or `os.PathLike`): + The path to the DDUF file to read. + + Returns: + `Dict[str, DDUFEntry]`: + A dictionary of [`DDUFEntry`] indexed by filename. + + Raises: + - [`DDUFCorruptedFileError`]: If the DDUF file is corrupted (i.e. doesn't follow the DDUF format). + + Example: + ```python + >>> import json + >>> import safetensors.torch + >>> from huggingface_hub import read_dduf_file + + # Read DDUF metadata + >>> dduf_entries = read_dduf_file("FLUX.1-dev.dduf") + + # Returns a mapping filename <> DDUFEntry + >>> dduf_entries["model_index.json"] + DDUFEntry(filename='model_index.json', offset=66, length=587) + + # Load model index as JSON + >>> json.loads(dduf_entries["model_index.json"].read_text()) + {'_class_name': 'FluxPipeline', '_diffusers_version': '0.32.0.dev0', '_name_or_path': 'black-forest-labs/FLUX.1-dev', ... + + # Load VAE weights using safetensors + >>> with dduf_entries["vae/diffusion_pytorch_model.safetensors"].as_mmap() as mm: + ... state_dict = safetensors.torch.load(mm) + ``` + """ + entries = {} + dduf_path = Path(dduf_path) + logger.info(f"Reading DDUF file {dduf_path}") + with zipfile.ZipFile(str(dduf_path), "r") as zf: + for info in zf.infolist(): + logger.debug(f"Reading entry {info.filename}") + if info.compress_type != zipfile.ZIP_STORED: + raise DDUFCorruptedFileError("Data must not be compressed in DDUF file.") + + try: + _validate_dduf_entry_name(info.filename) + except DDUFInvalidEntryNameError as e: + raise DDUFCorruptedFileError(f"Invalid entry name in DDUF file: {info.filename}") from e + + offset = _get_data_offset(zf, info) + + entries[info.filename] = DDUFEntry( + filename=info.filename, offset=offset, length=info.file_size, dduf_path=dduf_path + ) + + # Consistency checks on the DDUF file + if "model_index.json" not in entries: + raise DDUFCorruptedFileError("Missing required 'model_index.json' entry in DDUF file.") + index = json.loads(entries["model_index.json"].read_text()) + _validate_dduf_structure(index, entries.keys()) + + logger.info(f"Done reading DDUF file {dduf_path}. Found {len(entries)} entries") + return entries + + +def export_entries_as_dduf( + dduf_path: Union[str, os.PathLike], entries: Iterable[Tuple[str, Union[str, Path, bytes]]] +) -> None: + """Write a DDUF file from an iterable of entries. + + This is a lower-level helper than [`export_folder_as_dduf`] that allows more flexibility when serializing data. + In particular, you don't need to save the data on disk before exporting it in the DDUF file. + + Args: + dduf_path (`str` or `os.PathLike`): + The path to the DDUF file to write. 
+        entries (`Iterable[Tuple[str, Union[str, Path, bytes]]]`):
+            An iterable of entries to write in the DDUF file. Each entry is a tuple with the filename and the content.
+            The filename should be the path to the file in the DDUF archive.
+            The content can be a string or a `pathlib.Path` pointing to a file on the local disk, or the content itself as bytes.
+
+    Raises:
+        - [`DDUFExportError`]: If anything goes wrong during the export (e.g. invalid entry name, missing 'model_index.json', etc.).
+
+    Example:
+        ```python
+        # Export specific files from the local disk.
+        >>> from huggingface_hub import export_entries_as_dduf
+        >>> export_entries_as_dduf(
+        ...     dduf_path="stable-diffusion-v1-4-FP16.dduf",
+        ...     entries=[ # List entries to add to the DDUF file (here, only FP16 weights)
+        ...         ("model_index.json", "path/to/model_index.json"),
+        ...         ("vae/config.json", "path/to/vae/config.json"),
+        ...         ("vae/diffusion_pytorch_model.fp16.safetensors", "path/to/vae/diffusion_pytorch_model.fp16.safetensors"),
+        ...         ("text_encoder/config.json", "path/to/text_encoder/config.json"),
+        ...         ("text_encoder/model.fp16.safetensors", "path/to/text_encoder/model.fp16.safetensors"),
+        ...         # ... add more entries here
+        ...     ]
+        ... )
+        ```
+
+        ```python
+        # Export state_dicts one by one from a loaded pipeline
+        >>> from diffusers import DiffusionPipeline
+        >>> from typing import Generator, Tuple
+        >>> import safetensors.torch
+        >>> from huggingface_hub import export_entries_as_dduf
+        >>> pipe = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
+        ... # ... do some work with the pipeline
+
+        >>> def as_entries(pipe: DiffusionPipeline) -> Generator[Tuple[str, bytes], None, None]:
+        ...     # Build a generator that yields the entries to add to the DDUF file.
+        ...     # The first element of the tuple is the filename in the DDUF archive (must use UNIX separator!). The second element is the content of the file.
+        ...     # Entries will be evaluated lazily when the DDUF file is created (only 1 entry is loaded in memory at a time)
+        ...     yield "vae/config.json", pipe.vae.to_json_string().encode()
+        ...     yield "vae/diffusion_pytorch_model.safetensors", safetensors.torch.save(pipe.vae.state_dict())
+        ...     yield "text_encoder/config.json", pipe.text_encoder.config.to_json_string().encode()
+        ...     yield "text_encoder/model.safetensors", safetensors.torch.save(pipe.text_encoder.state_dict())
+        ...     # ... add more entries here
+
+        >>> export_entries_as_dduf(dduf_path="stable-diffusion-v1-4.dduf", entries=as_entries(pipe))
+        ```
+    """
+    logger.info(f"Exporting DDUF file '{dduf_path}'")
+    filenames = set()
+    index = None
+    with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive:
+        for filename, content in entries:
+            if filename in filenames:
+                raise DDUFExportError(f"Can't add duplicate entry: {filename}")
+            filenames.add(filename)
+
+            if filename == "model_index.json":
+                try:
+                    index = json.loads(_load_content(content).decode())
+                except json.JSONDecodeError as e:
+                    raise DDUFExportError("Failed to parse 'model_index.json'.") from e
+
+            try:
+                filename = _validate_dduf_entry_name(filename)
+            except DDUFInvalidEntryNameError as e:
+                raise DDUFExportError(f"Invalid entry name: {filename}") from e
+            logger.debug(f"Adding entry '{filename}' to DDUF file")
+            _dump_content_in_archive(archive, filename, content)
+
+    # Consistency checks on the DDUF file
+    if index is None:
+        raise DDUFExportError("Missing required 'model_index.json' entry in DDUF file.")
+    try:
+        _validate_dduf_structure(index, filenames)
+    except DDUFCorruptedFileError as e:
+        raise DDUFExportError("Invalid DDUF file structure.") from e
+
+    logger.info(f"Done writing DDUF file {dduf_path}")
+
+
+def export_folder_as_dduf(dduf_path: Union[str, os.PathLike], folder_path: Union[str, os.PathLike]) -> None:
+    """
+    Export a folder as a DDUF file.
+
+    Uses [`export_entries_as_dduf`] under the hood.
+
+    Args:
+        dduf_path (`str` or `os.PathLike`):
+            The path to the DDUF file to write.
+        folder_path (`str` or `os.PathLike`):
+            The path to the folder containing the diffusion model.
+
+    Example:
+        ```python
+        >>> from huggingface_hub import export_folder_as_dduf
+        >>> export_folder_as_dduf(dduf_path="FLUX.1-dev.dduf", folder_path="path/to/FLUX.1-dev")
+        ```
+    """
+    folder_path = Path(folder_path)
+
+    def _iterate_over_folder() -> Iterable[Tuple[str, Path]]:
+        for path in Path(folder_path).glob("**/*"):
+            if not path.is_file():
+                continue
+            if path.suffix not in DDUF_ALLOWED_ENTRIES:
+                logger.debug(f"Skipping file '{path}' (file type not allowed)")
+                continue
+            path_in_archive = path.relative_to(folder_path)
+            if len(path_in_archive.parts) >= 3:
+                logger.debug(f"Skipping file '{path}' (nested directories not allowed)")
+                continue
+            yield path_in_archive.as_posix(), path
+
+    export_entries_as_dduf(dduf_path, _iterate_over_folder())
+
+
+def _dump_content_in_archive(archive: zipfile.ZipFile, filename: str, content: Union[str, os.PathLike, bytes]) -> None:
+    with archive.open(filename, "w", force_zip64=True) as archive_fh:
+        if isinstance(content, (str, Path)):
+            content_path = Path(content)
+            with content_path.open("rb") as content_fh:
+                shutil.copyfileobj(content_fh, archive_fh, 1024 * 1024 * 8)  # type: ignore[misc]
+        elif isinstance(content, bytes):
+            archive_fh.write(content)
+        else:
+            raise DDUFExportError(f"Invalid content type for {filename}. Must be str, Path or bytes.")
+
+
+def _load_content(content: Union[str, Path, bytes]) -> bytes:
+    """Load the content of an entry as bytes.
+
+    Used only for small checks (not to dump content into archive).
+    """
+    if isinstance(content, (str, Path)):
+        return Path(content).read_bytes()
+    elif isinstance(content, bytes):
+        return content
+    else:
+        raise DDUFExportError(f"Invalid content type. Must be str, Path or bytes. Got {type(content)}.")
+
+
+def _validate_dduf_entry_name(entry_name: str) -> str:
+    if "." + entry_name.split(".")[-1] not in DDUF_ALLOWED_ENTRIES:
+        raise DDUFInvalidEntryNameError(f"File type not allowed: {entry_name}")
+    if "\\" in entry_name:
+        raise DDUFInvalidEntryNameError(f"Entry names must use UNIX separators ('/'). Got {entry_name}.")
+    entry_name = entry_name.strip("/")
+    if entry_name.count("/") > 1:
+        raise DDUFInvalidEntryNameError(f"DDUF only supports 1 level of directory. Got {entry_name}.")
+    return entry_name
+
+
+def _validate_dduf_structure(index: Any, entry_names: Iterable[str]) -> None:
+    """
+    Consistency checks on the DDUF file structure.
+
+    Rules:
+    - The 'model_index.json' entry is required and must contain a dictionary.
+    - Each folder name must correspond to an entry in 'model_index.json'.
+    - Each folder must contain at least one config file ('config.json', 'tokenizer_config.json', 'preprocessor_config.json', 'scheduler_config.json').
+
+    Args:
+        index (Any):
+            The content of the 'model_index.json' entry.
+        entry_names (Iterable[str]):
+            The list of entry names in the DDUF file.
+
+    Raises:
+        - [`DDUFCorruptedFileError`]: If the DDUF file is corrupted (i.e. doesn't follow the DDUF format).
+    """
+    if not isinstance(index, dict):
+        raise DDUFCorruptedFileError(f"Invalid 'model_index.json' content. Must be a dictionary. Got {type(index)}.")
+
+    dduf_folders = {entry.split("/")[0] for entry in entry_names if "/" in entry}
+    for folder in dduf_folders:
+        if folder not in index:
+            raise DDUFCorruptedFileError(f"Missing required entry '{folder}' in 'model_index.json'.")
+        if not any(f"{folder}/{required_entry}" in entry_names for required_entry in DDUF_FOLDER_REQUIRED_ENTRIES):
+            raise DDUFCorruptedFileError(
+                f"Missing required file in folder '{folder}'. Must contain at least one of {DDUF_FOLDER_REQUIRED_ENTRIES}."
+            )
+
+
+def _get_data_offset(zf: zipfile.ZipFile, info: zipfile.ZipInfo) -> int:
+    """
+    Calculate the data offset for a file in a ZIP archive.
+
+    Args:
+        zf (`zipfile.ZipFile`):
+            The opened ZIP file. Must be opened in read mode.
+        info (`zipfile.ZipInfo`):
+            The file info.
+
+    Returns:
+        int: The offset of the file data in the ZIP archive.
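+
+    Note:
+        DDUF entries are stored uncompressed (`ZIP_STORED`), so this offset, together with
+        `ZipInfo.file_size`, is enough to read or memory-map an entry's bytes directly from
+        the archive (see [`DDUFEntry`]).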
+ """ + if zf.fp is None: + raise DDUFCorruptedFileError("ZipFile object must be opened in read mode.") + + # Step 1: Get the local file header offset + header_offset = info.header_offset + + # Step 2: Read the local file header + zf.fp.seek(header_offset) + local_file_header = zf.fp.read(30) # Fixed-size part of the local header + + if len(local_file_header) < 30: + raise DDUFCorruptedFileError("Incomplete local file header.") + + # Step 3: Parse the header fields to calculate the start of file data + # Local file header: https://en.wikipedia.org/wiki/ZIP_(file_format)#File_headers + filename_len = int.from_bytes(local_file_header[26:28], "little") + extra_field_len = int.from_bytes(local_file_header[28:30], "little") + + # Data offset is after the fixed header, filename, and extra fields + data_offset = header_offset + 30 + filename_len + extra_field_len + + return data_offset diff --git a/tests/test_dduf.py b/tests/test_dduf.py new file mode 100644 index 0000000000..ece8aa9dfc --- /dev/null +++ b/tests/test_dduf.py @@ -0,0 +1,238 @@ +import json +import zipfile +from pathlib import Path +from typing import Iterable, Tuple, Union + +import pytest +from pytest_mock import MockerFixture + +from huggingface_hub.errors import DDUFCorruptedFileError, DDUFExportError, DDUFInvalidEntryNameError +from huggingface_hub.serialization._dduf import ( + DDUFEntry, + _load_content, + _validate_dduf_entry_name, + _validate_dduf_structure, + export_entries_as_dduf, + export_folder_as_dduf, + read_dduf_file, +) + + +class TestDDUFEntry: + @pytest.fixture + def dummy_entry(self, tmp_path: Path) -> DDUFEntry: + dummy_dduf = tmp_path / "dummy_dduf.dduf" + dummy_dduf.write_bytes(b"somethingCONTENTsomething") + return DDUFEntry(filename="dummy.json", length=7, offset=9, dduf_path=dummy_dduf) + + def test_dataclass(self, dummy_entry: DDUFEntry): + assert dummy_entry.filename == "dummy.json" + assert dummy_entry.length == 7 + assert dummy_entry.offset == 9 + assert str(dummy_entry.dduf_path).endswith("dummy_dduf.dduf") + + def test_read_text(self, dummy_entry: DDUFEntry): + assert dummy_entry.read_text() == "CONTENT" + + def test_as_mmap(self, dummy_entry: DDUFEntry): + with dummy_entry.as_mmap() as mmap: + assert mmap == b"CONTENT" + + +class TestUtils: + @pytest.mark.parametrize("filename", ["dummy.txt", "dummy.json", "dummy.safetensors"]) + def test_entry_name_valid_extension(self, filename: str): + assert _validate_dduf_entry_name(filename) == filename + + @pytest.mark.parametrize("filename", ["dummy", "dummy.bin", "dummy.dduf", "dummy.gguf"]) + def test_entry_name_invalid_extension(self, filename: str): + with pytest.raises(DDUFInvalidEntryNameError): + _validate_dduf_entry_name(filename) + + @pytest.mark.parametrize("filename", ["encoder\\dummy.json", "C:\\dummy.json"]) + def test_entry_name_no_windows_path(self, filename: str): + with pytest.raises(DDUFInvalidEntryNameError): + _validate_dduf_entry_name(filename) + + def test_entry_name_stripped( + self, + ): + assert _validate_dduf_entry_name("/dummy.json") == "dummy.json" + + def test_entry_name_no_nested_directory(self): + _validate_dduf_entry_name("bar/dummy.json") # 1 level is ok + with pytest.raises(DDUFInvalidEntryNameError): + _validate_dduf_entry_name("foo/bar/dummy.json") # not more + + def test_load_content(self, tmp_path: Path): + content = b"hello world" + path = tmp_path / "hello.txt" + path.write_bytes(content) + + assert _load_content(content) == content # from bytes + assert _load_content(path) == content # from Path + assert 
_load_content(str(path)) == content # from str + + def test_validate_dduf_structure_valid(self): + _validate_dduf_structure( + { # model_index.json content + "_some_key": "some_value", + "encoder": { + "config.json": {}, + "model.safetensors": {}, + }, + }, + { # entries in DDUF archive + "model_index.json", + "something.txt", + "encoder/config.json", + "encoder/model.safetensors", + }, + ) + + def test_validate_dduf_structure_not_a_dict(self): + with pytest.raises(DDUFCorruptedFileError, match="Must be a dictionary."): + _validate_dduf_structure(["not a dict"], {}) # content from 'model_index.json' + + def test_validate_dduf_structure_missing_folder(self): + with pytest.raises(DDUFCorruptedFileError, match="Missing required entry 'encoder' in 'model_index.json'."): + _validate_dduf_structure({}, {"encoder/config.json", "encoder/model.safetensors"}) + + def test_validate_dduf_structure_missing_config_file(self): + with pytest.raises(DDUFCorruptedFileError, match="Missing required file in folder 'encoder'."): + _validate_dduf_structure( + {"encoder": {}}, + { + "encoder/not_a_config.json", # expecting a config.json / tokenizer_config.json / preprocessor_config.json / scheduler_config.json + "encoder/model.safetensors", + }, + ) + + +class TestExportFolder: + @pytest.fixture + def dummy_folder(self, tmp_path: Path): + folder_path = tmp_path / "dummy_folder" + folder_path.mkdir() + encoder_path = folder_path / "encoder" + encoder_path.mkdir() + subdir_path = encoder_path / "subdir" + subdir_path.mkdir() + + (folder_path / "config.json").touch() + (folder_path / "model.safetensors").touch() + (folder_path / "model.bin").touch() # won't be included + (encoder_path / "config.json").touch() + (encoder_path / "model.safetensors").touch() + (encoder_path / "model.bin").touch() # won't be included + (subdir_path / "config.json").touch() # won't be included + return folder_path + + def test_export_folder(self, dummy_folder: Path, mocker: MockerFixture): + mock = mocker.patch("huggingface_hub.serialization._dduf.export_entries_as_dduf") + export_folder_as_dduf("dummy.dduf", dummy_folder) + mock.assert_called_once() + args = mock.call_args_list[0].args + + assert args[0] == "dummy.dduf" + assert list(args[1]) == [ + # args[1] is a generator of tuples (path_in_archive, path_on_disk) + ("config.json", dummy_folder / "config.json"), + ("model.safetensors", dummy_folder / "model.safetensors"), + ("encoder/config.json", dummy_folder / "encoder/config.json"), + ("encoder/model.safetensors", dummy_folder / "encoder/model.safetensors"), + ] + + +class TestExportEntries: + @pytest.fixture + def dummy_entries(self, tmp_path: Path) -> Iterable[Tuple[str, Union[str, Path, bytes]]]: + (tmp_path / "model_index.json").write_text(json.dumps({"foo": "bar"})) + (tmp_path / "doesnt_have_to_be_same_name.safetensors").write_bytes(b"this is safetensors content") + + return [ + ("model_index.json", str(tmp_path / "model_index.json")), # string path + ("model.safetensors", tmp_path / "doesnt_have_to_be_same_name.safetensors"), # pathlib path + ("hello.txt", b"hello world"), # raw bytes + ] + + def test_export_entries( + self, tmp_path: Path, dummy_entries: Iterable[Tuple[str, Union[str, Path, bytes]]], mocker: MockerFixture + ): + mock = mocker.patch("huggingface_hub.serialization._dduf._validate_dduf_structure") + export_entries_as_dduf(tmp_path / "dummy.dduf", dummy_entries) + mock.assert_called_once_with({"foo": "bar"}, {"model_index.json", "model.safetensors", "hello.txt"}) + + with zipfile.ZipFile(tmp_path / 
"dummy.dduf", "r") as archive: + assert archive.compression == zipfile.ZIP_STORED # uncompressed! + assert archive.namelist() == ["model_index.json", "model.safetensors", "hello.txt"] + assert archive.read("model_index.json") == b'{"foo": "bar"}' + assert archive.read("model.safetensors") == b"this is safetensors content" + assert archive.read("hello.txt") == b"hello world" + + def test_export_entries_invalid_name(self, tmp_path: Path): + with pytest.raises(DDUFExportError, match="Invalid entry name") as e: + export_entries_as_dduf(tmp_path / "dummy.dduf", [("config", "model_index.json")]) + assert isinstance(e.value.__cause__, DDUFInvalidEntryNameError) + + def test_export_entries_no_duplicate(self, tmp_path: Path): + with pytest.raises(DDUFExportError, match="Can't add duplicate entry"): + export_entries_as_dduf( + tmp_path / "dummy.dduf", + [ + ("model_index.json", b'{"key": "content1"}'), + ("model_index.json", b'{"key": "content2"}'), + ], + ) + + def test_export_entries_model_index_required(self, tmp_path: Path): + with pytest.raises(DDUFExportError, match="Missing required 'model_index.json' entry"): + export_entries_as_dduf(tmp_path / "dummy.dduf", [("model.safetensors", b"content")]) + + +class TestReadDDUFFile: + @pytest.fixture + def dummy_dduf_file(self, tmp_path: Path) -> Path: + with zipfile.ZipFile(tmp_path / "dummy.dduf", "w") as archive: + archive.writestr("model_index.json", b'{"foo": "bar"}') + archive.writestr("model.safetensors", b"this is safetensors content") + archive.writestr("hello.txt", b"hello world") + return tmp_path / "dummy.dduf" + + def test_read_dduf_file(self, dummy_dduf_file: Path, mocker: MockerFixture): + mock = mocker.patch("huggingface_hub.serialization._dduf._validate_dduf_structure") + + entries = read_dduf_file(dummy_dduf_file) + assert len(entries) == 3 + index_entry = entries["model_index.json"] + model_entry = entries["model.safetensors"] + hello_entry = entries["hello.txt"] + + mock.assert_called_once_with({"foo": "bar"}, {"model_index.json", "model.safetensors", "hello.txt"}) + + assert index_entry.filename == "model_index.json" + assert index_entry.dduf_path == dummy_dduf_file + assert index_entry.read_text() == '{"foo": "bar"}' + with dummy_dduf_file.open("rb") as f: + f.seek(index_entry.offset) + assert f.read(index_entry.length) == b'{"foo": "bar"}' + + assert model_entry.filename == "model.safetensors" + assert model_entry.dduf_path == dummy_dduf_file + assert model_entry.read_text() == "this is safetensors content" + with dummy_dduf_file.open("rb") as f: + f.seek(model_entry.offset) + assert f.read(model_entry.length) == b"this is safetensors content" + + assert hello_entry.filename == "hello.txt" + assert hello_entry.dduf_path == dummy_dduf_file + assert hello_entry.read_text() == "hello world" + with dummy_dduf_file.open("rb") as f: + f.seek(hello_entry.offset) + assert f.read(hello_entry.length) == b"hello world" + + def test_model_index_required(self, tmp_path: Path): + with zipfile.ZipFile(tmp_path / "dummy.dduf", "w") as archive: + archive.writestr("model.safetensors", b"this is safetensors content") + with pytest.raises(DDUFCorruptedFileError, match="Missing required 'model_index.json' entry"): + read_dduf_file(tmp_path / "dummy.dduf")