-
Notifications
You must be signed in to change notification settings - Fork 823
DDUF parser v0.1 #2692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
DDUF parser v0.1 #2692
Changes from 5 commits
3b533a6
953bbae
d30558b
0f21bd3
b4bf030
1b11f0b
f349cbe
bf7dc84
16c3e15
ba1e6a4
0546ca1
706597e
a6588aa
947a593
2881a57
2b7baf5
02a9532
c7ce20a
f5f0f25
0d1045d
dca1586
f6bee85
157633c
5cf560d
e6c62da
4078626
381ac7e
76265b4
d796252
5c2bb63
360ddd1
c168e23
4553f4c
6ad29e7
ce2b858
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
import logging | ||
import mmap | ||
import shutil | ||
import zipfile | ||
from contextlib import contextmanager | ||
from dataclasses import dataclass, field | ||
from pathlib import Path | ||
from typing import Dict, Generator, Union | ||
|
||
from ..errors import DDUFCorruptedFileError | ||
|
||
|
||
logger = logging.getLogger(__name__)

# File extensions allowed inside a DDUF archive. `write_dduf_file` skips any file
# whose suffix is not in this set.
DDUF_ALLOWED_ENTRIES = {".json", ".gguf", ".txt", ".safetensors"}
Wauplin marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
||
|
||
@dataclass
class DDUFEntry:
    """Object representing a file entry in a DDUF file.

    See [`read_dduf_file`] for how to read a DDUF file.

    Attributes:
        filename (str):
            The name of the file in the DDUF archive.
        length (int):
            The length of the file in the DDUF archive.
        offset (int):
            The offset of the file in the DDUF archive.
        dduf_path (Path):
            The path to the DDUF archive (for internal use).
    """

    filename: str
    length: int
    offset: int

    # Kept out of `repr()` so that printing an entry only shows its archive-level metadata.
    dduf_path: Path = field(repr=False)

    @contextmanager
    def as_mmap(self) -> Generator[bytes, None, None]:
        """Open the file as a memory-mapped file.

        Useful to load safetensors directly from the file.

        The whole archive is mapped read-only and the entry's byte range
        (`offset` to `offset + length`) is yielded. The map and the underlying
        file handle are closed when the context exits.

        Example:
            ```py
            >>> import safetensors.torch
            >>> with entry.as_mmap() as mm:
            ...     tensors = safetensors.torch.load(mm)
            ```
        """
        with self.dduf_path.open("rb") as f:
            # `length=0` maps the entire file; the entry is then sliced out of the map.
            with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:
                yield mm[self.offset : self.offset + self.length]

    def read_text(self, encoding: str = "utf-8") -> str:
        """Read the file as text.

        Useful for '.txt' and '.json' entries.

        Args:
            encoding (str):
                The encoding used to decode the entry's bytes. Defaults to "utf-8".

        Returns:
            The decoded content of the entry.
        """
        with self.dduf_path.open("rb") as f:
            f.seek(self.offset)
            return f.read(self.length).decode(encoding=encoding)
|
||
|
||
def read_dduf_file(dduf_path: Union[Path, str]) -> Dict[str, DDUFEntry]:
    """
    Read a DDUF file and return a dictionary of entries.

    Only the metadata is read, the data is not loaded in memory.

    Args:
        dduf_path (`str` or `Path`):
            The path to the DDUF file to read.

    Returns:
        `Dict[str, DDUFEntry]`:
            A dictionary of [`DDUFEntry`] indexed by filename.

    Raises:
        - [`DDUFCorruptedFileError`]: If the DDUF file is corrupted (i.e. doesn't follow the DDUF format).
    """
    entries = {}
    dduf_path = Path(dduf_path)
    logger.info("Reading DDUF file %s", dduf_path)
    with zipfile.ZipFile(str(dduf_path), "r") as zf:
        for info in zf.infolist():
            logger.debug("Reading entry %s", info.filename)
            if info.compress_type != zipfile.ZIP_STORED:
                raise DDUFCorruptedFileError("Data must not be compressed in DDUF file.")

            # Entries are stored uncompressed, so `file_size` bytes starting at the
            # data offset are exactly the file's content.
            offset = _get_data_offset(zf, info)

            entries[info.filename] = DDUFEntry(
                filename=info.filename, offset=offset, length=info.file_size, dduf_path=dduf_path
            )
    logger.info("Done reading DDUF file %s. Found %d entries", dduf_path, len(entries))
    return entries


def _get_data_offset(zf: zipfile.ZipFile, info: zipfile.ZipInfo) -> int:
    """Return the absolute offset at which the data of `info` starts in the archive.

    The offset is computed from the entry's local file header rather than from
    private `ZipInfo` attributes. The local header is a 30-byte fixed part followed
    by the filename and the extra field (which holds the ZIP64 record, if any); the
    file data starts right after them.

    Raises:
        - [`DDUFCorruptedFileError`]: If the local file header is missing, truncated or invalid.
    """
    if zf.fp is None:
        raise DDUFCorruptedFileError("ZipFile object must be opened in read mode.")

    header_offset = info.header_offset
    zf.fp.seek(header_offset)
    local_header = zf.fp.read(30)  # fixed-size part of the local file header
    if len(local_header) < 30:
        raise DDUFCorruptedFileError("Incomplete local file header.")
    if local_header[:4] != b"PK\x03\x04":
        raise DDUFCorruptedFileError("Invalid local file header signature.")

    # Lengths of the two variable-size fields, stored as little-endian uint16.
    filename_len = int.from_bytes(local_header[26:28], "little")
    extra_field_len = int.from_bytes(local_header[28:30], "little")

    return header_offset + 30 + filename_len + extra_field_len
|
||
|
||
def write_dduf_file(dduf_path: Union[str, Path], diffuser_path: Union[str, Path]) -> None:
    """
    Write a DDUF file from a diffusers folder.

    A DDUF file is simply a ZIP archive with a few constraints (force ZIP64, no compression, only certain files).

    Files are added in sorted path order so that the same folder always produces
    the same archive layout. Directories, files whose suffix is not in
    `DDUF_ALLOWED_ENTRIES`, and files whose path relative to `diffuser_path` has
    more than 3 parts are skipped.

    Args:
        dduf_path (`str` or `Path`):
            The path to the DDUF file to write.
        diffuser_path (`str` or `Path`):
            The path to the folder containing the diffusers model.
    """
    # TODO: update method signature.
    #       DDUF filename should be inferred as much as possible from high-level info (precision, model, etc.) to ensure consistency.
    #       Example: "stable-diffusion-3.5-Q4-BNB.dduf"
    #       See https://github.com/huggingface/diffusers/pull/10037#discussion_r1862275730.
    logger.info("Writing DDUF file %s from folder %s", dduf_path, diffuser_path)
    diffuser_path = Path(diffuser_path)

    # Model weights can be several GB: copy in 1 MiB chunks to limit per-call overhead.
    copy_buffer_size = 1024 * 1024

    with zipfile.ZipFile(str(dduf_path), "w", zipfile.ZIP_STORED) as archive:
        # `glob` order is not specified; sorting keeps the archive layout reproducible.
        for path in sorted(diffuser_path.glob("**/*")):
            if path.is_dir():
                logger.debug("Skipping directory %s", path)
                continue
            if path.suffix not in DDUF_ALLOWED_ENTRIES:
                logger.debug("Skipping file %s (file type not allowed)", path)
                continue
            path_in_archive = path.relative_to(diffuser_path)
            if len(path_in_archive.parts) > 3:
                logger.debug("Skipping file %s (nested directories not allowed)", path)
                continue
            logger.debug("Adding file %s", path)
            # ZIP member names always use "/" as separator, whatever the platform.
            with archive.open(path_in_archive.as_posix(), "w", force_zip64=True) as f:
                with path.open("rb") as src:
                    shutil.copyfileobj(src, f, copy_buffer_size)  # type: ignore[misc]
    logger.info("Done writing DDUF file %s", dduf_path)
Uh oh!
There was an error while loading. Please reload this page.