213 changes: 165 additions & 48 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions ggml/src/ggml-impl.h
@@ -42,8 +42,8 @@ void ggml_print_backtrace(void);
 # define MAX(a, b) ((a) > (b) ? (a) : (b))
 #endif
 
-// required for mmap as gguf only guarantees 32-byte alignment
-#define TENSOR_ALIGNMENT 32
+// required for mmap as gguf converted with reflinks from safetensors only guarantees 8-byte alignment
+#define TENSOR_ALIGNMENT 8
 
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
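The 8-byte figure matches what safetensors guarantees for its tensor data (see the writer comments below), so a byte-identical reflinked copy can only promise that much. A minimal sketch of the relaxed invariant, with an illustrative helper rather than llama.cpp API:

```python
TENSOR_ALIGNMENT = 8  # relaxed from 32 to allow reflinked conversions

def offset_is_valid(offset: int, alignment: int = TENSOR_ALIGNMENT) -> bool:
    # mmap'ed tensor data must start at a multiple of the guaranteed alignment
    return offset % alignment == 0

assert offset_is_valid(4104)       # multiple of 8 but not of 32: now accepted
assert not offset_is_valid(4101)   # not even 8-byte aligned: still rejected
```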
8 changes: 5 additions & 3 deletions ggml/src/gguf.cpp
@@ -624,14 +624,16 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     ctx->size = 0;
     for (size_t i = 0; i < ctx->info.size(); ++i) {
         const gguf_tensor_info & ti = ctx->info[i];
-        if (ti.offset != ctx->size) {
+        // alignment offset only exists for GGUF converted with reflinks
+        const size_t align_offset = ti.offset % ctx->alignment;
+        if (ti.offset - align_offset != ctx->size) {
             GGML_LOG_ERROR("%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
-                __func__, ti.t.name, ti.offset, ctx->size);
+                __func__, ti.t.name, ti.offset, ctx->size + align_offset);
             GGML_LOG_ERROR("%s: failed to read tensor data\n", __func__);
             gguf_free(ctx);
             return nullptr;
         }
-        size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
+        size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t) + align_offset, ctx->alignment);
         if (SIZE_MAX - ctx->size < padded_size) {
             GGML_LOG_ERROR("%s: tensor '%s' size overflow, cannot accumulate size %zu + %zu\n",
                 __func__, ti.t.name, ctx->size, padded_size);
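To see the new bookkeeping concretely, here is a worked example mirroring the loader logic above, with hypothetical numbers: a 4096-byte alignment (from the general.alignment KV) and a 10 000-byte tensor written 8 bytes past a block boundary:

```python
def ggml_pad(x: int, align: int) -> int:
    # same rounding as GGML_PAD in C: round x up to a multiple of align
    return (x + align - 1) // align * align

alignment = 4096    # ctx->alignment
expected  = 0       # ctx->size accumulated so far
offset    = 8       # ti.offset as written, includes the reflink align offset
nbytes    = 10_000  # ggml_nbytes(&ti.t)

align_offset = offset % alignment         # 8
assert offset - align_offset == expected  # the tensor starts in the expected block
expected += ggml_pad(nbytes + align_offset, alignment)  # 12288
# the next tensor must then start somewhere in the block at byte 12288
```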
38 changes: 30 additions & 8 deletions gguf-py/gguf/gguf_writer.py
@@ -29,6 +29,7 @@
     ExpertGatingFuncType,
 )
 
+from .lazy import best_extra_offset, count_reflinkable_size
 from .quants import quant_shape_from_byte_shape
 
 logger = logging.getLogger(__name__)
@@ -84,14 +85,16 @@ class GGUFWriter:
 
     def __init__(
         self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
-        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
+        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False,
+        use_reflinks: bool = False,  # opportunistically attempt to use copy-on-write
     ):
         self.fout = None
         self.path = Path(path) if path else None
         self.arch = arch
         self.endianess = endianess
         self.data_alignment = GGUF_DEFAULT_ALIGNMENT
-        self.use_temp_file = use_temp_file
+        self.use_reflinks = use_reflinks
+        self.use_temp_file = False if self.use_reflinks else use_temp_file
         self.temp_file = None
         self.tensors = [{}]
         self.kv_data = [{}]
@@ -178,13 +181,28 @@ def open_output_file(self, path: Path | None = None) -> None:
         self.fout = [open(filename, "wb") for filename in filenames]
         self.state = WriterState.EMPTY
 
+        if self.use_reflinks:
+            # reflinks require alignment to the filesystem blocks
+            block_size = os.stat(self.path.parent).st_blksize
+            # necessary to get an appropriate data start offset when padding for reflinks;
+            # using the real alignment (8 bytes, from safetensors) would result in an unusable base data offset
+            self.data_alignment = block_size
+            # set for all shards, to allow reading them on their own
+            for i, kv in enumerate(self.kv_data):
+                # insert at the start of the key-values
+                if Keys.General.ALIGNMENT in kv:
+                    del kv[Keys.General.ALIGNMENT]
+                self.kv_data[i] = {Keys.General.ALIGNMENT: GGUFValue(block_size, GGUFValueType.UINT32), **kv}
+
     def print_plan(self) -> list[Path]:
         logger.info("Writing the following files:")
         assert self.path is not None
         filenames = self.format_shard_names(self.path)
         assert len(filenames) == len(self.tensors)
         for name, tensors in zip(filenames, self.tensors):
-            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
+            total_size = sum(ti.nbytes for ti in tensors.values())
+            reflinkable_size = count_reflinkable_size((name, ti.tensor) for name, ti in tensors.items()) if self.use_reflinks else 0
+            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(total_size)}{', reflinkable = ' + GGUFWriter.format_n_bytes_to_str(reflinkable_size) if self.use_reflinks else ''}")
 
         if self.dry_run:
             logger.info("Dry run, not writing files")
@@ -257,14 +275,18 @@ def write_ti_data_to_file(self) -> None:
             offset_tensor = 0
 
             for name, ti in tensors.items():
+                extra_offset = 0
+                if self.use_reflinks:
+                    extra_offset = best_extra_offset(ti.tensor, offset_tensor)
+
                 ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
                 n_dims = len(ti.shape)
                 ti_data += self._pack("I", n_dims)
                 for j in range(n_dims):
                     ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
                 ti_data += self._pack("I", ti.dtype)
-                ti_data += self._pack("Q", offset_tensor)
-                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
+                ti_data += self._pack("Q", offset_tensor + extra_offset)
+                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes + extra_offset, self.data_alignment)
 
             fout.write(ti_data)
             fout.flush()
@@ -392,7 +414,7 @@ def add_tensor(
     def write_padding(self, fp: IO[bytes], n: int, align: int | None = None) -> None:
         pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
         if pad != 0:
-            fp.write(bytes([0] * pad))
+            fp.write(b"\x00" * pad)
 
     def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
         if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
@@ -418,7 +440,7 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
 
         self.write_padding(fout, fout.tell())
         tensor.tofile(fout)
-        self.write_padding(fout, tensor.nbytes)
+        self.write_padding(fout, fout.tell())
 
         self.state = WriterState.WEIGHTS
 
@@ -458,7 +480,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
                         shard_bar.update(ti.nbytes)
                     if bar is not None:
                         bar.update(ti.nbytes)
-                    self.write_padding(fout, ti.nbytes)
+                    self.write_padding(fout, fout.tell())
                     ti.tensor = None
         else:
             self.temp_file.seek(0)
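End to end, the flag is opt-in per writer instance. A hypothetical minimal use is sketched below; the metadata and tensor values are made up, and since plain in-memory arrays carry no source file ranges they fall back to normal writes. The reflink path only pays off for lazy tensors backed by safetensors files on the same copy-on-write filesystem, which is what convert_hf_to_gguf.py feeds the writer:

```python
import numpy as np
from gguf import GGUFWriter

writer = GGUFWriter("model.gguf", arch="llama", use_reflinks=True)
writer.add_uint32("llama.context_length", 4096)  # example metadata
writer.add_tensor("tok_embd.weight", np.zeros((4, 8), dtype=np.float32))
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)
writer.close()
```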
180 changes: 169 additions & 11 deletions gguf-py/gguf/lazy.py
@@ -1,12 +1,19 @@
 from __future__ import annotations
 from abc import ABC, ABCMeta, abstractmethod
 
-import logging
-from typing import Any, Callable
+from io import BufferedReader, BufferedWriter
+from pathlib import Path
+from typing import Any, Callable, Iterable
 
+import logging
 import numpy as np
+import os
+import shutil
+
 from numpy.typing import DTypeLike
 
+from .utility import LocalTensorRange
+
 
 logger = logging.getLogger(__name__)
 
@@ -20,10 +27,11 @@ def __getattr__(self, name: str) -> Any:
                 return type(self)._wrap_fn(
                     (lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)),
                     use_self=self,
+                    data_noop=name in ("view", "reshape", "squeeze", "unsqueeze", "contiguous"),
                 )
             elif isinstance(meta_attr, self._tensor_type):
                 # e.g. self.T with torch.Tensor should still be wrapped
-                return type(self)._wrap_fn(lambda s: getattr(s, name))(self)
+                return type(self)._wrap_fn(lambda s: getattr(s, name), use_self=self)()
             else:
                 # no need to wrap non-tensor properties,
                 # and they likely don't depend on the actual contents of the tensor
@@ -39,8 +47,9 @@ def mk_wrap(op_name: str, *, meta_noop: bool = False):
             def wrapped_special_op(self, *args, **kwargs):
                 return type(self)._wrap_fn(
                     getattr(type(self)._tensor_type, op_name),
+                    use_self=self,
                     meta_noop=meta_noop,
-                )(self, *args, **kwargs)
+                )(*args, **kwargs)
             return wrapped_special_op
 
     # special methods bypass __getattr__, so they need to be added manually
@@ -76,14 +85,16 @@ class LazyBase(ABC, metaclass=LazyMeta):
     _args: tuple
     _kwargs: dict[str, Any]
     _func: Callable[[Any], Any] | None
+    _ranges: tuple[LocalTensorRange, ...]
 
-    def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None):
+    def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None, ranges: tuple[LocalTensorRange, ...] = ()):
         super().__init__()
         self._meta = meta
         self._data = data
         self._args = args
         self._kwargs = kwargs if kwargs is not None else {}
         self._func = func
+        self._ranges = ranges
         assert self._func is not None or self._data is not None
 
     def __init_subclass__(cls) -> None:
@@ -107,7 +118,7 @@ def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
         return o
 
     @classmethod
-    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
+    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False, data_noop: bool = False) -> Callable[[Any], Any]:
         def wrapped_fn(*args, **kwargs):
             if kwargs is None:
                 kwargs = {}
@@ -116,6 +127,8 @@ def wrapped_fn(*args, **kwargs):
             meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
             # TODO: maybe handle tensors in kwargs too
 
+            ranges = use_self._ranges if use_self is not None and data_noop else ()
+
             if isinstance(meta_noop, bool) and not meta_noop:
                 try:
                     res = fn(*meta_args, **kwargs)
@@ -138,7 +151,7 @@ def wrapped_fn(*args, **kwargs):
                     res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
 
             if isinstance(res, cls._tensor_type):
-                return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
+                return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn, ranges=ranges)
             elif isinstance(res, tuple) and all(isinstance(t, cls._tensor_type) for t in res):
                 # share the evaluation between lazy tuple elements
                 shared_args: list = [args, None]
@@ -202,6 +215,7 @@ class LazyNumpyTensor(LazyBase):
     _tensor_type = np.ndarray
 
     shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+    nbytes: int
 
     @classmethod
     def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
@@ -214,10 +228,154 @@ def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) ->
     def astype(self, dtype, *args, **kwargs):
         meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
         full_args = (self, dtype,) + args
-        return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)))
+        ranges = self._ranges if self._meta.dtype == dtype else ()
+        return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)), ranges=ranges)
 
-    def tofile(self, *args, **kwargs):
-        eager = LazyNumpyTensor.to_eager(self)
-        return eager.tofile(*args, **kwargs)
+    def tofile(self, fid, *args, **kwargs):
+        if isinstance(fid, BufferedWriter) and len(self._ranges) > 0:
+            return copy_tensor_ranges(self, fid)
+        else:
+            eager = LazyNumpyTensor.to_eager(self)
+            return eager.tofile(fid, *args, **kwargs)
 
     # TODO: __array_function__
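Which operations preserve the ranges follows from the changes above: the data_noop methods (view, reshape, squeeze, unsqueeze, contiguous) and a dtype-preserving astype keep _ranges, while anything that computes new bytes drops them. A small sketch; the tensor is fabricated with empty ranges purely to show the plumbing, and with real safetensors-backed ranges the same calls would propagate them:

```python
import numpy as np
from gguf.lazy import LazyNumpyTensor

# Fabricated lazy tensor; _ranges is empty here, so the assertions are
# trivially true, but with safetensors-backed ranges they hold the same way.
meta = LazyNumpyTensor.meta_with_dtype_and_shape(np.float32, (64, 32))
t = LazyNumpyTensor(meta=meta, data=np.zeros((64, 32), dtype=np.float32))

u = t.squeeze()            # data_noop method: ranges carried over
v = t.astype(np.float32)   # same dtype: ranges carried over
w = t.astype(np.float16)   # dtype change: ranges dropped, bytes must be rewritten
assert u._ranges == t._ranges and v._ranges == t._ranges
assert w._ranges == ()
```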


+# For aligning blocks when reflinking
+def best_extra_offset(t: np.ndarray | LazyNumpyTensor | None, current_offset: int) -> int:
+    if not isinstance(t, LazyNumpyTensor):
+        # no file ranges, no need for an offset
+        return 0
+
+    ranges = t._ranges
+
+    histogram: dict[int, int] = {}
+
+    max_block_size = 0
+    for r in ranges:
+        # Ensure minimal alignment is 8 bytes (common with safetensors)
+        # and that the block size is valid
+        if r.offset % 8 == 0 and r.block_size > 0:
+            align_offset = r.offset % r.block_size
+            if align_offset not in histogram:
+                histogram[align_offset] = 0
+            histogram[align_offset] += r.size
+            if r.block_size > max_block_size:
+                max_block_size = r.block_size
+
+    best_offset = 0
+    best_size = 0
+    for offset, size in histogram.items():
+        if size > best_size:
+            best_size = size
+            best_offset = offset
+
+    if max_block_size > 0:
+        # the offset needs to be aligned properly
+        # or else there's probably a block size mismatch
+        assert current_offset % max_block_size == 0, current_offset % max_block_size
+
+    return best_offset
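For intuition about the histogram, take made-up numbers: with 4096-byte blocks, source ranges starting at file offsets 4104 and 12296 both sit 8 bytes past a block boundary, so offset 8 covers the most bytes and best_extra_offset would return 8:

```python
block_size = 4096
ranges = [(4104, 8192), (12296, 4096)]  # (source file offset, size), illustrative

histogram: dict[int, int] = {}
for off, size in ranges:
    if off % 8 == 0 and block_size > 0:  # same gate as best_extra_offset
        histogram[off % block_size] = histogram.get(off % block_size, 0) + size

best_offset = max(histogram, key=lambda k: histogram[k], default=0)
assert best_offset == 8  # the alignment offset covering the most bytes wins
```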


+def count_reflinkable_size(tensors: Iterable[tuple[str, np.ndarray | LazyNumpyTensor | None]]) -> int:
+    if not hasattr(os, "copy_file_range"):
+        return 0
+    size = 0
+    for name, t in tensors:
+        if isinstance(t, LazyNumpyTensor) and len(t._ranges) > 0:
+            align_offset = best_extra_offset(t, 0)
+            misaligned = 0
+            for range in t._ranges:
+                if range.block_size > 0:
+                    if range.offset % range.block_size == align_offset:
+                        size += range.size
+                    else:
+                        misaligned += 1
+            if misaligned > 0:
+                logger.debug(f"{name} misaligned for reflinking, fallback to copy for {misaligned} of {len(t._ranges)} parts")
+    return size


+# Copy tensor ranges using os.copy_file_range with aligned offsets and sizes
+# to make it more likely that copy-on-write is used where possible.
+# Block alignment is necessary for BTRFS and XFS (and likely for ZFS too).
+#
+# Falls back to shutil.copyfileobj when os.copy_file_range is not present.
+def copy_tensor_ranges(t: LazyNumpyTensor, fout: BufferedWriter):
+    ranges = t._ranges
+    assert len(ranges) > 0
+    dst_offset = fout.tell()
+    extra_offset = best_extra_offset(t, dst_offset)
+
+    if extra_offset > 0:
+        # initial padding
+        fout.write(b"\x00" * extra_offset)
+
+    dst_offset += extra_offset
+    start_offset = dst_offset
+
+    src_files: dict[Path, BufferedReader] = {}
+    for r in ranges:
+        if r.filename not in src_files:
+            src_files[r.filename] = open(r.filename, "rb")
+
+    has_copy_file_range = hasattr(os, "copy_file_range")
+
+    for r in ranges:
+        src = src_files[r.filename]
+        if has_copy_file_range:
+            if r.block_size > 0 and (r.offset % r.block_size) == (start_offset % r.block_size):
+                # Attempting to align copies for reflinking
+                #
+                # Block   0,      1,      2,      3,      4,
+                #      |___0000|0000000|0001111|1111111|111____|
+                #
+                # 1. block 0 is partially overwritten with contents from range[0]
+                # 2. blocks 1 and 2 are copied from range[0] using os.copy_file_range
+                # 3. block 2 is partially overwritten with contents from range[1]
+                # 4. blocks 3 and 4 are copied from range[1] using os.copy_file_range
+                #    (repeated for further ranges)
+                if dst_offset % r.block_size == 0:
+                    extra_size = 0
+                else:
+                    extra_size = r.block_size - (dst_offset % r.block_size)
+                    extra_size = min(extra_size, r.size)
+                    src.seek(r.offset)
+                    buf = src.read(extra_size)
+                    fout.seek(dst_offset)
+                    fout.write(buf)
+                    dst_offset += extra_size
+                    if extra_size == r.size:
+                        continue
+
+                assert dst_offset % r.block_size == 0, dst_offset % r.block_size
+
+                offset_src = r.offset + extra_size
+                offset_src_end = r.offset + r.size
+                if offset_src_end % r.block_size != 0:
+                    offset_src_end += r.block_size - (offset_src_end % r.block_size)
+                size = offset_src_end - offset_src
+                os.copy_file_range(src.fileno(), fout.fileno(), size, offset_src, dst_offset)
+                dst_offset += r.size - extra_size
+            else:
+                # not trying to use reflinks, but still using os.copy_file_range for speed
+                try:
+                    os.copy_file_range(src.fileno(), fout.fileno(), r.size, r.offset, dst_offset)
+                except OSError:
+                    # fallback when there's a problem (e.g. cross-filesystem copies)
+                    src.seek(r.offset)
+                    fout.seek(dst_offset)
+                    shutil.copyfileobj(src, fout, r.size)
+                dst_offset += r.size
+        else:
+            # not using reflinks, fallback when os.copy_file_range is not supported
+            src.seek(r.offset)
+            fout.seek(dst_offset)
+            shutil.copyfileobj(src, fout, r.size)
+            dst_offset += r.size
+
+    for f in src_files.values():
+        f.close()
+
+    fout.seek(dst_offset)
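As a standalone illustration of the primitive all of this leans on, the sketch below copies one file into another with block-aligned os.copy_file_range calls; on a copy-on-write filesystem (BTRFS, or XFS with reflink=1) such copies can be satisfied by sharing extents instead of duplicating data. Paths are examples, and this is Linux-only since os.copy_file_range is not available everywhere:

```python
import os

with open("src.safetensors", "rb") as src, open("dst.gguf", "wb") as dst:
    size = os.fstat(src.fileno()).st_size
    copied = 0
    while copied < size:
        # both offsets stay block-aligned because we always copy from/to `copied`
        n = os.copy_file_range(src.fileno(), dst.fileno(), size - copied, copied, copied)
        if n == 0:
            break  # unexpected end of input
        copied += n
```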