213 changes: 165 additions & 48 deletions convert_hf_to_gguf.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions ggml/src/ggml-impl.h
@@ -42,8 +42,8 @@ void ggml_print_backtrace(void);
 # define MAX(a, b) ((a) > (b) ? (a) : (b))
 #endif
 
-// required for mmap as gguf only guarantees 32-byte alignment
-#define TENSOR_ALIGNMENT 32
+// required for mmap as gguf converted with reflinks from safetensors only guarantees 8-byte alignment
+#define TENSOR_ALIGNMENT 8
 
 // static_assert should be a #define, but if it's not,
 // fall back to the _Static_assert C11 keyword.
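The 8-byte figure matches what safetensors guarantees for its tensor data (see the writer comments below), so a byte-identical reflinked copy can only promise that much. A minimal sketch of the relaxed invariant, with an illustrative helper rather than llama.cpp API:

```python
TENSOR_ALIGNMENT = 8  # relaxed from 32 to allow reflinked conversions

def offset_is_valid(offset: int, alignment: int = TENSOR_ALIGNMENT) -> bool:
    # mmap'ed tensor data must start at a multiple of the guaranteed alignment
    return offset % alignment == 0

assert offset_is_valid(4104)       # multiple of 8 but not of 32: now accepted
assert not offset_is_valid(4101)   # not even 8-byte aligned: still rejected
```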
8 changes: 5 additions & 3 deletions ggml/src/gguf.cpp
@@ -624,14 +624,16 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par
     ctx->size = 0;
     for (size_t i = 0; i < ctx->info.size(); ++i) {
         const gguf_tensor_info & ti = ctx->info[i];
-        if (ti.offset != ctx->size) {
+        // alignment offset only exists for GGUF converted with reflinks
+        const size_t align_offset = ti.offset % ctx->alignment;
+        if (ti.offset - align_offset != ctx->size) {
             GGML_LOG_ERROR("%s: tensor '%s' has offset %" PRIu64 ", expected %zu\n",
-                __func__, ti.t.name, ti.offset, ctx->size);
+                __func__, ti.t.name, ti.offset, ctx->size + align_offset);
             GGML_LOG_ERROR("%s: failed to read tensor data\n", __func__);
             gguf_free(ctx);
             return nullptr;
         }
-        size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment);
+        size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t) + align_offset, ctx->alignment);
         if (SIZE_MAX - ctx->size < padded_size) {
             GGML_LOG_ERROR("%s: tensor '%s' size overflow, cannot accumulate size %zu + %zu\n",
                 __func__, ti.t.name, ctx->size, padded_size);
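To see the new bookkeeping concretely, here is a worked example mirroring the loader logic above, with hypothetical numbers: a 4096-byte alignment (from the general.alignment KV) and a 10 000-byte tensor written 8 bytes past a block boundary:

```python
def ggml_pad(x: int, align: int) -> int:
    # same rounding as GGML_PAD in C: round x up to a multiple of align
    return (x + align - 1) // align * align

alignment = 4096    # ctx->alignment
expected  = 0       # ctx->size accumulated so far
offset    = 8       # ti.offset as written, includes the reflink align offset
nbytes    = 10_000  # ggml_nbytes(&ti.t)

align_offset = offset % alignment         # 8
assert offset - align_offset == expected  # the tensor starts in the expected block
expected += ggml_pad(nbytes + align_offset, alignment)  # 12288
# the next tensor must then start somewhere in the block at byte 12288
```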
38 changes: 30 additions & 8 deletions gguf-py/gguf/gguf_writer.py
@@ -29,6 +29,7 @@
     ExpertGatingFuncType,
 )
 
+from .lazy import best_extra_offset, count_reflinkable_size
 from .quants import quant_shape_from_byte_shape
 
 logger = logging.getLogger(__name__)
@@ -84,14 +85,16 @@ class GGUFWriter:
 
     def __init__(
         self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE,
-        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False
+        split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False,
+        use_reflinks: bool = False,  # opportunistically attempt to use copy-on-write
     ):
         self.fout = None
         self.path = Path(path) if path else None
         self.arch = arch
         self.endianess = endianess
         self.data_alignment = GGUF_DEFAULT_ALIGNMENT
-        self.use_temp_file = use_temp_file
+        self.use_reflinks = use_reflinks
+        self.use_temp_file = False if self.use_reflinks else use_temp_file
         self.temp_file = None
         self.tensors = [{}]
         self.kv_data = [{}]
@@ -178,13 +181,28 @@ def open_output_file(self, path: Path | None = None) -> None:
         self.fout = [open(filename, "wb") for filename in filenames]
         self.state = WriterState.EMPTY
 
+        if self.use_reflinks:
+            # reflinks require alignment to the filesystem blocks
+            block_size = os.stat(self.path.parent).st_blksize
+            # necessary to get an appropriate data start offset when padding for reflinks;
+            # using the real alignment (8 bytes, from safetensors) would result in an unusable base data offset
+            self.data_alignment = block_size
+            # set for all shards, to allow reading them on their own
+            for i, kv in enumerate(self.kv_data):
+                # insert at the start of the key-values
+                if Keys.General.ALIGNMENT in kv:
+                    del kv[Keys.General.ALIGNMENT]
+                self.kv_data[i] = {Keys.General.ALIGNMENT: GGUFValue(block_size, GGUFValueType.UINT32), **kv}
+
     def print_plan(self) -> list[Path]:
         logger.info("Writing the following files:")
         assert self.path is not None
         filenames = self.format_shard_names(self.path)
         assert len(filenames) == len(self.tensors)
         for name, tensors in zip(filenames, self.tensors):
-            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(sum(ti.nbytes for ti in tensors.values()))}")
+            total_size = sum(ti.nbytes for ti in tensors.values())
+            reflinkable_size = count_reflinkable_size((name, ti.tensor) for name, ti in tensors.items()) if self.use_reflinks else 0
+            logger.info(f"{name}: n_tensors = {len(tensors)}, total_size = {GGUFWriter.format_n_bytes_to_str(total_size)}{', reflinkable = ' + GGUFWriter.format_n_bytes_to_str(reflinkable_size) if self.use_reflinks else ''}")
 
         if self.dry_run:
             logger.info("Dry run, not writing files")
@@ -257,14 +275,18 @@ def write_ti_data_to_file(self) -> None:
             offset_tensor = 0
 
             for name, ti in tensors.items():
+                extra_offset = 0
+                if self.use_reflinks:
+                    extra_offset = best_extra_offset(ti.tensor, offset_tensor)
+
                 ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False)
                 n_dims = len(ti.shape)
                 ti_data += self._pack("I", n_dims)
                 for j in range(n_dims):
                     ti_data += self._pack("Q", ti.shape[n_dims - 1 - j])
                 ti_data += self._pack("I", ti.dtype)
-                ti_data += self._pack("Q", offset_tensor)
-                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment)
+                ti_data += self._pack("Q", offset_tensor + extra_offset)
+                offset_tensor += GGUFWriter.ggml_pad(ti.nbytes + extra_offset, self.data_alignment)
 
             fout.write(ti_data)
             fout.flush()
@@ -392,7 +414,7 @@ def add_tensor(
     def write_padding(self, fp: IO[bytes], n: int, align: int | None = None) -> None:
         pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
         if pad != 0:
-            fp.write(bytes([0] * pad))
+            fp.write(b"\x00" * pad)
 
     def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
         if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS:
@@ -418,7 +440,7 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
 
         self.write_padding(fout, fout.tell())
         tensor.tofile(fout)
-        self.write_padding(fout, tensor.nbytes)
+        self.write_padding(fout, fout.tell())
 
         self.state = WriterState.WEIGHTS
 
@@ -458,7 +480,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
                         shard_bar.update(ti.nbytes)
                     if bar is not None:
                         bar.update(ti.nbytes)
-                    self.write_padding(fout, ti.nbytes)
+                    self.write_padding(fout, fout.tell())
                     ti.tensor = None
         else:
             self.temp_file.seek(0)
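End to end, the flag is opt-in per writer instance. A hypothetical minimal use is sketched below; the metadata and tensor values are made up, and since plain in-memory arrays carry no source file ranges they fall back to normal writes. The reflink path only pays off for lazy tensors backed by safetensors files on the same copy-on-write filesystem, which is what convert_hf_to_gguf.py feeds the writer:

```python
import numpy as np
from gguf import GGUFWriter

writer = GGUFWriter("model.gguf", arch="llama", use_reflinks=True)
writer.add_uint32("llama.context_length", 4096)  # example metadata
writer.add_tensor("tok_embd.weight", np.zeros((4, 8), dtype=np.float32))
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file(progress=True)
writer.close()
```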
180 changes: 169 additions & 11 deletions gguf-py/gguf/lazy.py
@@ -1,12 +1,19 @@
 from __future__ import annotations
 from abc import ABC, ABCMeta, abstractmethod
 
-import logging
-from typing import Any, Callable
+from io import BufferedReader, BufferedWriter
+from pathlib import Path
+from typing import Any, Callable, Iterable
 
+import logging
 import numpy as np
+import os
+import shutil
+
 from numpy.typing import DTypeLike
 
+from .utility import LocalTensorRange
+
 
 logger = logging.getLogger(__name__)
 
@@ -20,10 +27,11 @@ def __getattr__(self, name: str) -> Any:
                 return type(self)._wrap_fn(
                     (lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)),
                     use_self=self,
+                    data_noop=name in ("view", "reshape", "squeeze", "unsqueeze", "contiguous"),
                 )
             elif isinstance(meta_attr, self._tensor_type):
                 # e.g. self.T with torch.Tensor should still be wrapped
-                return type(self)._wrap_fn(lambda s: getattr(s, name))(self)
+                return type(self)._wrap_fn(lambda s: getattr(s, name), use_self=self)()
             else:
                 # no need to wrap non-tensor properties,
                 # and they likely don't depend on the actual contents of the tensor
@@ -39,8 +47,9 @@ def mk_wrap(op_name: str, *, meta_noop: bool = False):
             def wrapped_special_op(self, *args, **kwargs):
                 return type(self)._wrap_fn(
                     getattr(type(self)._tensor_type, op_name),
+                    use_self=self,
                     meta_noop=meta_noop,
-                )(self, *args, **kwargs)
+                )(*args, **kwargs)
             return wrapped_special_op
 
     # special methods bypass __getattr__, so they need to be added manually
@@ -76,14 +85,16 @@ class LazyBase(ABC, metaclass=LazyMeta):
     _args: tuple
     _kwargs: dict[str, Any]
     _func: Callable[[Any], Any] | None
+    _ranges: tuple[LocalTensorRange, ...]
 
-    def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None):
+    def __init__(self, *, meta: Any, data: Any | None = None, args: tuple = (), kwargs: dict[str, Any] | None = None, func: Callable[[Any], Any] | None = None, ranges: tuple[LocalTensorRange, ...] = ()):
         super().__init__()
         self._meta = meta
         self._data = data
         self._args = args
         self._kwargs = kwargs if kwargs is not None else {}
         self._func = func
+        self._ranges = ranges
         assert self._func is not None or self._data is not None
 
     def __init_subclass__(cls) -> None:
@@ -107,7 +118,7 @@ def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
         return o
 
     @classmethod
-    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False) -> Callable[[Any], Any]:
+    def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike | tuple[DTypeLike, Callable[[tuple[int, ...]], tuple[int, ...]]] = False, data_noop: bool = False) -> Callable[[Any], Any]:
         def wrapped_fn(*args, **kwargs):
             if kwargs is None:
                 kwargs = {}
@@ -116,6 +127,8 @@ def wrapped_fn(*args, **kwargs):
             meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
             # TODO: maybe handle tensors in kwargs too
 
+            ranges = use_self._ranges if use_self is not None and data_noop else ()
+
             if isinstance(meta_noop, bool) and not meta_noop:
                 try:
                     res = fn(*meta_args, **kwargs)
@@ -138,7 +151,7 @@ def wrapped_fn(*args, **kwargs):
                     res = cls.meta_with_dtype_and_shape(meta_noop, res.shape)
 
             if isinstance(res, cls._tensor_type):
-                return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn)
+                return cls(meta=cls.eager_to_meta(res), args=args, kwargs=kwargs, func=fn, ranges=ranges)
             elif isinstance(res, tuple) and all(isinstance(t, cls._tensor_type) for t in res):
                 # share the evaluation between lazy tuple elements
                 shared_args: list = [args, None]
@@ -202,6 +215,7 @@ class LazyNumpyTensor(LazyBase):
     _tensor_type = np.ndarray
 
     shape: tuple[int, ...]  # Makes the type checker happy in quants.py
+    nbytes: int
 
     @classmethod
     def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
@@ -214,10 +228,154 @@ def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) ->
     def astype(self, dtype, *args, **kwargs):
         meta = type(self).meta_with_dtype_and_shape(dtype, self._meta.shape)
         full_args = (self, dtype,) + args
-        return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)))
+        ranges = self._ranges if self._meta.dtype == dtype else ()
+        return type(self)(meta=meta, args=full_args, kwargs=kwargs, func=(lambda a, *args, **kwargs: a.astype(*args, **kwargs)), ranges=ranges)
 
-    def tofile(self, *args, **kwargs):
-        eager = LazyNumpyTensor.to_eager(self)
-        return eager.tofile(*args, **kwargs)
+    def tofile(self, fid, *args, **kwargs):
+        if isinstance(fid, BufferedWriter) and len(self._ranges) > 0:
+            return copy_tensor_ranges(self, fid)
+        else:
+            eager = LazyNumpyTensor.to_eager(self)
+            return eager.tofile(fid, *args, **kwargs)
 
     # TODO: __array_function__
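Which operations preserve the ranges follows from the changes above: the data_noop methods (view, reshape, squeeze, unsqueeze, contiguous) and a dtype-preserving astype keep _ranges, while anything that computes new bytes drops them. A small sketch; the tensor is fabricated with empty ranges purely to show the plumbing, and with real safetensors-backed ranges the same calls would propagate them:

```python
import numpy as np
from gguf.lazy import LazyNumpyTensor

# Fabricated lazy tensor; _ranges is empty here, so the assertions are
# trivially true, but with safetensors-backed ranges they hold the same way.
meta = LazyNumpyTensor.meta_with_dtype_and_shape(np.float32, (64, 32))
t = LazyNumpyTensor(meta=meta, data=np.zeros((64, 32), dtype=np.float32))

u = t.squeeze()            # data_noop method: ranges carried over
v = t.astype(np.float32)   # same dtype: ranges carried over
w = t.astype(np.float16)   # dtype change: ranges dropped, bytes must be rewritten
assert u._ranges == t._ranges and v._ranges == t._ranges
assert w._ranges == ()
```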


+# For aligning blocks when reflinking
+def best_extra_offset(t: np.ndarray | LazyNumpyTensor | None, current_offset: int) -> int:
+    if not isinstance(t, LazyNumpyTensor):
+        # no file ranges, no need for an offset
+        return 0
+
+    ranges = t._ranges
+
+    histogram: dict[int, int] = {}
+
+    max_block_size = 0
+    for r in ranges:
+        # Ensure minimal alignment is 8 bytes (common with safetensors)
+        # and that the block size is valid
+        if r.offset % 8 == 0 and r.block_size > 0:
+            align_offset = r.offset % r.block_size
+            if align_offset not in histogram:
+                histogram[align_offset] = 0
+            histogram[align_offset] += r.size
+            if r.block_size > max_block_size:
+                max_block_size = r.block_size
+
+    best_offset = 0
+    best_size = 0
+    for offset, size in histogram.items():
+        if size > best_size:
+            best_size = size
+            best_offset = offset
+
+    if max_block_size > 0:
+        # the offset needs to be aligned properly
+        # or else there's probably a block size mismatch
+        assert current_offset % max_block_size == 0, current_offset % max_block_size
+
+    return best_offset
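For intuition about the histogram, take made-up numbers: with 4096-byte blocks, source ranges starting at file offsets 4104 and 12296 both sit 8 bytes past a block boundary, so offset 8 covers the most bytes and best_extra_offset would return 8:

```python
block_size = 4096
ranges = [(4104, 8192), (12296, 4096)]  # (source file offset, size), illustrative

histogram: dict[int, int] = {}
for off, size in ranges:
    if off % 8 == 0 and block_size > 0:  # same gate as best_extra_offset
        histogram[off % block_size] = histogram.get(off % block_size, 0) + size

best_offset = max(histogram, key=lambda k: histogram[k], default=0)
assert best_offset == 8  # the alignment offset covering the most bytes wins
```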


+def count_reflinkable_size(tensors: Iterable[tuple[str, np.ndarray | LazyNumpyTensor | None]]) -> int:
+    if not hasattr(os, "copy_file_range"):
+        return 0
+    size = 0
+    for name, t in tensors:
+        if isinstance(t, LazyNumpyTensor) and len(t._ranges) > 0:
+            align_offset = best_extra_offset(t, 0)
+            misaligned = 0
+            for range in t._ranges:
+                if range.block_size > 0:
+                    if range.offset % range.block_size == align_offset:
+                        size += range.size
+                    else:
+                        misaligned += 1
+            if misaligned > 0:
+                logger.debug(f"{name} misaligned for reflinking, fallback to copy for {misaligned} of {len(t._ranges)} parts")
+    return size


+# Copy tensor ranges using os.copy_file_range with aligned offsets and sizes
+# to make it more likely that copy-on-write is used where possible.
+# Block alignment is necessary for BTRFS and XFS (and likely for ZFS too).
+#
+# Falls back to shutil.copyfileobj when os.copy_file_range is not present.
+def copy_tensor_ranges(t: LazyNumpyTensor, fout: BufferedWriter):
+    ranges = t._ranges
+    assert len(ranges) > 0
+    dst_offset = fout.tell()
+    extra_offset = best_extra_offset(t, dst_offset)
+
+    if extra_offset > 0:
+        # initial padding
+        fout.write(b"\x00" * extra_offset)
+
+    dst_offset += extra_offset
+    start_offset = dst_offset
+
+    src_files: dict[Path, BufferedReader] = {}
+    for r in ranges:
+        if r.filename not in src_files:
+            src_files[r.filename] = open(r.filename, "rb")
+
+    has_copy_file_range = hasattr(os, "copy_file_range")
+
+    for r in ranges:
+        src = src_files[r.filename]
+        if has_copy_file_range:
+            if r.block_size > 0 and (r.offset % r.block_size) == (start_offset % r.block_size):
+                # Attempting to align copies for reflinking
+                #
+                # Block   0,      1,      2,      3,      4,
+                #      |___0000|0000000|0001111|1111111|111____|
+                #
+                # 1. block 0 is partially overwritten with contents from range[0]
+                # 2. blocks 1 and 2 are copied from range[0] using os.copy_file_range
+                # 3. block 2 is partially overwritten with contents from range[1]
+                # 4. blocks 3 and 4 are copied from range[1] using os.copy_file_range
+                #    (repeated for further ranges)
+                if dst_offset % r.block_size == 0:
+                    extra_size = 0
+                else:
+                    extra_size = r.block_size - (dst_offset % r.block_size)
+                    extra_size = min(extra_size, r.size)
+                    src.seek(r.offset)
+                    buf = src.read(extra_size)
+                    fout.seek(dst_offset)
+                    fout.write(buf)
+                    dst_offset += extra_size
+                    if extra_size == r.size:
+                        continue
+
+                assert dst_offset % r.block_size == 0, dst_offset % r.block_size
+
+                offset_src = r.offset + extra_size
+                offset_src_end = r.offset + r.size
+                if offset_src_end % r.block_size != 0:
+                    offset_src_end += r.block_size - (offset_src_end % r.block_size)
+                size = offset_src_end - offset_src
+                os.copy_file_range(src.fileno(), fout.fileno(), size, offset_src, dst_offset)
+                dst_offset += r.size - extra_size
+            else:
+                # not trying to use reflinks, but still using os.copy_file_range for speed
+                try:
+                    os.copy_file_range(src.fileno(), fout.fileno(), r.size, r.offset, dst_offset)
+                except OSError:
+                    # fallback when there's a problem (e.g. cross-filesystem copies)
+                    src.seek(r.offset)
+                    fout.seek(dst_offset)
+                    shutil.copyfileobj(src, fout, r.size)
+                dst_offset += r.size
+        else:
+            # not using reflinks, fallback when os.copy_file_range is not supported
+            src.seek(r.offset)
+            fout.seek(dst_offset)
+            shutil.copyfileobj(src, fout, r.size)
+            dst_offset += r.size
+
+    for f in src_files.values():
+        f.close()
+
+    fout.seek(dst_offset)
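As a standalone illustration of the primitive all of this leans on, the sketch below copies one file into another with block-aligned os.copy_file_range calls; on a copy-on-write filesystem (BTRFS, or XFS with reflink=1) such copies can be satisfied by sharing extents instead of duplicating data. Paths are examples, and this is Linux-only since os.copy_file_range is not available everywhere:

```python
import os

with open("src.safetensors", "rb") as src, open("dst.gguf", "wb") as dst:
    size = os.fstat(src.fileno()).st_size
    copied = 0
    while copied < size:
        # both offsets stay block-aligned because we always copy from/to `copied`
        n = os.copy_file_range(src.fileno(), dst.fileno(), size - copied, copied, copied)
        if n == 0:
            break  # unexpected end of input
        copied += n
```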