
Commit bbc35dd

convert : fix reflinks for stacked MoE tensors
1 parent 6b3273e · commit bbc35dd

File tree

4 files changed: 31 additions & 14 deletions


convert_hf_to_gguf.py

Lines changed: 3 additions & 1 deletion
@@ -462,7 +462,9 @@ def prepare_tensors(self):

                 # workaround BF16 not being supported by Numpy
                 if data_torch.dtype == torch.bfloat16:
-                    data_torch = data_torch.view(torch.uint8)
+                    # Need a contiguous last dimension otherwise byte view doesn't work
+                    # (problem can be reproduced with DeepSeek-V2-Lite-Chat)
+                    data_torch = data_torch.contiguous().view(torch.uint8)

                 # if data ends up empty, it means data_torch was a scalar tensor -> restore
                 if len(data_torch.shape) == 0:
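For readers wondering why the extra .contiguous() is needed: torch.Tensor.view(dtype) with a smaller element size requires the last dimension to have stride 1, and stacked or permuted MoE expert tensors can violate that. A minimal sketch of the failure and the fix, assuming only a local PyTorch install (the tensor shapes here are made up):

import torch

# A transposed view has a non-unit stride in its last dimension, similar to
# how a lazily stacked/permuted expert tensor can end up non-contiguous.
stacked = torch.randn(4, 8, dtype=torch.bfloat16)
expert = stacked.t()

try:
    expert.view(torch.uint8)  # raises: last dim must be contiguous for a byte view
except RuntimeError as err:
    print("byte view failed:", err)

# .contiguous() materializes a dense copy, after which the byte view works;
# each bf16 element becomes 2 bytes, so the last dimension doubles.
print(expert.contiguous().view(torch.uint8).shape)  # torch.Size([8, 8])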

gguf-py/gguf/gguf_writer.py

Lines changed: 2 additions & 2 deletions
@@ -30,7 +30,7 @@
 )

 from .quants import quant_shape_from_byte_shape
-from .utility import LocalTensorRange, best_alignment_offset, copy_tensor_ranges
+from .utility import LocalTensorRange, best_alignment_offset, reflink_tensor_ranges

 logger = logging.getLogger(__name__)

@@ -470,7 +470,7 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
                 if self.use_reflinks and len(ranges := getattr(ti.tensor, "_ranges", ())) > 0:
                     logger.debug(f"using reflinks for {name}")
                     start_offset = fout.tell()
-                    copy_tensor_ranges(fout, ranges, self.data_alignment)
+                    reflink_tensor_ranges(fout, ranges, self.data_alignment)
                     self.write_padding(fout, fout.tell() - start_offset)
                 else:
                     ti.tensor.tofile(fout)
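For context, the renamed call sits in a per-tensor dispatch: tensors that still carry source-file byte ranges take the reflink path and are then padded back to the data alignment, everything else falls back to a plain tofile() write. A simplified sketch of that shape, assuming the gguf-py package from this repo is installed; write_tensor and its padding arithmetic are illustrative stand-ins, not the GGUFWriter API:

from typing import BinaryIO

from gguf.utility import reflink_tensor_ranges  # helper renamed in this commit

def write_tensor(fout: BinaryIO, tensor, *, use_reflinks: bool, alignment: int = 4096) -> None:
    # Tensors produced by the lazy conversion path can carry "_ranges"
    # describing where their bytes live in the source safetensors files.
    ranges = getattr(tensor, "_ranges", ())
    if use_reflinks and len(ranges) > 0:
        start = fout.tell()
        reflink_tensor_ranges(fout, ranges, alignment)
        # pad to the next alignment boundary, like GGUFWriter.write_padding() does
        fout.write(b"\x00" * (-(fout.tell() - start) % alignment))
    else:
        tensor.tofile(fout)  # regular materialized write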

gguf-py/gguf/lazy.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def __getattr__(self, name: str) -> Any:
             return type(self)._wrap_fn(
                 (lambda s, *args, **kwargs: getattr(s, name)(*args, **kwargs)),
                 use_self=self,
-                data_noop=name in ("view", "reshape", "squeeze", "unsqueeze"),
+                data_noop=name in ("view", "reshape", "squeeze", "unsqueeze", "contiguous"),
             )
         elif isinstance(meta_attr, self._tensor_type):
             # e.g. self.T with torch.Tensor should still be wrapped
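This one-word change is what keeps reflinks alive after the new .contiguous() call in convert_hf_to_gguf.py: names in the data_noop set are treated as not touching the underlying data, presumably so the source byte ranges that reflinking depends on survive the call instead of being dropped by a materializing copy. A rough, illustrative sketch of that idea (not the gguf-py implementation; LazyTensorSketch and its fields are made up):

from dataclasses import dataclass

# Method names treated as "data no-ops": they change shape/layout metadata,
# not the bytes of the already-materialized source data.
DATA_NOOPS = ("view", "reshape", "squeeze", "unsqueeze", "contiguous")

@dataclass(frozen=True)
class LazyTensorSketch:
    shape: tuple[int, ...]
    ranges: tuple = ()  # stand-in for the real _ranges (source-file byte ranges)

    def apply(self, op_name: str, new_shape: tuple[int, ...]) -> "LazyTensorSketch":
        if op_name in DATA_NOOPS:
            # keep the provenance: the tensor can still be reflinked later
            return LazyTensorSketch(new_shape, self.ranges)
        # any real computation would produce new bytes, so the ranges are lost
        return LazyTensorSketch(new_shape, ())

t = LazyTensorSketch((64, 128), ranges=(("model-00001-of-00002.safetensors", 0, 16384),))
print(t.apply("contiguous", (64, 128)).ranges)  # ranges survive -> reflink still possible
print(t.apply("sigmoid", (64, 128)).ranges)     # () -> falls back to a normal write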

gguf-py/gguf/utility.py

Lines changed: 25 additions & 10 deletions
@@ -293,7 +293,8 @@ def best_alignment_offset(ranges: tuple[LocalTensorRange, ...], alignment: int):
     best_offset = 0
     best_size = 0
     for offset, size in hist.items():
-        if size > best_size:
+        # Ensure minimal alignment is 8-bytes (common with safetensors)
+        if size > best_size and offset % 8 == 0:
             best_size = size
             best_offset = offset
     return best_offset
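For illustration, the histogram this loop scans maps in-block offsets to a weight (its construction is outside this hunk); the new condition simply skips offsets that are not multiples of 8, which the added comment notes is the alignment commonly produced by safetensors. A standalone sketch with a hypothetical size-weighted histogram:

from collections import defaultdict

def pick_alignment_offset(ranges: list[tuple[int, int]], alignment: int = 4096) -> int:
    # ranges: (absolute_offset, size) pairs for one tensor's source data.
    # Vote for the in-block offset covering the most bytes (hypothetical
    # weighting; the real histogram construction is not shown in this hunk).
    hist: dict[int, int] = defaultdict(int)
    for offset, size in ranges:
        hist[offset % alignment] += size
    best_offset = 0
    best_size = 0
    for offset, size in hist.items():
        # Ensure minimal alignment is 8-bytes (common with safetensors)
        if size > best_size and offset % 8 == 0:
            best_size = size
            best_offset = offset
    return best_offset

print(pick_alignment_offset([(8, 4096), (4104, 4096)]))  # -> 8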
@@ -303,7 +304,7 @@ def best_alignment_offset(ranges: tuple[LocalTensorRange, ...], alignment: int):
 # Copy tensor ranges using os.copy_file_range with aligned offsets and sizes
 # to make it more likely that copy-on-write is used where possible.
 # Block alignment is necessary for BTRFS and XFS (and likely for ZFS too).
-def copy_tensor_ranges(fout: BufferedWriter, ranges: tuple[LocalTensorRange, ...], alignment: int = 4096):
+def reflink_tensor_ranges(fout: BufferedWriter, ranges: tuple[LocalTensorRange, ...], alignment: int = 4096):
     assert len(ranges) > 0
     dst_offset = fout.tell()
     assert dst_offset % alignment == 0, dst_offset % alignment
@@ -331,26 +332,40 @@ def copy_tensor_ranges(fout: BufferedWriter, ranges: tuple[LocalTensorRange, ...
         src = src_files[r.filename]
         if this_align_offset != align_offset:
             logger.debug(f"copy-on-write can't be used ({i}/{len(ranges)})")
-        if i > 0 and dst_offset % alignment != 0:
-            # Write the correct data between blocks even when they are non-consecutive
+            # relying on os.copy_file_range to fallback to a non-aligned copy
+
+        # Block  0,      1,      2,      3,      4,
+        # |___0000|0000000|0001111|1111111|111____|
+        #
+        # 1. blocks 0, 1 and 2 are copied from range[0] using os.copy_file_range
+        # 2. block 2 is partially overwritten with contents from range[1]
+        # 3. blocks 3 and 4 are copied from range[1] using os.copy_file_range
+        #
+        # (2 and 3 are repeated with further blocks if there are more ranges)
+        if i == 0:
+            extra_size = -align_offset
+        elif dst_offset % alignment == 0:
+            extra_size = 0
+        else:
             extra_size = alignment - (dst_offset % alignment)
+            extra_size = min(extra_size, r.size)
             src.seek(r.offset)
             buf = src.read(extra_size)
             fout.seek(dst_offset)
             fout.write(buf)
             dst_offset += extra_size
-            assert dst_offset % alignment == 0, dst_offset % alignment
-            offset_src = r.offset + extra_size
-        else:
-            # TODO: is this always correct?
-            offset_src = r.offset - align_offset
+            if extra_size == r.size:
+                continue
+
+        assert dst_offset % alignment == 0, dst_offset % alignment

+        offset_src = r.offset + extra_size
         offset_src_end = r.offset + r.size
         if offset_src_end % alignment != 0:
             offset_src_end += alignment - (offset_src_end % alignment)
         size = offset_src_end - offset_src
         os.copy_file_range(src.fileno(), fout.fileno(), size, offset_src, dst_offset)
-        dst_offset += r.size
+        dst_offset += r.size - extra_size

     for f in src_files.values():
         f.close()
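The diagram above is the core of the fix: stacked MoE ranges rarely start or end on a block boundary, so the overlap into a shared block is written with a plain read/write and only the remaining span, rounded up to whole blocks, goes through os.copy_file_range. A standalone sketch of that strategy (Linux-only; copy_range_aligned, ALIGN and the example values are illustrative, not the gguf-py API; real extent sharing additionally needs a copy-on-write filesystem such as BTRFS or XFS):

import os
from typing import BinaryIO

ALIGN = 4096  # assumed filesystem block size

def copy_range_aligned(src: BinaryIO, src_off: int, size: int, dst: BinaryIO, dst_off: int) -> int:
    """Copy one source range so whole destination blocks can be reflinked.

    If dst_off is mid-block, the head of the range is written by hand into the
    partially filled block (step 2 in the diagram); the rest is copied with
    os.copy_file_range at a block-aligned destination offset, rounded up to
    whole blocks (steps 1 and 3). Returns the new destination offset."""
    head = min(-dst_off % ALIGN, size)
    if head:
        src.seek(src_off)
        dst.seek(dst_off)
        dst.write(src.read(head))
        src_off += head
        dst_off += head
        size -= head
    if size:
        # over-copy to the end of the last block; the next range's head write
        # (or final padding) overwrites the few extra bytes this brings along
        span = size + (-size % ALIGN)
        os.copy_file_range(src.fileno(), dst.fileno(), span, src_off, dst_off)
        dst_off += size
    return dst_off

Called back-to-back for range[0] and range[1], this reproduces the numbered steps in the diagram: the second call's head write lands in the block the first call over-copied. Extents are only shared when the source offset sits at the same position within its block, which is what best_alignment_offset arranges; otherwise os.copy_file_range quietly degrades to a regular copy, the fallback the new comment refers to.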
