79 changes: 76 additions & 3 deletions gguf-py/gguf/gguf_writer.py
@@ -14,6 +14,7 @@
from string import ascii_letters, digits

import numpy as np
from .stream_cast import write_cast

from .constants import (
GGUF_DEFAULT_ALIGNMENT,
@@ -33,6 +34,9 @@

logger = logging.getLogger(__name__)

def _stream_log(msg: str) -> None:
if os.environ.get("GGUF_STREAM_LOG"):
print(f"[gguf-writer] {msg}", flush=True)

SHARD_NAME_FORMAT = "{:s}-{:05d}-of-{:05d}.gguf"

@@ -411,12 +415,43 @@ def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
fout = self.fout[file_id]

# pop the first tensor info
# TODO: cleaner way to get the first key
Review comment (Collaborator):
There's no reason to remove that comment. There's nothing which attempted to fix the stated TODO.

first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
ti = self.tensors[file_id].pop(first_tensor_name)
assert ti.nbytes == tensor.nbytes

# align to data_alignment before writing tensor data
self.write_padding(fout, fout.tell())

# --- writer-side streaming for pure dtype casts (survives when tofile() isn't used) ---
Review comment (Collaborator):
Maybe it would be simpler to keep using tofile. It would be more convenient at least. I think it's possible to make #12837 use tofile and that would remove the need to find an alternative to overriding tofile.

try:
if getattr(tensor, "_gguf_stream_cast", False):
# derive the pre-cast lazy source from the astype() node args
base = getattr(tensor, "_args", None)
base = base[0] if base else None

src_arr = None
try:
src_arr = type(base).to_eager(base)
except Exception:
src_arr = None

if isinstance(src_arr, np.ndarray):
try:
mb = int(os.environ.get("GGUF_CAST_CHUNK_MB", "64") or "64")
except Exception:
mb = 64
tgt_dtype = getattr(tensor, "_gguf_stream_cast_dtype", src_arr.dtype)
_stream_log(f"writer: streaming cast (chunk={mb} MiB) dst={tgt_dtype} shape={getattr(tensor, 'shape', '?')}")
Review comment (Collaborator):
Consider using logger.debug instead of reinventing conditional debug logging.

Suggested change
_stream_log(f"writer: streaming cast (chunk={mb} MiB) dst={tgt_dtype} shape={getattr(tensor, 'shape', '?')}")
logger.debug(f"streaming cast (chunk={mb} MiB) dst={tgt_dtype} shape={getattr(tensor, 'shape', '?')}")

write_cast(fout, src_arr, tgt_dtype, mb)
self.write_padding(fout, ti.nbytes)
self.state = WriterState.WEIGHTS
return
except Exception:
# fall back to normal path on any unexpected issue
pass
# ---------------------------------------------------------------------------------------

# Fallback: rely on the object’s own tofile() (handles lazy or eager)
tensor.tofile(fout)
self.write_padding(fout, tensor.nbytes)

@@ -452,8 +487,46 @@ def write_tensors_to_file(self, *, progress: bool = False) -> None:
# relying on the fact that Python dicts preserve insertion order (since 3.7)
for ti in tensors.values():
assert ti.tensor is not None # can only iterate once over the tensors
assert ti.tensor.nbytes == ti.nbytes
ti.tensor.tofile(fout)
obj = ti.tensor
assert obj.nbytes == ti.nbytes

# Try writer-side streaming for pure dtype casts
streamed = False
try:
if getattr(obj, "_gguf_stream_cast", False):
# derive the pre-cast lazy source from the astype() node args
base = getattr(obj, "_args", None)
base = base[0] if base else None

src_arr = None
try:
src_arr = type(base).to_eager(base)
except Exception:
src_arr = None

if isinstance(src_arr, np.ndarray):
try:
mb = int(os.environ.get("GGUF_CAST_CHUNK_MB", "64") or "64")
except Exception:
mb = 64
tgt_dtype = getattr(obj, "_gguf_stream_cast_dtype", src_arr.dtype)
_stream_log(f"writer: streaming cast (chunk={mb} MiB) dst={tgt_dtype} shape={getattr(obj, 'shape', '?')}")
write_cast(fout, src_arr, tgt_dtype, mb)
streamed = True
except Exception:
streamed = False # fall back below on any issue

if streamed:
if shard_bar is not None:
shard_bar.update(ti.nbytes)
if bar is not None:
bar.update(ti.nbytes)
self.write_padding(fout, ti.nbytes)
ti.tensor = None
continue

# Fallback: object’s tofile()
obj.tofile(fout)
if shard_bar is not None:
shard_bar.update(ti.nbytes)
if bar is not None:
70 changes: 70 additions & 0 deletions gguf-py/gguf/lazy.py
@@ -5,6 +5,7 @@
from typing import Any, Callable

import numpy as np
import os
from numpy.typing import DTypeLike


@@ -221,3 +222,72 @@ def tofile(self, *args, **kwargs):
return eager.tofile(*args, **kwargs)

# TODO: __array_function__

# --- begin low-memory streaming for dtype casts ------------------------------
# Tunable via env:
# GGUF_CAST_CHUNK_MB (MiB per chunk; default 64)
# GGUF_STREAM_LOG (set to any non-empty value to print diagnostics)

import sys
from .stream_cast import write_cast # sibling helper

try:
_LAZY_ORIG_ASTYPE = getattr(LazyNumpyTensor, "astype")
_LAZY_ORIG_TOFILE = getattr(LazyNumpyTensor, "tofile")
except NameError:
raise RuntimeError("Expected LazyNumpyTensor to be defined above this block")

def _slog(msg: str) -> None:
if os.environ.get("GGUF_STREAM_LOG"):
print(f"[gguf-stream] {msg}", file=sys.stdout, flush=True)
Review comment (@compilade, Collaborator, Sep 2, 2025) on lines +240 to +242:
The uses of _slog should also probably be replaced with logger.debug. Using an undocumented environment variable isn't very user-friendly. (unless the logs are not intended to be printed)
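A minimal sketch of what that could look like, assuming lazy.py keeps (or gains) the same module-level logger that gguf_writer.py already defines:

import logging

logger = logging.getLogger(__name__)

# instead of _slog(...); only emitted when the application enables DEBUG for this logger
logger.debug("streaming cast write: chunk=%d MiB; dst=%s", mb, tgt)

The %-style arguments also avoid building the message string unless DEBUG logging is actually enabled.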


def _gguf_streaming_astype(self, dtype, *args, **kwargs):
"""Tag astype results so writer/tofile can stream them later."""
tgt = np.dtype(dtype)
out = _LAZY_ORIG_ASTYPE(self, dtype, *args, **kwargs)
# mark as streamable and record target dtype
setattr(out, "_gguf_stream_cast", True)
setattr(out, "_gguf_stream_cast_dtype", tgt)
# NEW: record the *source* lazy tensor for writer-side streaming
setattr(out, "_gguf_stream_cast_src", self)
Review comment (Collaborator) on lines +249 to +252:
Alternatively, a single attr could be used containing e.g. a tuple[LazyNumpyTensor, np.dtype], since they are expected to all exist when one of them does anyway.

Using three separate attrs seems excessive (especially since the existence one is redundant with the other ones existing).

This should also simplify (and remove the need for) most of the edge-case handling for missing values (e.g. the missing base array).
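A small sketch of that single-attribute variant, with a made-up attribute name and reusing the names already defined in this block:

def _gguf_streaming_astype(self, dtype, *args, **kwargs):
    tgt = np.dtype(dtype)
    out = _LAZY_ORIG_ASTYPE(self, dtype, *args, **kwargs)
    # one attr: (pre-cast lazy source, target dtype); absence means "not a pure cast"
    out._gguf_stream_cast_info = (self, tgt)
    return out

Consumers would then call getattr(obj, "_gguf_stream_cast_info", None) and branch on is None, which removes the separate existence flag and most of the missing-base fallback handling.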

_slog(f"mark streamable astype: src={getattr(self._meta,'dtype','?')} -> dst={tgt}")
return out

def _gguf_streaming_tofile(self, fout, *args, **kwargs):
"""If this lazy tensor is a pure dtype-cast, stream in chunks; else fallback."""
if not getattr(self, "_gguf_stream_cast", False):
return _LAZY_ORIG_TOFILE(self, fout, *args, **kwargs)

# default chunk size: 64 MiB (can override via GGUF_CAST_CHUNK_MB)
try:
mb = int(os.environ.get("GGUF_CAST_CHUNK_MB", "64") or "64")
except Exception:
mb = 64
mb = max(1, mb)
Review comment (Collaborator) on lines +261 to +266:
Does it make sense to make this configurable at runtime? A default value should be fine here, I think?

Otherwise this is parsing environment variables at each written tensor. (very minor overhead, though)
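One possible simplification, sketched under the assumption that a module-level constant (made-up name) is acceptable: parse the variable once at import time and let the per-tensor code just read it.

# parsed once at import time; per-tensor paths then reuse the constant
try:
    GGUF_CAST_CHUNK_MB = max(1, int(os.environ.get("GGUF_CAST_CHUNK_MB", "64") or "64"))
except ValueError:
    GGUF_CAST_CHUNK_MB = 64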


# Prefer the explicitly tagged source lazy tensor if present (step 2)
base = getattr(self, "_gguf_stream_cast_src", None)

# Fallback to first arg (older astype behavior) if not tagged
if base is None:
base = getattr(self, "_args", None)
base = base[0] if base else None

try:
src_arr = LazyNumpyTensor.to_eager(base)
except Exception:
src_arr = None

if not isinstance(src_arr, np.ndarray):
_slog("fallback to original tofile: cannot materialize source to ndarray")
return _LAZY_ORIG_TOFILE(self, fout, *args, **kwargs)

tgt = getattr(self, "_gguf_stream_cast_dtype", src_arr.dtype)
_slog(f"streaming cast write: chunk={mb} MiB; dst={tgt}; shape={getattr(self._meta,'shape','?')}")
write_cast(fout, src_arr, tgt, mb)
return

# Install patches
LazyNumpyTensor.astype = _gguf_streaming_astype
LazyNumpyTensor.tofile = _gguf_streaming_tofile
Review comment (Collaborator) on lines +290 to +292:
Would it be cleaner to directly modify the source code of LazyNumpyTensor.astype and LazyNumpyTensor.tofile instead of patching them?

Unless you want this to be disable-able, in which case a subclass (although not sure what name to use) could also be appropriate, and then it could be used in LazyTorchTensor.numpy().

Are there cases where astype chunking shouldn't be used?

Assuming it's implemented correctly, I think the tag (used to detect whether to stream astype on chunks) will not be kept on transformations of the LazyNumpyTensor (because a new one is created to track the transformations), and so it should be safe in pretty much all cases.
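A rough sketch of that subclass alternative (the class name is a placeholder, and the single tuple attribute from the earlier suggestion is assumed):

class StreamingLazyNumpyTensor(LazyNumpyTensor):
    def astype(self, dtype, *args, **kwargs):
        out = super().astype(dtype, *args, **kwargs)
        # tag pure casts with (pre-cast lazy source, target dtype)
        out._gguf_stream_cast_info = (self, np.dtype(dtype))
        return out

    def tofile(self, fout, *args, **kwargs):
        info = getattr(self, "_gguf_stream_cast_info", None)
        if info is None:
            return super().tofile(fout, *args, **kwargs)
        base, tgt = info
        write_cast(fout, LazyNumpyTensor.to_eager(base), tgt, 64)  # 64 MiB default chunk

LazyTorchTensor.numpy() could then return this subclass, making the streaming behavior opt-in rather than a module-wide patch.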

# --- end low-memory streaming for dtype casts ------------------------------
80 changes: 80 additions & 0 deletions gguf-py/gguf/stream_cast.py
@@ -0,0 +1,80 @@
# gguf-py/gguf/stream_cast.py
from __future__ import annotations
from typing import Any
import os
import sys
import numpy as np


def _slog(msg: str) -> None:
"""Conditional debug logging when GGUF_STREAM_LOG is set."""
if os.environ.get("GGUF_STREAM_LOG"):
print(f"[gguf-stream] {msg}", file=sys.stdout, flush=True)


def _chunk_elems(src_dtype: np.dtype, dst_dtype: np.dtype, chunk_mb: int) -> int:
"""
Compute how many elements to process per chunk so that each chunk is
approximately `chunk_mb` MiB of the *larger* of the source/destination itemsize.
"""
try:
mb = int(chunk_mb)
except Exception:
mb = 64
mb = max(1, mb)
item = max(np.dtype(src_dtype).itemsize, np.dtype(dst_dtype).itemsize)
return max(1, (mb * 1024 * 1024) // item)


def write_cast(fout, src: np.ndarray, dst_dtype: Any, chunk_mb: int) -> None:
"""
Stream `src.astype(dst_dtype)` to `fout` in fixed-size chunks to cap peak RSS.
This matches the import site in lazy.py:
from .stream_cast import write_cast
Parameters
----------
fout : file-like object
Open file handle to write bytes to (must support .write()).
src : np.ndarray
Source ndarray to be converted and streamed.
dst_dtype : Any
Target dtype (anything accepted by np.dtype).
chunk_mb : int
Desired chunk size in MiB (will be clamped to >= 1).
"""
dst = np.dtype(dst_dtype)
flat = src.reshape(-1)
n = flat.size
ce = _chunk_elems(flat.dtype, dst, chunk_mb)

_slog(
f"write_cast: src={flat.dtype} -> dst={dst}; n={n}; "
f"chunk={max(1, int(chunk_mb))} MiB; elems/chunk={ce}"
)

start = 0
# local binding for tiny speed bump
mv = memoryview
while start < n:
end = min(start + ce, n)
# copy=False avoids an extra tmp when possible
chunk = flat[start:end].astype(dst, copy=False)
fout.write(mv(chunk).tobytes())
Review comment (Collaborator):
Alternatively, directly use the memoryview of the np.ndarray, which is at np.ndarray.data.

Suggested change
fout.write(mv(chunk).tobytes())
fout.write(chunk.data)

I don't know if it would handle non-contiguous strides correctly; in this case, though, the previous .reshape(-1) makes it contiguous anyway, so this should work.

There's also tofile :)

Suggested change
fout.write(mv(chunk).tobytes())
chunk.tofile(fout)

No idea of the performance difference of these approaches, but I think .tobytes() should be avoided because it returns "a copy of the raw contents of data memory".

start = end


# Optional: writer-side API that accepts chunk size in bytes (used by gguf_writer)
def stream_write(fout, src_arr: np.ndarray, dst_dtype: Any, chunk_bytes: int) -> None:
"""
Same as write_cast, but the chunk size is given in bytes.
Kept for compatibility with earlier helper drafts.
"""
if not isinstance(chunk_bytes, int) or chunk_bytes <= 0:
chunk_mb = 64
else:
# round bytes to MiB for the element count helper
chunk_mb = max(1, chunk_bytes // (1024 * 1024))

write_cast(fout, src_arr, dst_dtype, chunk_mb)
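
For reference, a minimal usage sketch of the new helper; the file name and array here are purely illustrative:

import numpy as np

from gguf.stream_cast import write_cast

# illustrative ~64 MiB float32 source tensor, streamed out as float16
src = np.random.rand(4096, 4096).astype(np.float32)
with open("tensor.bin", "wb") as fout:
    write_cast(fout, src, np.float16, 64)

Peak extra memory stays around one chunk of the cast output rather than a full float16 copy of the tensor.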