From f119cc8bf8697e3c7602d8e73ca7062d1ef1284c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 13:44:05 +0100 Subject: [PATCH 1/9] Refactor gguf scripts to improve metadata handling Added contents method to ReaderField class Added endianess property to GGUFReader class --- gguf-py/gguf/gguf_reader.py | 55 ++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index e17a4e83147d5..13c635c334849 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -6,6 +6,7 @@ import logging import os +import sys from collections import OrderedDict from typing import Any, Literal, NamedTuple, TypeVar, Union @@ -15,7 +16,6 @@ from .quants import quant_shape_to_byte_shape if __name__ == "__main__": - import sys from pathlib import Path # Allow running file in package as a script. @@ -28,6 +28,7 @@ GGUF_VERSION, GGMLQuantizationType, GGUFValueType, + GGUFEndian, ) logger = logging.getLogger(__name__) @@ -53,6 +54,48 @@ class ReaderField(NamedTuple): types: list[GGUFValueType] = [] + def contents(self, index_or_slice: int | slice = slice(None)) -> Any: + if self.types: + to_string = lambda x: str(x.tobytes(), encoding='utf-8') + main_type = self.types[0] + + if main_type == GGUFValueType.ARRAY: + sub_type = self.types[-1] + + if sub_type == GGUFValueType.STRING: + indices = self.data[index_or_slice] + + if isinstance(index_or_slice, int): + return to_string(self.parts[indices]) + else: + return [to_string(self.parts[idx]) for idx in indices] + else: + # FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too + + # Check if it's unsafe to perform slice optimization on data + # if any(True for idx in self.data if len(self.parts[idx]) != 1): + # optim_slice = slice(None) + # else: + # optim_slice = index_or_slice + # index_or_slice = slice(None) + + # if isinstance(optim_slice, int): + # return self.parts[self.data[optim_slice]].tolist()[0] + # else: + # return [pv for idx in self.data[optim_slice] for pv in self.parts[idx].tolist()][index_or_slice] + + if isinstance(index_or_slice, int): + return self.parts[self.data[index_or_slice]].tolist()[0] + else: + return [pv for idx in self.data[index_or_slice] for pv in self.parts[idx].tolist()] + + if main_type == GGUFValueType.STRING: + return to_string(self.parts[-1]) + else: + return self.parts[-1].tolist()[0] + + return None + class ReaderTensor(NamedTuple): name: str @@ -105,6 +148,15 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = version = temp_version[0] if version not in READER_SUPPORTED_VERSIONS: raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle') + if sys.byteorder == "little": + # Host is little endian + host_endian = GGUFEndian.LITTLE + swapped_endian = GGUFEndian.BIG + else: + # Sorry PDP or other weird systems that don't use BE or LE. 
+ host_endian = GGUFEndian.BIG + swapped_endian = GGUFEndian.LITTLE + self.endianess = swapped_endian if self.byte_order == "S" else host_endian self.fields: OrderedDict[str, ReaderField] = OrderedDict() self.tensors: list[ReaderTensor] = [] offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32])) @@ -190,6 +242,7 @@ def _get_field_parts( offs += int(alen.nbytes) aparts: list[npt.NDArray[Any]] = [raw_itype, alen] data_idxs: list[int] = [] + # FIXME: Handle multi-dimensional arrays properly instead of flattening for idx in range(alen[0]): curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0]) if idx == 0: From b473285afc3fea032ad443edcd01142be2e69f5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 13:46:46 +0100 Subject: [PATCH 2/9] update scripts --- gguf-py/gguf/scripts/gguf_convert_endian.py | 23 ++++------- gguf-py/gguf/scripts/gguf_dump.py | 34 ++++++++--------- gguf-py/gguf/scripts/gguf_new_metadata.py | 42 ++------------------- 3 files changed, 28 insertions(+), 71 deletions(-) diff --git a/gguf-py/gguf/scripts/gguf_convert_endian.py b/gguf-py/gguf/scripts/gguf_convert_endian.py index f97e91bd4c26f..a7287fa294865 100755 --- a/gguf-py/gguf/scripts/gguf_convert_endian.py +++ b/gguf-py/gguf/scripts/gguf_convert_endian.py @@ -20,22 +20,15 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: - if np.uint32(1) == np.uint32(1).newbyteorder("<"): - # Host is little endian - host_endian = "little" - swapped_endian = "big" + file_endian = reader.endianess.name + if reader.byte_order == 'S': + host_endian = 'BIG' if file_endian == 'LITTLE' else 'LITTLE' else: - # Sorry PDP or other weird systems that don't use BE or LE. - host_endian = "big" - swapped_endian = "little" - if reader.byte_order == "S": - file_endian = swapped_endian - else: - file_endian = host_endian - order = host_endian if args.order == "native" else args.order - logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") + host_endian = file_endian + order = host_endian if args.order == "native" else args.order.upper() + logger.info(f"* Host is {host_endian} endian, GGUF file seems to be {file_endian} endian") if file_endian == order: - logger.info(f"* File is already {order.upper()} endian. Nothing to do.") + logger.info(f"* File is already {order} endian. 
Nothing to do.") sys.exit(0) logger.info("* Checking tensors for conversion compatibility") for tensor in reader.tensors: @@ -45,7 +38,7 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None gguf.GGMLQuantizationType.Q8_0, ): raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") - logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") + logger.info(f"* Preparing to convert from {file_endian} to {order}") if args.dry_run: return logger.warning("*** Warning *** Warning *** Warning **") diff --git a/gguf-py/gguf/scripts/gguf_dump.py b/gguf-py/gguf/scripts/gguf_dump.py index 20f23d729f4b9..e282892d645c7 100755 --- a/gguf-py/gguf/scripts/gguf_dump.py +++ b/gguf-py/gguf/scripts/gguf_dump.py @@ -9,8 +9,6 @@ from pathlib import Path from typing import Any -import numpy as np - # Necessary to load the local gguf package if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists(): sys.path.insert(0, str(Path(__file__).parent.parent.parent)) @@ -21,11 +19,11 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: - host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG' + file_endian = reader.endianess.name if reader.byte_order == 'S': - file_endian = 'BIG' if host_endian == 'LITTLE' else 'LITTLE' + host_endian = 'BIG' if file_endian == 'LITTLE' else 'LITTLE' else: - file_endian = host_endian + host_endian = file_endian return (host_endian, file_endian) @@ -45,12 +43,20 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: pretty_type = str(field.types[-1].name) log_message = f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}' - if len(field.types) == 1: + if field.types: curr_type = field.types[0] if curr_type == GGUFValueType.STRING: - log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60])) - elif field.types[0] in reader.gguf_scalar_to_np: - log_message += ' = {0}'.format(field.parts[-1][0]) + content = field.contents() + if len(content) > 60: + content = content[:57] + '...' 
+ log_message += ' = {0}'.format(repr(content)) + elif curr_type in reader.gguf_scalar_to_np: + log_message += ' = {0}'.format(field.contents()) + else: + content = repr(field.contents(slice(6))) + if len(field.data) > 6: + content = content[:-1] + ', ...]' + log_message += ' = {0}'.format(content) print(log_message) # noqa: NP100 if args.no_tensors: return @@ -82,15 +88,9 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: curr["array_types"] = [t.name for t in field.types][1:] if not args.json_array: continue - itype = field.types[-1] - if itype == GGUFValueType.STRING: - curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data] - else: - curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()] - elif field.types[0] == GGUFValueType.STRING: - curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8") + curr["value"] = field.contents() else: - curr["value"] = field.parts[-1].tolist()[0] + curr["value"] = field.contents() if not args.no_tensors: for idx, tensor in enumerate(reader.tensors): tensors[tensor.name] = { diff --git a/gguf-py/gguf/scripts/gguf_new_metadata.py b/gguf-py/gguf/scripts/gguf_new_metadata.py index a8cfc9d581b3d..5128403fcb245 100755 --- a/gguf-py/gguf/scripts/gguf_new_metadata.py +++ b/gguf-py/gguf/scripts/gguf_new_metadata.py @@ -27,45 +27,10 @@ class MetadataDetails(NamedTuple): description: str = '' -def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian: - if np.uint32(1) == np.uint32(1).newbyteorder("<"): - # Host is little endian - host_endian = gguf.GGUFEndian.LITTLE - swapped_endian = gguf.GGUFEndian.BIG - else: - # Sorry PDP or other weird systems that don't use BE or LE. - host_endian = gguf.GGUFEndian.BIG - swapped_endian = gguf.GGUFEndian.LITTLE - - if reader.byte_order == "S": - return swapped_endian - else: - return host_endian - - -def decode_field(field: gguf.ReaderField | None) -> Any: - if field and field.types: - main_type = field.types[0] - - if main_type == gguf.GGUFValueType.ARRAY: - sub_type = field.types[-1] - - if sub_type == gguf.GGUFValueType.STRING: - return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data] - else: - return [pv for idx in field.data for pv in field.parts[idx].tolist()] - if main_type == gguf.GGUFValueType.STRING: - return str(bytes(field.parts[-1]), encoding='utf-8') - else: - return field.parts[-1][0] - - return None - - def get_field_data(reader: gguf.GGUFReader, key: str) -> Any: field = reader.get_field(key) - return decode_field(field) + return field.contents() if field else None def find_token(token_list: Sequence[int], token: str) -> Sequence[int]: @@ -93,7 +58,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new logger.debug(f'Removing {field.name}') continue - old_val = MetadataDetails(field.types[0], decode_field(field)) + old_val = MetadataDetails(field.types[0], field.contents()) val = new_metadata.get(field.name, old_val) if field.name in new_metadata: @@ -192,7 +157,6 @@ def main() -> None: reader = gguf.GGUFReader(args.input, 'r') arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE) - endianess = get_byteorder(reader) token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or [] @@ -230,7 +194,7 @@ def main() -> None: sys.exit(0) logger.info(f'* Writing: {args.output}') - writer = gguf.GGUFWriter(args.output, arch=arch, endianess=endianess) + writer = gguf.GGUFWriter(args.output, arch=arch, endianess=reader.endianess) alignment = get_field_data(reader, 
gguf.Keys.General.ALIGNMENT) if alignment is not None: From ef976c32b228707e8e69784ac0c9c0a7c085a62d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 13:47:34 +0100 Subject: [PATCH 3/9] fix import --- gguf-py/examples/reader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gguf-py/examples/reader.py b/gguf-py/examples/reader.py index d841048c6d701..703b782b5fa66 100644 --- a/gguf-py/examples/reader.py +++ b/gguf-py/examples/reader.py @@ -2,12 +2,14 @@ import logging import sys from pathlib import Path -from gguf.gguf_reader import GGUFReader logger = logging.getLogger("reader") +# Necessary to load the local gguf package sys.path.insert(0, str(Path(__file__).parent.parent)) +from gguf.gguf_reader import GGUFReader + def read_gguf_file(gguf_file_path): """ From 6670f933567126bb984ce028614970d0808aa341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 14:00:21 +0100 Subject: [PATCH 4/9] remove unused import --- gguf-py/gguf/scripts/gguf_new_metadata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gguf-py/gguf/scripts/gguf_new_metadata.py b/gguf-py/gguf/scripts/gguf_new_metadata.py index 5128403fcb245..7aff6c9259a1f 100755 --- a/gguf-py/gguf/scripts/gguf_new_metadata.py +++ b/gguf-py/gguf/scripts/gguf_new_metadata.py @@ -8,7 +8,6 @@ import json from pathlib import Path -import numpy as np from tqdm import tqdm from typing import Any, Sequence, NamedTuple From f5185559d31c0e588937b165049c5a3748dfc58d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 14:19:04 +0100 Subject: [PATCH 5/9] attempt to work around flake and pyright errors --- gguf-py/gguf/gguf_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 13c635c334849..144880f10c5b5 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -56,14 +56,14 @@ class ReaderField(NamedTuple): def contents(self, index_or_slice: int | slice = slice(None)) -> Any: if self.types: - to_string = lambda x: str(x.tobytes(), encoding='utf-8') + to_string = lambda x: str(x.tobytes(), encoding='utf-8') # noqa: E731 main_type = self.types[0] if main_type == GGUFValueType.ARRAY: sub_type = self.types[-1] if sub_type == GGUFValueType.STRING: - indices = self.data[index_or_slice] + indices: int | list = self.data[index_or_slice] if isinstance(index_or_slice, int): return to_string(self.parts[indices]) From e90e0c758d64a4c0b49fd547e092609183c4a9ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 14:24:46 +0100 Subject: [PATCH 6/9] second attempt --- gguf-py/gguf/gguf_reader.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 144880f10c5b5..897a0a192e35f 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -63,12 +63,14 @@ def contents(self, index_or_slice: int | slice = slice(None)) -> Any: sub_type = self.types[-1] if sub_type == GGUFValueType.STRING: - indices: int | list = self.data[index_or_slice] + indices: int | list[int] = self.data[index_or_slice] if isinstance(index_or_slice, int): - return to_string(self.parts[indices]) + idx: int = indices + return to_string(self.parts[idx]) else: - return [to_string(self.parts[idx]) for idx in indices] + idxlist: list[int] = indices + return [to_string(self.parts[idx]) for idx in idxlist] 
else: # FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too From 3aa339a84ab4703d72ec90ffdfe04037669f184d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 14:30:53 +0100 Subject: [PATCH 7/9] give up, ignore type --- gguf-py/gguf/gguf_reader.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 897a0a192e35f..266822f253f8d 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -63,14 +63,12 @@ def contents(self, index_or_slice: int | slice = slice(None)) -> Any: sub_type = self.types[-1] if sub_type == GGUFValueType.STRING: - indices: int | list[int] = self.data[index_or_slice] + indices = self.data[index_or_slice] if isinstance(index_or_slice, int): - idx: int = indices - return to_string(self.parts[idx]) + return to_string(self.parts[indices]) # type: ignore else: - idxlist: list[int] = indices - return [to_string(self.parts[idx]) for idx in idxlist] + return [to_string(self.parts[idx]) for idx in indices] # type: ignore else: # FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too From e4727e338660ff6e9ca830f46084a02d0c754cdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 14:35:08 +0100 Subject: [PATCH 8/9] bump version --- gguf-py/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index b4a47333dd15f..d214e8720122b 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.15.0" +version = "0.16.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ From 9ab2f4affc5cd3b7cd95ca6da967ed100107040d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Sun, 16 Feb 2025 18:43:36 +0100 Subject: [PATCH 9/9] apply newbyteorder fixes --- gguf-py/gguf/gguf_reader.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 266822f253f8d..5991cdb76beac 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -144,7 +144,7 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] = # If we get 0 here that means it's (probably) a GGUF file created for # the opposite byte order of the machine this script is running on. self.byte_order = 'S' - temp_version = temp_version.newbyteorder(self.byte_order) + temp_version = temp_version.view(temp_version.dtype.newbyteorder(self.byte_order)) version = temp_version[0] if version not in READER_SUPPORTED_VERSIONS: raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle') @@ -198,9 +198,7 @@ def _get( itemsize = int(np.empty([], dtype = dtype).itemsize) end_offs = offset + itemsize * count arr = self.data[offset:end_offs].view(dtype=dtype)[:count] - if override_order is None: - return arr - return arr.view(arr.dtype.newbyteorder(override_order)) + return arr.view(arr.dtype.newbyteorder(self.byte_order if override_order is None else override_order)) def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: if field.name in self.fields:
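
A minimal usage sketch of the API the patches above introduce (ReaderField.contents() and GGUFReader.endianess); the model path is a placeholder and the two metadata keys are only typical examples, not guaranteed to be present in every file:

# Usage sketch for the new reader API; "model.gguf" is a placeholder path.
import gguf

reader = gguf.GGUFReader("model.gguf")

# endianess is a GGUFEndian value describing the file's byte order,
# already resolved for byte-swapped ("S") files on the current host.
print(f"file byte order: {reader.endianess.name}")

# contents() decodes a field's value directly, replacing the manual
# parts/data indexing previously duplicated across the scripts.
arch = reader.get_field(gguf.Keys.General.ARCHITECTURE)
if arch is not None:
    print(arch.contents())  # decoded UTF-8 string

# For array fields, contents() also accepts an int index or a slice.
tokens = reader.get_field(gguf.Keys.Tokenizer.LIST)
if tokens is not None:
    print(tokens.contents(0))         # first token only
    print(tokens.contents(slice(5)))  # first five tokens as a list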