Skip to content

Commit bcf4ec8

Browse files
author
katsu560
committed
add files to kv and tensor data
1 parent aaa93bc commit bcf4ec8

File tree

1 file changed

+42
-21
lines changed

1 file changed

+42
-21
lines changed

examples/yolo/gguf-addfile.py

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,17 @@
88
import os
99
import sys
1010
from pathlib import Path
11-
from typing import Any
11+
#from typing import Any
12+
from typing import Any, Literal, NamedTuple, TypeVar, Union
1213

1314
import numpy as np
15+
import numpy.typing as npt
1416

1517
# Necessary to load the local gguf package
1618
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
1719
sys.path.insert(0, str(Path(__file__).parent.parent))
1820

19-
from gguf import GGUFReader, GGUFWriter, ReaderField, GGUFEndian, GGUFValueType, Keys # noqa: E402
21+
from gguf import GGUFReader, GGUFWriter, ReaderField, GGMLQuantizationType, GGUFEndian, GGUFValueType, Keys # noqa: E402
2022

2123
logger = logging.getLogger("gguf-addfile")
2224

@@ -54,17 +56,11 @@ def decode_field(field: ReaderField) -> Any:
5456
sub_type = field.types[-1]
5557

5658
if sub_type == GGUFValueType.STRING:
57-
if not field.name[0] == Keys.General.FILE_MARK:
58-
return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
59-
else:
60-
return [bytes(field.parts[idx]) for idx in field.data]
59+
return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
6160
else:
6261
return [pv for idx in field.data for pv in field.parts[idx].tolist()]
6362
if main_type == GGUFValueType.STRING:
64-
if not field.name[0] == Keys.General.FILE_MARK:
65-
return str(bytes(field.parts[-1]), encoding='utf8')
66-
else:
67-
return bytes(field.parts[-1])
63+
return str(bytes(field.parts[-1]), encoding='utf8')
6864
else:
6965
return field.parts[-1][0]
7066

@@ -77,7 +73,7 @@ def get_field_data(reader: GGUFReader, key: str) -> Any:
7773
return decode_field(field)
7874

7975

80-
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str]) -> None:
76+
def copy_with_filename(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str], filename: str[Any]) -> None:
8177
for field in reader.fields.values():
8278
# Suppress virtual fields and fields written by GGUFWriter
8379
if field.name == Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@@ -107,24 +103,49 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
107103
writer.add_chat_template(new_metadata[Keys.Tokenizer.CHAT_TEMPLATE])
108104
del new_metadata[Keys.Tokenizer.CHAT_TEMPLATE]
109105

110-
for key, name in new_metadata.items():
111-
logger.debug(f'Adding {key}: {name}')
112-
with open(name, "rb") as f:
113-
val = f.read()
114-
writer.add_object(key, val)
106+
# add filenames to kv
107+
writer.add_array(Keys.EMBEDDED_FILES, filename)
115108

116109
for tensor in reader.tensors:
117110
# Dimensions are written in reverse order, so flip them first
118111
shape = np.flipud(tensor.shape)
119112
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
120113

114+
offset_next = 0
115+
len_last = 0
116+
offset_last = 0
117+
for n, tensor in enumerate(reader.tensors, 1):
118+
len_last = tensor.n_bytes
119+
offset_last = tensor.data_offset
120+
offset_next = max(offset_next, writer.ggml_pad(offset_last + int(len_last), writer.data_alignment))
121+
122+
offs = offset_next
123+
# add file info as tensor_info
124+
for path in filename:
125+
logger.debug(f'Adding {path}')
126+
with open(path, "rb") as f:
127+
data = f.read()
128+
data_len = len(data)
129+
dims = [data_len]
130+
raw_dtype = GGMLQuantizationType.I8
131+
writer.add_tensor_info(path, dims, np.float16, data_len, raw_dtype)
132+
121133
writer.write_header_to_file()
122134
writer.write_kv_data_to_file()
123135
writer.write_ti_data_to_file()
124136

125137
for tensor in reader.tensors:
126138
writer.write_tensor_data(tensor.data)
127139

140+
# write file body as tensor data
141+
for path in filename:
142+
logger.debug(f'Adding {path}')
143+
with open(path, "rb") as f:
144+
data = f.read()
145+
data_len = len(data)
146+
# write data with padding
147+
writer.write_data(data)
148+
128149
writer.close()
129150

130151

@@ -152,12 +173,12 @@ def main() -> None:
152173

153174
logger.info(f'* Adding: {args.addfiles}')
154175
new_metadata = {}
176+
filename = []
155177
for path in args.addfiles:
156-
# add FILE_MARK to key
157-
key = Keys.General.FILE_MARK + path
158-
new_metadata[key] = path
159-
logger.info(f'* Adding: {key} = {path}')
160-
copy_with_new_metadata(reader, writer, new_metadata)
178+
filename.append(path)
179+
logger.info(f'* Adding: {path}')
180+
#new_metadata[Keys.EMBEDDED_FILES] = path
181+
copy_with_filename(reader, writer, new_metadata, filename)
161182

162183

163184
if __name__ == '__main__':

0 commit comments

Comments (0)