88import os
99import sys
1010from pathlib import Path
11- from typing import Any
11+ #from typing import Any
12+ from typing import Any , Literal , NamedTuple , TypeVar , Union
1213
1314import numpy as np
15+ import numpy .typing as npt
1416
1517# Necessary to load the local gguf package
1618if "NO_LOCAL_GGUF" not in os .environ and (Path (__file__ ).parent .parent .parent / 'gguf-py' ).exists ():
1719 sys .path .insert (0 , str (Path (__file__ ).parent .parent ))
1820
19- from gguf import GGUFReader , GGUFWriter , ReaderField , GGUFEndian , GGUFValueType , Keys # noqa: E402
21+ from gguf import GGUFReader , GGUFWriter , ReaderField , GGMLQuantizationType , GGUFEndian , GGUFValueType , Keys # noqa: E402
2022
2123logger = logging .getLogger ("gguf-addfile" )
2224
@@ -54,17 +56,11 @@ def decode_field(field: ReaderField) -> Any:
5456 sub_type = field .types [- 1 ]
5557
5658 if sub_type == GGUFValueType .STRING :
57- if not field .name [0 ] == Keys .General .FILE_MARK :
58- return [str (bytes (field .parts [idx ]), encoding = 'utf8' ) for idx in field .data ]
59- else :
60- return [bytes (field .parts [idx ]) for idx in field .data ]
59+ return [str (bytes (field .parts [idx ]), encoding = 'utf8' ) for idx in field .data ]
6160 else :
6261 return [pv for idx in field .data for pv in field .parts [idx ].tolist ()]
6362 if main_type == GGUFValueType .STRING :
64- if not field .name [0 ] == Keys .General .FILE_MARK :
65- return str (bytes (field .parts [- 1 ]), encoding = 'utf8' )
66- else :
67- return bytes (field .parts [- 1 ])
63+ return str (bytes (field .parts [- 1 ]), encoding = 'utf8' )
6864 else :
6965 return field .parts [- 1 ][0 ]
7066
@@ -77,7 +73,7 @@ def get_field_data(reader: GGUFReader, key: str) -> Any:
7773 return decode_field (field )
7874
7975
80- def copy_with_new_metadata (reader : gguf .GGUFReader , writer : gguf .GGUFWriter , new_metadata : Mapping [str , str ]) -> None :
76+ def copy_with_filename (reader : gguf .GGUFReader , writer : gguf .GGUFWriter , new_metadata : Mapping [str , str ], filename : str [ Any ]) -> None :
8177 for field in reader .fields .values ():
8278 # Suppress virtual fields and fields written by GGUFWriter
8379 if field .name == Keys .General .ARCHITECTURE or field .name .startswith ('GGUF.' ):
@@ -107,24 +103,49 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
107103 writer .add_chat_template (new_metadata [Keys .Tokenizer .CHAT_TEMPLATE ])
108104 del new_metadata [Keys .Tokenizer .CHAT_TEMPLATE ]
109105
110- for key , name in new_metadata .items ():
111- logger .debug (f'Adding { key } : { name } ' )
112- with open (name , "rb" ) as f :
113- val = f .read ()
114- writer .add_object (key , val )
106+ # add filenames to kv
107+ writer .add_array (Keys .EMBEDDED_FILES , filename )
115108
116109 for tensor in reader .tensors :
117110 # Dimensions are written in reverse order, so flip them first
118111 shape = np .flipud (tensor .shape )
119112 writer .add_tensor_info (tensor .name , shape , tensor .data .dtype , tensor .data .nbytes , tensor .tensor_type )
120113
114+ offset_next = 0
115+ len_last = 0
116+ offset_last = 0
117+ for n , tensor in enumerate (reader .tensors , 1 ):
118+ len_last = tensor .n_bytes
119+ offset_last = tensor .data_offset
120+ offset_next = max (offset_next , writer .ggml_pad (offset_last + int (len_last ), writer .data_alignment ))
121+
122+ offs = offset_next
123+ # add file info as tensor_info
124+ for path in filename :
125+ logger .debug (f'Adding { path } ' )
126+ with open (path , "rb" ) as f :
127+ data = f .read ()
128+ data_len = len (data )
129+ dims = [data_len ]
130+ raw_dtype = GGMLQuantizationType .I8
131+ writer .add_tensor_info (path , dims , np .float16 , data_len , raw_dtype )
132+
121133 writer .write_header_to_file ()
122134 writer .write_kv_data_to_file ()
123135 writer .write_ti_data_to_file ()
124136
125137 for tensor in reader .tensors :
126138 writer .write_tensor_data (tensor .data )
127139
140+ # write file body as tensor data
141+ for path in filename :
142+ logger .debug (f'Adding { path } ' )
143+ with open (path , "rb" ) as f :
144+ data = f .read ()
145+ data_len = len (data )
146+ # write data with padding
147+ writer .write_data (data )
148+
128149 writer .close ()
129150
130151
@@ -152,12 +173,12 @@ def main() -> None:
152173
153174 logger .info (f'* Adding: { args .addfiles } ' )
154175 new_metadata = {}
176+ filename = []
155177 for path in args .addfiles :
156- # add FILE_MARK to key
157- key = Keys .General .FILE_MARK + path
158- new_metadata [key ] = path
159- logger .info (f'* Adding: { key } = { path } ' )
160- copy_with_new_metadata (reader , writer , new_metadata )
178+ filename .append (path )
179+ logger .info (f'* Adding: { path } ' )
180+ #new_metadata[Keys.EMBEDDED_FILES] = path
181+ copy_with_filename (reader , writer , new_metadata , filename )
161182
162183
163184if __name__ == '__main__' :
0 commit comments