Skip to content

Commit 64387f6

Browse files
gguf-py: byteswapping improvements (ggml-org#12851)
* gguf-py: implement byteswapping for Q4_0. This is needed to byteswap Mistral models. Also restore the original tensor shapes after byteswapping; this is not needed at the moment, but is done in case the shapes are used in the future. * Rework the byteswapping code in gguf-py: move the per-block details out of the tensor byteswapping loop.
1 parent d35a1e8 commit 64387f6

File tree

1 file changed

+67
-63
lines changed

1 file changed

+67
-63
lines changed

gguf-py/gguf/scripts/gguf_convert_endian.py

Lines changed: 67 additions & 63 deletions
Original file line number · Diff line number · Diff line change
@@ -19,6 +19,61 @@
1919
logger = logging.getLogger("gguf-convert-endian")
2020

2121

22+
def byteswap_q4_0(tensor, block_offs):
    """Byte-swap the f16 delta at the start of one Q4_0 block.

    A block_q4_0 (18 bytes) is an f16 scaling factor ("delta") followed by
    16 bytes of packed quantized values; the quant bytes are endian-neutral,
    so only the two delta bytes are swapped in place.
    """
    scale = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
    scale.byteswap(inplace=True)
28+
29+
30+
def byteswap_q8_0(tensor, block_offs):
    """Byte-swap the f16 delta at the start of one Q8_0 block.

    A block_q8_0 (34 bytes) is an f16 scaling factor ("delta") followed by
    32 int8 quantized values; int8 data is endian-neutral, so only the two
    delta bytes are swapped in place.
    """
    scale = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
    scale.byteswap(inplace=True)
36+
37+
38+
def byteswap_q4_k(tensor, block_offs):
    """Byte-swap the two leading f16 fields of one Q4_K block.

    A block_q4_k (144 bytes) starts with two consecutive f16 scaling
    fields, followed by 140 bytes of quantized data; only the two f16
    fields are endian-sensitive.
    """
    for off in (block_offs, block_offs + 2):
        field = tensor.data[off:off + 2].view(dtype=np.uint16)
        field.byteswap(inplace=True)
47+
48+
49+
def byteswap_q6_k(tensor, block_offs):
    """Byte-swap the trailing f16 field of one Q6_K block.

    A block_q6_k (210 bytes) is 208 bytes of quantized data with a single
    f16 scaling factor at the end; only those last two bytes are swapped.
    """
    scale = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
    scale.byteswap(inplace=True)
55+
56+
57+
# Dispatch table for quant types that need per-block fixups: maps each
# GGML quantization type to its block size in bytes and the function
# that byte-swaps the endian-sensitive f16 field(s) of one block.
byteswap_tensors = {
    gguf.GGMLQuantizationType.Q4_0: {"block_size": 18, "byteswap_func": byteswap_q4_0},   # 18 = f16 delta + 16 quant bytes
    gguf.GGMLQuantizationType.Q8_0: {"block_size": 34, "byteswap_func": byteswap_q8_0},   # 34 = f16 delta + 32 int8 quants
    gguf.GGMLQuantizationType.Q4_K: {"block_size": 144, "byteswap_func": byteswap_q4_k},  # 144 = 2 f16 fields + 140 quant bytes
    gguf.GGMLQuantizationType.Q6_K: {"block_size": 210, "byteswap_func": byteswap_q6_k},  # 210 = 208 quant bytes + trailing f16
}
75+
76+
2277
def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
2378
file_endian = reader.endianess.name
2479
if reader.byte_order == 'S':
@@ -32,13 +87,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
3287
sys.exit(0)
3388
logger.info("* Checking tensors for conversion compatibility")
3489
for tensor in reader.tensors:
35-
if tensor.tensor_type not in (
36-
gguf.GGMLQuantizationType.F32,
37-
gguf.GGMLQuantizationType.F16,
38-
gguf.GGMLQuantizationType.Q8_0,
39-
gguf.GGMLQuantizationType.Q4_K,
40-
gguf.GGMLQuantizationType.Q6_K,
41-
):
90+
if tensor.tensor_type not in byteswap_tensors and \
91+
tensor.tensor_type not in (
92+
gguf.GGMLQuantizationType.F32,
93+
gguf.GGMLQuantizationType.F16,
94+
):
4295
raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
4396
logger.info(f"* Preparing to convert from {file_endian} to {order}")
4497
if args.dry_run:
@@ -72,78 +125,29 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
72125
part.byteswap(inplace=True)
73126

74127
# Byte-swap tensor data if necessary
75-
if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
76-
# Handle Q8_0 tensor blocks (block_q8_0)
77-
# Specific handling of block_q8_0 is required.
78-
# Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
79-
80-
block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
81-
82-
n_blocks = len(tensor.data) // block_size
83-
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
84-
block_offs = block_num * block_size
85-
86-
# Byte-Swap f16 sized delta field
87-
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
88-
delta.byteswap(inplace=True)
89-
90-
# Byte-Swap Q8 weights
91-
if block_num % 100000 == 0:
92-
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
93-
94-
elif tensor.tensor_type == gguf.GGMLQuantizationType.Q4_K:
95-
# Handle Q4_K tensor blocks (block_q4_k)
96-
# Specific handling of block_q4_k is required.
97-
# Each block_q4_k consists of 2 f16 values followed by 140 int8 values.
98-
128+
if tensor.tensor_type in byteswap_tensors:
99129
# first flatten structure
130+
oldshape = tensor.data.shape
100131
newshape = 1
101132
for i in tensor.data.shape:
102133
newshape *= i
103134

104135
tensor.data.resize(newshape)
105136

106-
block_size = 144
107-
n_blocks = len(tensor.data) // block_size
108-
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
109-
block_offs = block_num * block_size
110-
111-
# Byte-Swap f16 sized fields
112-
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
113-
delta.byteswap(inplace=True)
114-
115-
delta = tensor.data[block_offs + 2:block_offs + 4].view(dtype=np.uint16)
116-
delta.byteswap(inplace=True)
117-
118-
# Byte-Swap
119-
if block_num % 100000 == 0:
120-
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
121-
122-
elif tensor.tensor_type == gguf.GGMLQuantizationType.Q6_K:
123-
# Handle Q6_K tensor blocks (block_q6_k)
124-
# Specific handling of block_q6_k is required.
125-
# Each block_q6_k consists of 208 int8 values followed by 1 f16 value.
126-
127-
# first flatten structure
128-
newshape = 1
129-
for i in tensor.data.shape:
130-
newshape *= i
131-
132-
tensor.data.resize(newshape)
137+
block_size = byteswap_tensors[tensor.tensor_type]["block_size"]
138+
byteswap_func = byteswap_tensors[tensor.tensor_type]["byteswap_func"]
133139

134-
block_size = 210
135140
n_blocks = len(tensor.data) // block_size
136141
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
137142
block_offs = block_num * block_size
138143

139-
# Byte-Swap f16 sized field
140-
delta = tensor.data[block_offs + 208:block_offs + 210].view(dtype=np.uint16)
141-
delta.byteswap(inplace=True)
144+
byteswap_func(tensor, block_offs)
142145

143-
# Byte-Swap
144146
if block_num % 100000 == 0:
145147
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
146148

149+
# restore old shape in case it's ever used
150+
tensor.data.resize(oldshape)
147151
else:
148152
# Handle other tensor types
149153
tensor.data.byteswap(inplace=True)

0 commit comments

Comments
 (0)