
Commit dea956e

gguf : add findNearestQuantType (#1421)
In this PR:
- Move `GGMLFileQuantizationType` to `tasks`
- Update the list of `GGMLFileQuantizationType` (NOTE: a **File** can contain multiple quants; for example, a Q4_K_M **File** is Q4_K + Q6_K)
- For `GGMLQuantizationType`, add the TQ1_0 and TQ2_0 ternary quants
- Add `findNearestQuantType` (see below)

## findNearestQuantType

This function is useful for the `/v2` registry endpoint with **text + vision models**, when we want to pick the corresponding vision model to pair with a text model. The main issue is that text models can go lower than Q4 (Q3/Q2/Q1), but vision models cannot, as they are quite sensitive to quantization. On @bartowski1182's repos, most vision models only have BF16, F16, and maybe Q8_0 versions. The idea is:
- If the user picks a BF16/F16/Q8_0 text model, we pair it with the corresponding BF16/F16/Q8_0 vision model
- If the user picks something else, like Q4_K_M, we find the nearest quant to pair with; it's Q8_0 in this case
1 parent 16fa28c commit dea956e
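
For illustration, here is a minimal sketch of the pairing flow described above, based on the exports added in this PR (the variable names are illustrative; the expected result matches the spec test below):

```ts
import { findNearestQuantType, GGMLFileQuantizationType } from "@huggingface/tasks";

// Quants typically published for the vision model (per the PR description):
const visionQuants = [
	GGMLFileQuantizationType.BF16,
	GGMLFileQuantizationType.F16,
	GGMLFileQuantizationType.Q8_0,
];

// The user picked a Q4_K_M text model; vision models don't go that low,
// so the nearest available quant is picked instead.
const paired = findNearestQuantType(GGMLFileQuantizationType.Q4_K_M, visionQuants);
console.log(paired); // GGMLFileQuantizationType.Q8_0
```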

File tree

5 files changed: +216 -58 lines changed

packages/gguf/src/gguf.spec.ts

Lines changed: 42 additions & 5 deletions
@@ -7,6 +7,8 @@ import {
 	ggufAllShards,
 	parseGgufShardFilename,
 	parseGGUFQuantLabel,
+	GGUF_QUANT_ORDER,
+	findNearestQuantType,
 } from "./gguf";
 import fs from "node:fs";

@@ -46,7 +48,7 @@ describe("gguf", () => {
 		tensor_count: 291n,
 		kv_count: 19n,
 		"general.architecture": "llama",
-		"general.file_type": GGMLFileQuantizationType.MOSTLY_Q2_K,
+		"general.file_type": GGMLFileQuantizationType.Q2_K,
 		"general.name": "LLaMA v2",
 		"general.quantization_version": 2,
 		"llama.attention.head_count": 32,

@@ -105,7 +107,7 @@ describe("gguf", () => {
 		tensor_count: 291n,
 		kv_count: 24n,
 		"general.architecture": "llama",
-		"general.file_type": GGMLFileQuantizationType.MOSTLY_Q5_K_M,
+		"general.file_type": GGMLFileQuantizationType.Q5_K_M,
 		"general.name": "mistralai_mistral-7b-instruct-v0.2",
 		"general.quantization_version": 2,
 		"llama.attention.head_count": 32,

@@ -143,7 +145,7 @@ describe("gguf", () => {
 		tensor_count: 164n,
 		kv_count: 21n,
 		"general.architecture": "gemma",
-		"general.file_type": GGMLFileQuantizationType.MOSTLY_Q4_K_M,
+		"general.file_type": GGMLFileQuantizationType.Q4_K_M,
 		"general.name": "gemma-2b-it",
 		"general.quantization_version": 2,
 		"gemma.attention.head_count": 8,

@@ -180,7 +182,7 @@ describe("gguf", () => {
 		tensor_count: 197n,
 		kv_count: 23n,
 		"general.architecture": "bert",
-		"general.file_type": GGMLFileQuantizationType.MOSTLY_F16,
+		"general.file_type": GGMLFileQuantizationType.F16,
 		"general.name": "bge-small-en-v1.5",
 		"bert.attention.causal": false,
 		"bert.attention.head_count": 12,

@@ -280,12 +282,47 @@ describe("gguf", () => {
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q2_K.gguf")).toEqual("Q2_K");
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1.gguf")).toEqual(undefined);
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-F32-Q2_K.gguf")).toEqual("Q2_K"); // gguf name with two quant labels [F32, Q2_K]
-		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf")).toEqual(undefined); // TODO: investigate IQ3_XS
+		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf")).toEqual("IQ3_XS");
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q4_0_4_4.gguf")).toEqual("Q4_0"); // TODO: investigate Q4_0_4_4
 	});

 	it("calculate tensor data offset", async () => {
 		const { tensorDataOffset } = await gguf(URL_LLAMA);
 		expect(tensorDataOffset).toEqual(741056n);
 	});
+
+	// Quantization handler
+
+	it("should have GGUF_QUANT_ORDER in sync with GGMLQuantizationType enum", () => {
+		const enumValues = Object.values(GGMLQuantizationType).filter((value) => typeof value === "number") as number[];
+		const checkValues = new Set(GGUF_QUANT_ORDER);
+		for (const value of enumValues) {
+			expect(checkValues).toContain(value);
+		}
+	});
+
+	it("should find the nearest quant", () => {
+		const quant = GGMLFileQuantizationType.IQ2_M;
+		const availableQuants = [
+			GGMLFileQuantizationType.Q2_K,
+			GGMLFileQuantizationType.Q4_K_M,
+			GGMLFileQuantizationType.Q8_0,
+		];
+		const nearestQuant = findNearestQuantType(quant, availableQuants);
+		expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q2_K);
+	});
+
+	it("should find the nearest quant (vision model)", () => {
+		const visionQuants = [GGMLFileQuantizationType.Q8_0, GGMLFileQuantizationType.F16, GGMLFileQuantizationType.BF16];
+		let nearestQuant;
+		// text = Q4_K_M
+		nearestQuant = findNearestQuantType(GGMLFileQuantizationType.Q4_K_M, visionQuants);
+		expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q8_0);
+		// text = Q8_0
+		nearestQuant = findNearestQuantType(GGMLFileQuantizationType.Q8_0, visionQuants);
+		expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q8_0);
+		// text = F16
+		nearestQuant = findNearestQuantType(GGMLFileQuantizationType.F16, visionQuants);
+		expect(nearestQuant).toEqual(GGMLFileQuantizationType.F16);
+	});
 });

packages/gguf/src/gguf.ts

Lines changed: 9 additions & 2 deletions
@@ -4,9 +4,16 @@ import { isBackend } from "./utils/isBackend";
 import { promisesQueue } from "./utils/promisesQueue";

 export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
-export { GGUFValueType, GGMLFileQuantizationType, GGMLQuantizationType, Architecture } from "./types";
+export { GGUFValueType, GGMLQuantizationType, Architecture } from "./types";
 export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions";
-export { parseGGUFQuantLabel, GGUF_QUANT_RE, GGUF_QUANT_RE_GLOBAL } from "@huggingface/tasks";
+export {
+	parseGGUFQuantLabel,
+	GGUF_QUANT_RE,
+	GGUF_QUANT_RE_GLOBAL,
+	GGUF_QUANT_ORDER,
+	findNearestQuantType,
+	GGMLFileQuantizationType,
+} from "@huggingface/tasks";

 export const RE_GGUF_FILE = /\.gguf$/;
 export const RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;

packages/gguf/src/quant-descriptions.ts

Lines changed: 10 additions & 2 deletions
@@ -124,6 +124,14 @@ export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string
 		txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
 		src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
 	},
+	[GGMLQuantizationType.TQ1_0]: {
+		txt: "Ternary quantization.",
+		src_url: "https://github.com/ggml-org/llama.cpp/pull/8151",
+	},
+	[GGMLQuantizationType.TQ2_0]: {
+		txt: "Ternary quantization.",
+		src_url: "https://github.com/ggml-org/llama.cpp/pull/8151",
+	},
 };

 const QK_K = 256;

@@ -163,6 +171,6 @@ export const GGML_QUANT_SIZES = {
 	[GGMLQuantizationType.F64]: calcBPW(1, 8),
 	[GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
 	[GGMLQuantizationType.BF16]: calcBPW(1, 2),
-	// [GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
-	// [GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
+	[GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
+	[GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
 };
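
As a sanity check on the two entries enabled above: assuming `calcBPW(blockSize, typeSize)` computes bits per weight as `(typeSize * 8) / blockSize` (consistent with the other entries, e.g. F64 is `calcBPW(1, 8)` = 64 bpw), the ternary types work out to:

```ts
// Hypothetical standalone version of calcBPW, for the arithmetic only;
// the real helper lives in quant-descriptions.ts.
const calcBPW = (blockSize: number, typeSize: number): number => (typeSize * 8) / blockSize;

calcBPW(256, 2 + 4 * 13); // TQ1_0: 54 bytes per 256 weights -> 1.6875 bits/weight
calcBPW(256, 2 + 64);     // TQ2_0: 66 bytes per 256 weights -> 2.0625 bits/weight
```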

packages/gguf/src/types.ts

Lines changed: 1 addition & 40 deletions
@@ -1,52 +1,13 @@
 import type { TransformerLLM } from "./transformer-llm";
 import { LLM_ARCHITECTURES } from "./transformer-llm";
-import type { GGMLQuantizationType } from "@huggingface/tasks";
+import type { GGMLQuantizationType, GGMLFileQuantizationType } from "@huggingface/tasks";
 export { GGMLQuantizationType } from "@huggingface/tasks";

 export type MetadataBaseValue = string | number | bigint | boolean;
 export type MetadataValue = MetadataBaseValue | MetadataBaseValue[] | MetadataValue[]; /// recursive as arrays can be nested.

 export type Version = 1 | 2 | 3;

-export enum GGMLFileQuantizationType {
-	MOSTLY_F32 = 0,
-	MOSTLY_F16 = 1,
-	MOSTLY_Q4_0 = 2,
-	MOSTLY_Q4_1 = 3,
-	MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-	// MOSTLY_Q4_2 = 5, // support has been removed
-	// MOSTLY_Q4_3 = 6, // support has been removed
-	MOSTLY_Q8_0 = 7,
-	MOSTLY_Q5_0 = 8,
-	MOSTLY_Q5_1 = 9,
-	MOSTLY_Q2_K = 10,
-	MOSTLY_Q3_K_S = 11,
-	MOSTLY_Q3_K_M = 12,
-	MOSTLY_Q3_K_L = 13,
-	MOSTLY_Q4_K_S = 14,
-	MOSTLY_Q4_K_M = 15,
-	MOSTLY_Q5_K_S = 16,
-	MOSTLY_Q5_K_M = 17,
-	MOSTLY_Q6_K = 18,
-	MOSTLY_IQ2_XXS = 19,
-	MOSTLY_IQ2_XS = 20,
-	MOSTLY_Q2_K_S = 21,
-	MOSTLY_IQ3_XS = 22,
-	MOSTLY_IQ3_XXS = 23,
-	MOSTLY_IQ1_S = 24,
-	MOSTLY_IQ4_NL = 25,
-	MOSTLY_IQ3_S = 26,
-	MOSTLY_IQ3_M = 27,
-	MOSTLY_IQ2_S = 28,
-	MOSTLY_IQ2_M = 29,
-	MOSTLY_IQ4_XS = 30,
-	MOSTLY_IQ1_M = 31,
-	MOSTLY_BF16 = 32,
-	MOSTLY_Q4_0_4_4 = 33,
-	MOSTLY_Q4_0_4_8 = 34,
-	MOSTLY_Q4_0_8_8 = 35,
-}
-
 export enum GGUFValueType {
 	UINT8 = 0,
 	INT8 = 1,

packages/tasks/src/gguf.ts

Lines changed: 154 additions & 9 deletions
@@ -1,3 +1,155 @@
+// This list is copied from gguf/types.ts, but with all types available (for backward compatibility).
+// NOT to be confused with GGMLQuantizationType: a file quantization can contain multiple GGMLQuantizationType.
+// For example, a Q4_K_M model can contain Q4_K and Q6_K tensors.
+export enum GGMLFileQuantizationType {
+	F32 = 0,
+	F16 = 1,
+	Q4_0 = 2,
+	Q4_1 = 3,
+	Q4_1_SOME_F16 = 4,
+	Q4_2 = 5,
+	Q4_3 = 6,
+	Q8_0 = 7,
+	Q5_0 = 8,
+	Q5_1 = 9,
+	Q2_K = 10,
+	Q3_K_S = 11,
+	Q3_K_M = 12,
+	Q3_K_L = 13,
+	Q4_K_S = 14,
+	Q4_K_M = 15,
+	Q5_K_S = 16,
+	Q5_K_M = 17,
+	Q6_K = 18,
+	IQ2_XXS = 19,
+	IQ2_XS = 20,
+	Q2_K_S = 21,
+	IQ3_XS = 22,
+	IQ3_XXS = 23,
+	IQ1_S = 24,
+	IQ4_NL = 25,
+	IQ3_S = 26,
+	IQ3_M = 27,
+	IQ2_S = 28,
+	IQ2_M = 29,
+	IQ4_XS = 30,
+	IQ1_M = 31,
+	BF16 = 32,
+	Q4_0_4_4 = 33,
+	Q4_0_4_8 = 34,
+	Q4_0_8_8 = 35,
+	TQ1_0 = 36,
+	TQ2_0 = 37,
+}
+
+const ggufQuants = Object.values(GGMLFileQuantizationType).filter((v): v is string => typeof v === "string");
+export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?");
+export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g");
+
+export function parseGGUFQuantLabel(fname: string): string | undefined {
+	const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there are multiple quant substrings in a name, we prefer the last one
+	return quantLabel;
+}
+
+// Order of quantizations, from biggest to smallest.
+// This list must be kept in sync with GGMLFileQuantizationType;
+// the gguf.spec.ts tests verify that the order is correct.
+export const GGUF_QUANT_ORDER: GGMLFileQuantizationType[] = [
+	GGMLFileQuantizationType.F32,
+	GGMLFileQuantizationType.BF16,
+	GGMLFileQuantizationType.F16,
+	GGMLFileQuantizationType.Q8_0,
+
+	// 6-bit quantizations
+	GGMLFileQuantizationType.Q6_K,
+
+	// 5-bit quantizations
+	GGMLFileQuantizationType.Q5_0,
+	GGMLFileQuantizationType.Q5_1,
+	GGMLFileQuantizationType.Q5_K_M,
+	GGMLFileQuantizationType.Q5_K_S,
+
+	// 4-bit quantizations
+	GGMLFileQuantizationType.Q4_K_M,
+	GGMLFileQuantizationType.Q4_K_S,
+	GGMLFileQuantizationType.IQ4_NL,
+	GGMLFileQuantizationType.IQ4_XS,
+	GGMLFileQuantizationType.Q4_0_4_4,
+	GGMLFileQuantizationType.Q4_0_4_8,
+	GGMLFileQuantizationType.Q4_0_8_8,
+	GGMLFileQuantizationType.Q4_0,
+	GGMLFileQuantizationType.Q4_1_SOME_F16,
+	GGMLFileQuantizationType.Q4_1,
+	GGMLFileQuantizationType.Q4_2,
+	GGMLFileQuantizationType.Q4_3,
+
+	// 3-bit quantizations
+	GGMLFileQuantizationType.Q3_K_L,
+	GGMLFileQuantizationType.Q3_K_M,
+	GGMLFileQuantizationType.Q3_K_S,
+	GGMLFileQuantizationType.IQ3_M,
+	GGMLFileQuantizationType.IQ3_S,
+	GGMLFileQuantizationType.IQ3_XS,
+	GGMLFileQuantizationType.IQ3_XXS,
+
+	// 2-bit quantizations
+	GGMLFileQuantizationType.Q2_K,
+	GGMLFileQuantizationType.Q2_K_S,
+	GGMLFileQuantizationType.IQ2_M,
+	GGMLFileQuantizationType.IQ2_S,
+	GGMLFileQuantizationType.IQ2_XS,
+	GGMLFileQuantizationType.IQ2_XXS,
+
+	// 1-bit quantizations
+	GGMLFileQuantizationType.IQ1_S,
+	GGMLFileQuantizationType.IQ1_M,
+	GGMLFileQuantizationType.TQ1_0,
+	GGMLFileQuantizationType.TQ2_0,
+];
+
+// Finds the nearest quantization type that is less than or equal to the given quantization type.
+// Returns undefined if no such quantization type is found.
+export function findNearestQuantType(
+	quant: GGMLFileQuantizationType,
+	availableQuants: GGMLFileQuantizationType[]
+): GGMLFileQuantizationType | undefined {
+	// Create a map for quick index lookup from the defined order
+	const orderMap = new Map<GGMLFileQuantizationType, number>();
+	GGUF_QUANT_ORDER.forEach((q, index) => {
+		orderMap.set(q, index);
+	});
+
+	const targetIndex = orderMap.get(quant) ?? 0; // the 0 case should never happen
+
+	// Filter the available quantizations to include only those defined in the order map,
+	// then sort them according to GGUF_QUANT_ORDER (from largest/index 0 to smallest/highest index).
+	const sortedAvailable = availableQuants
+		.filter((q) => orderMap.has(q))
+		.sort((a, b) => (orderMap.get(a) ?? Infinity) - (orderMap.get(b) ?? Infinity));
+
+	// If no valid quantizations are available after filtering
+	if (sortedAvailable.length === 0) {
+		return undefined;
+	}
+
+	// Iterate through the sorted available quantizations (largest to smallest).
+	// Find the first one whose order index is >= the target index,
+	// i.e. the largest quantization that is smaller than or equal to the target.
+	for (const availableQuant of sortedAvailable) {
+		// We know the key exists due to the filter above.
+		const availableIndex = orderMap.get(availableQuant)!;
+		if (availableIndex >= targetIndex) {
+			return availableQuant;
+		}
+	}
+
+	// If the loop completes, all available quantizations are larger (have a smaller index)
+	// than the target. In this case, return the "smallest" available quantization,
+	// which is the last element in the sorted list (highest index among available).
+	return sortedAvailable[sortedAvailable.length - 1];
+}
+
+// This list is only used to calculate the size of the model, NOT to be confused with the quantization FILE type
 export enum GGMLQuantizationType {
 	F32 = 0,
 	F16 = 1,

@@ -28,13 +180,6 @@ export enum GGMLQuantizationType {
 	F64 = 28,
 	IQ1_M = 29,
 	BF16 = 30,
-}
-
-const ggufQuants = Object.values(GGMLQuantizationType).filter((v): v is string => typeof v === "string");
-export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?");
-export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g");
-
-export function parseGGUFQuantLabel(fname: string): string | undefined {
-	const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there is multiple quant substrings in a name, we prefer the last one
-	return quantLabel;
+	TQ1_0 = 34,
+	TQ2_0 = 35,
 }
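
To summarize the selection logic, here is a short sketch of the possible outcomes, following the implementation and the spec tests above (expected return values shown in comments):

```ts
import { findNearestQuantType, GGMLFileQuantizationType as Q } from "@huggingface/tasks";

// Exact match: the target itself is available.
findNearestQuantType(Q.Q8_0, [Q.Q8_0, Q.F16]); // Q.Q8_0

// Otherwise: the largest available quant at or below the target in GGUF_QUANT_ORDER.
findNearestQuantType(Q.Q4_K_M, [Q.Q8_0, Q.Q2_K]); // Q.Q2_K

// If everything available is larger than the target, fall back to the
// smallest available quant (the text + vision pairing case from the PR description).
findNearestQuantType(Q.IQ1_S, [Q.F16, Q.Q8_0]); // Q.Q8_0

// If no available quant appears in GGUF_QUANT_ORDER, undefined is returned.
findNearestQuantType(Q.Q4_K_M, []); // undefined
```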
