
Commit dea956e

gguf : add findNearestQuantType (#1421)
In this PR:
- Move `GGMLFileQuantizationType` to `tasks`
- Update the list of `GGMLFileQuantizationType` (NOTE: a **File** can contain multiple quants; for example, a Q4_K_M **File** is Q4_K + Q6_K)
- For `GGMLQuantizationType`, add the TQ1_0 and TQ2_0 ternary quants
- Add `findNearestQuantType` (see below)

## findNearestQuantType

This function is useful for the `/v2` registry endpoint with **text + vision models**, when we want to pick the corresponding vision model to pair with a text model. The main issue is that text models can go lower than Q4 (Q3/Q2/Q1), but vision models cannot, as they are quite sensitive to quantization. On @bartowski1182's repos, most vision models only have BF16, F16, and maybe Q8_0 versions. The idea is:
- If the user picks a BF16/F16/Q8_0 text model, we pair it with the corresponding BF16/F16/Q8_0 vision model
- If the user picks something else, like Q4_K_M, we find the nearest quant to pair with; it's Q8_0 in this case
1 parent 16fa28c commit dea956e
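
For illustration, here is a minimal sketch of the pairing flow described above, based on the exports added in this PR (the variable names are illustrative; the expected result matches the spec test below):

```ts
import { findNearestQuantType, GGMLFileQuantizationType } from "@huggingface/tasks";

// Quants typically published for the vision model (per the PR description):
const visionQuants = [
	GGMLFileQuantizationType.BF16,
	GGMLFileQuantizationType.F16,
	GGMLFileQuantizationType.Q8_0,
];

// The user picked a Q4_K_M text model; vision models don't go that low,
// so the nearest available quant is picked instead.
const paired = findNearestQuantType(GGMLFileQuantizationType.Q4_K_M, visionQuants);
console.log(paired); // GGMLFileQuantizationType.Q8_0
```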

File tree

5 files changed: +216 -58 lines changed

packages/gguf/src/gguf.spec.ts

Lines changed: 42 additions & 5 deletions
@@ -7,6 +7,8 @@ import {
 	ggufAllShards,
 	parseGgufShardFilename,
 	parseGGUFQuantLabel,
+	GGUF_QUANT_ORDER,
+	findNearestQuantType,
 } from "./gguf";
 import fs from "node:fs";

@@ -46,7 +48,7 @@ describe("gguf", () => {
 		tensor_count: 291n,
 		kv_count: 19n,
 		"general.architecture": "llama",
-		"general.file_type": GGMLFileQuantizationType.MOSTLY_Q2_K,
+		"general.file_type": GGMLFileQuantizationType.Q2_K,
 		"general.name": "LLaMA v2",
 		"general.quantization_version": 2,
 		"llama.attention.head_count": 32,

@@ -105,7 +107,7 @@ describe("gguf", () => {
 		tensor_count: 291n,
 		kv_count: 24n,
 		"general.architecture": "llama",
-		"general.file_type": GGMLFileQuantizationType.MOSTLY_Q5_K_M,
+		"general.file_type": GGMLFileQuantizationType.Q5_K_M,
 		"general.name": "mistralai_mistral-7b-instruct-v0.2",
 		"general.quantization_version": 2,
 		"llama.attention.head_count": 32,

@@ -143,7 +145,7 @@ describe("gguf", () => {
 		tensor_count: 164n,
 		kv_count: 21n,
 		"general.architecture": "gemma",
-		"general.file_type": GGMLFileQuantizationType.MOSTLY_Q4_K_M,
+		"general.file_type": GGMLFileQuantizationType.Q4_K_M,
 		"general.name": "gemma-2b-it",
 		"general.quantization_version": 2,
 		"gemma.attention.head_count": 8,

@@ -180,7 +182,7 @@ describe("gguf", () => {
 		tensor_count: 197n,
 		kv_count: 23n,
 		"general.architecture": "bert",
-		"general.file_type": GGMLFileQuantizationType.MOSTLY_F16,
+		"general.file_type": GGMLFileQuantizationType.F16,
 		"general.name": "bge-small-en-v1.5",
 		"bert.attention.causal": false,
 		"bert.attention.head_count": 12,

@@ -280,12 +282,47 @@ describe("gguf", () => {
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q2_K.gguf")).toEqual("Q2_K");
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1.gguf")).toEqual(undefined);
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-F32-Q2_K.gguf")).toEqual("Q2_K"); // gguf name with two quant labels [F32, Q2_K]
-		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf")).toEqual(undefined); // TODO: investigate IQ3_XS
+		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-IQ3_XS.gguf")).toEqual("IQ3_XS");
 		expect(parseGGUFQuantLabel("Codestral-22B-v0.1-Q4_0_4_4.gguf")).toEqual("Q4_0"); // TODO: investigate Q4_0_4_4
 	});

 	it("calculate tensor data offset", async () => {
 		const { tensorDataOffset } = await gguf(URL_LLAMA);
 		expect(tensorDataOffset).toEqual(741056n);
 	});
+
+	// Quantization handler
+
+	it("should have GGUF_QUANT_ORDER in sync with GGMLQuantizationType enum", () => {
+		const enumValues = Object.values(GGMLQuantizationType).filter((value) => typeof value === "number") as number[];
+		const checkValues = new Set(GGUF_QUANT_ORDER);
+		for (const value of enumValues) {
+			expect(checkValues).toContain(value);
+		}
+	});
+
+	it("should find the nearest quant", () => {
+		const quant = GGMLFileQuantizationType.IQ2_M;
+		const availableQuants = [
+			GGMLFileQuantizationType.Q2_K,
+			GGMLFileQuantizationType.Q4_K_M,
+			GGMLFileQuantizationType.Q8_0,
+		];
+		const nearestQuant = findNearestQuantType(quant, availableQuants);
+		expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q2_K);
+	});
+
+	it("should find the nearest quant (vision model)", () => {
+		const visionQuants = [GGMLFileQuantizationType.Q8_0, GGMLFileQuantizationType.F16, GGMLFileQuantizationType.BF16];
+		let nearestQuant;
+		// text = Q4_K_M
+		nearestQuant = findNearestQuantType(GGMLFileQuantizationType.Q4_K_M, visionQuants);
+		expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q8_0);
+		// text = Q8_0
+		nearestQuant = findNearestQuantType(GGMLFileQuantizationType.Q8_0, visionQuants);
+		expect(nearestQuant).toEqual(GGMLFileQuantizationType.Q8_0);
+		// text = F16
+		nearestQuant = findNearestQuantType(GGMLFileQuantizationType.F16, visionQuants);
+		expect(nearestQuant).toEqual(GGMLFileQuantizationType.F16);
+	});
 });

packages/gguf/src/gguf.ts

Lines changed: 9 additions & 2 deletions
@@ -4,9 +4,16 @@ import { isBackend } from "./utils/isBackend";
 import { promisesQueue } from "./utils/promisesQueue";

 export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
-export { GGUFValueType, GGMLFileQuantizationType, GGMLQuantizationType, Architecture } from "./types";
+export { GGUFValueType, GGMLQuantizationType, Architecture } from "./types";
 export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions";
-export { parseGGUFQuantLabel, GGUF_QUANT_RE, GGUF_QUANT_RE_GLOBAL } from "@huggingface/tasks";
+export {
+	parseGGUFQuantLabel,
+	GGUF_QUANT_RE,
+	GGUF_QUANT_RE_GLOBAL,
+	GGUF_QUANT_ORDER,
+	findNearestQuantType,
+	GGMLFileQuantizationType,
+} from "@huggingface/tasks";

 export const RE_GGUF_FILE = /\.gguf$/;
 export const RE_GGUF_SHARD_FILE = /^(?<prefix>.*?)-(?<shard>\d{5})-of-(?<total>\d{5})\.gguf$/;

packages/gguf/src/quant-descriptions.ts

Lines changed: 10 additions & 2 deletions
@@ -124,6 +124,14 @@ export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string
 		txt: "16-bit shortened version of the 32-bit IEEE 754 single-precision floating-point number.",
 		src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
 	},
+	[GGMLQuantizationType.TQ1_0]: {
+		txt: "Ternary quantization.",
+		src_url: "https://github.com/ggml-org/llama.cpp/pull/8151",
+	},
+	[GGMLQuantizationType.TQ2_0]: {
+		txt: "Ternary quantization.",
+		src_url: "https://github.com/ggml-org/llama.cpp/pull/8151",
+	},
 };

 const QK_K = 256;

@@ -163,6 +171,6 @@ export const GGML_QUANT_SIZES = {
 	[GGMLQuantizationType.F64]: calcBPW(1, 8),
 	[GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
 	[GGMLQuantizationType.BF16]: calcBPW(1, 2),
-	// [GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
-	// [GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
+	[GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
+	[GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
 };
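
As a sanity check on the two entries enabled above: assuming `calcBPW(blockSize, typeSize)` computes bits per weight as `(typeSize * 8) / blockSize` (consistent with the other entries, e.g. F64 is `calcBPW(1, 8)` = 64 bpw), the ternary types work out to:

```ts
// Hypothetical standalone version of calcBPW, for the arithmetic only;
// the real helper lives in quant-descriptions.ts.
const calcBPW = (blockSize: number, typeSize: number): number => (typeSize * 8) / blockSize;

calcBPW(256, 2 + 4 * 13); // TQ1_0: 54 bytes per 256 weights -> 1.6875 bits/weight
calcBPW(256, 2 + 64);     // TQ2_0: 66 bytes per 256 weights -> 2.0625 bits/weight
```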

packages/gguf/src/types.ts

Lines changed: 1 addition & 40 deletions
@@ -1,52 +1,13 @@
 import type { TransformerLLM } from "./transformer-llm";
 import { LLM_ARCHITECTURES } from "./transformer-llm";
-import type { GGMLQuantizationType } from "@huggingface/tasks";
+import type { GGMLQuantizationType, GGMLFileQuantizationType } from "@huggingface/tasks";
 export { GGMLQuantizationType } from "@huggingface/tasks";

 export type MetadataBaseValue = string | number | bigint | boolean;
 export type MetadataValue = MetadataBaseValue | MetadataBaseValue[] | MetadataValue[]; /// recursive as arrays can be nested.

 export type Version = 1 | 2 | 3;

-export enum GGMLFileQuantizationType {
-	MOSTLY_F32 = 0,
-	MOSTLY_F16 = 1,
-	MOSTLY_Q4_0 = 2,
-	MOSTLY_Q4_1 = 3,
-	MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-	// MOSTLY_Q4_2 = 5, // support has been removed
-	// MOSTLY_Q4_3 = 6, // support has been removed
-	MOSTLY_Q8_0 = 7,
-	MOSTLY_Q5_0 = 8,
-	MOSTLY_Q5_1 = 9,
-	MOSTLY_Q2_K = 10,
-	MOSTLY_Q3_K_S = 11,
-	MOSTLY_Q3_K_M = 12,
-	MOSTLY_Q3_K_L = 13,
-	MOSTLY_Q4_K_S = 14,
-	MOSTLY_Q4_K_M = 15,
-	MOSTLY_Q5_K_S = 16,
-	MOSTLY_Q5_K_M = 17,
-	MOSTLY_Q6_K = 18,
-	MOSTLY_IQ2_XXS = 19,
-	MOSTLY_IQ2_XS = 20,
-	MOSTLY_Q2_K_S = 21,
-	MOSTLY_IQ3_XS = 22,
-	MOSTLY_IQ3_XXS = 23,
-	MOSTLY_IQ1_S = 24,
-	MOSTLY_IQ4_NL = 25,
-	MOSTLY_IQ3_S = 26,
-	MOSTLY_IQ3_M = 27,
-	MOSTLY_IQ2_S = 28,
-	MOSTLY_IQ2_M = 29,
-	MOSTLY_IQ4_XS = 30,
-	MOSTLY_IQ1_M = 31,
-	MOSTLY_BF16 = 32,
-	MOSTLY_Q4_0_4_4 = 33,
-	MOSTLY_Q4_0_4_8 = 34,
-	MOSTLY_Q4_0_8_8 = 35,
-}
-
 export enum GGUFValueType {
 	UINT8 = 0,
 	INT8 = 1,

packages/tasks/src/gguf.ts

Lines changed: 154 additions & 9 deletions
@@ -1,3 +1,155 @@
+// This list is copied from gguf/types.ts, but with all types available (for backward compatibility).
+// NOT to be confused with GGMLQuantizationType: a file quantization can contain multiple GGMLQuantizationType.
+// For example, a Q4_K_M model can contain Q4_K and Q6_K tensors.
+export enum GGMLFileQuantizationType {
+	F32 = 0,
+	F16 = 1,
+	Q4_0 = 2,
+	Q4_1 = 3,
+	Q4_1_SOME_F16 = 4,
+	Q4_2 = 5,
+	Q4_3 = 6,
+	Q8_0 = 7,
+	Q5_0 = 8,
+	Q5_1 = 9,
+	Q2_K = 10,
+	Q3_K_S = 11,
+	Q3_K_M = 12,
+	Q3_K_L = 13,
+	Q4_K_S = 14,
+	Q4_K_M = 15,
+	Q5_K_S = 16,
+	Q5_K_M = 17,
+	Q6_K = 18,
+	IQ2_XXS = 19,
+	IQ2_XS = 20,
+	Q2_K_S = 21,
+	IQ3_XS = 22,
+	IQ3_XXS = 23,
+	IQ1_S = 24,
+	IQ4_NL = 25,
+	IQ3_S = 26,
+	IQ3_M = 27,
+	IQ2_S = 28,
+	IQ2_M = 29,
+	IQ4_XS = 30,
+	IQ1_M = 31,
+	BF16 = 32,
+	Q4_0_4_4 = 33,
+	Q4_0_4_8 = 34,
+	Q4_0_8_8 = 35,
+	TQ1_0 = 36,
+	TQ2_0 = 37,
+}
+
+const ggufQuants = Object.values(GGMLFileQuantizationType).filter((v): v is string => typeof v === "string");
+export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?");
+export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g");
+
+export function parseGGUFQuantLabel(fname: string): string | undefined {
+	const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there are multiple quant substrings in a name, we prefer the last one
+	return quantLabel;
+}
+
+// Order of quantizations, from biggest to smallest.
+// This list must be kept in sync with GGMLFileQuantizationType;
+// the gguf.spec.ts tests verify that the order is correct.
+export const GGUF_QUANT_ORDER: GGMLFileQuantizationType[] = [
+	GGMLFileQuantizationType.F32,
+	GGMLFileQuantizationType.BF16,
+	GGMLFileQuantizationType.F16,
+	GGMLFileQuantizationType.Q8_0,
+
+	// 6-bit quantizations
+	GGMLFileQuantizationType.Q6_K,
+
+	// 5-bit quantizations
+	GGMLFileQuantizationType.Q5_0,
+	GGMLFileQuantizationType.Q5_1,
+	GGMLFileQuantizationType.Q5_K_M,
+	GGMLFileQuantizationType.Q5_K_S,
+
+	// 4-bit quantizations
+	GGMLFileQuantizationType.Q4_K_M,
+	GGMLFileQuantizationType.Q4_K_S,
+	GGMLFileQuantizationType.IQ4_NL,
+	GGMLFileQuantizationType.IQ4_XS,
+	GGMLFileQuantizationType.Q4_0_4_4,
+	GGMLFileQuantizationType.Q4_0_4_8,
+	GGMLFileQuantizationType.Q4_0_8_8,
+	GGMLFileQuantizationType.Q4_0,
+	GGMLFileQuantizationType.Q4_1_SOME_F16,
+	GGMLFileQuantizationType.Q4_1,
+	GGMLFileQuantizationType.Q4_2,
+	GGMLFileQuantizationType.Q4_3,
+
+	// 3-bit quantizations
+	GGMLFileQuantizationType.Q3_K_L,
+	GGMLFileQuantizationType.Q3_K_M,
+	GGMLFileQuantizationType.Q3_K_S,
+	GGMLFileQuantizationType.IQ3_M,
+	GGMLFileQuantizationType.IQ3_S,
+	GGMLFileQuantizationType.IQ3_XS,
+	GGMLFileQuantizationType.IQ3_XXS,
+
+	// 2-bit quantizations
+	GGMLFileQuantizationType.Q2_K,
+	GGMLFileQuantizationType.Q2_K_S,
+	GGMLFileQuantizationType.IQ2_M,
+	GGMLFileQuantizationType.IQ2_S,
+	GGMLFileQuantizationType.IQ2_XS,
+	GGMLFileQuantizationType.IQ2_XXS,
+
+	// 1-bit quantizations
+	GGMLFileQuantizationType.IQ1_S,
+	GGMLFileQuantizationType.IQ1_M,
+	GGMLFileQuantizationType.TQ1_0,
+	GGMLFileQuantizationType.TQ2_0,
+];
+
+// Finds the nearest quantization type that is less than or equal to the given quantization type.
+// Returns undefined if no such quantization type is found.
+export function findNearestQuantType(
+	quant: GGMLFileQuantizationType,
+	availableQuants: GGMLFileQuantizationType[]
+): GGMLFileQuantizationType | undefined {
+	// Create a map for quick index lookup from the defined order
+	const orderMap = new Map<GGMLFileQuantizationType, number>();
+	GGUF_QUANT_ORDER.forEach((q, index) => {
+		orderMap.set(q, index);
+	});
+
+	const targetIndex = orderMap.get(quant) ?? 0; // the 0 case should never happen
+
+	// Filter the available quantizations to include only those defined in the order map,
+	// then sort them according to GGUF_QUANT_ORDER (from largest/index 0 to smallest/highest index).
+	const sortedAvailable = availableQuants
+		.filter((q) => orderMap.has(q))
+		.sort((a, b) => (orderMap.get(a) ?? Infinity) - (orderMap.get(b) ?? Infinity));
+
+	// If no valid quantizations are available after filtering
+	if (sortedAvailable.length === 0) {
+		return undefined;
+	}
+
+	// Iterate through the sorted available quantizations (largest to smallest).
+	// Find the first one whose order index is >= the target index,
+	// i.e. the largest quantization that is smaller than or equal to the target.
+	for (const availableQuant of sortedAvailable) {
+		// We know the key exists due to the filter above.
+		const availableIndex = orderMap.get(availableQuant)!;
+		if (availableIndex >= targetIndex) {
+			return availableQuant;
+		}
+	}
+
+	// If the loop completes, all available quantizations are larger (have a smaller index)
+	// than the target. In this case, return the "smallest" available quantization,
+	// which is the last element in the sorted list (highest index among available).
+	return sortedAvailable[sortedAvailable.length - 1];
+}
+
+// This list is only used to calculate the size of the model, NOT to be confused with the quantization FILE type
 export enum GGMLQuantizationType {
 	F32 = 0,
 	F16 = 1,

@@ -28,13 +180,6 @@ export enum GGMLQuantizationType {
 	F64 = 28,
 	IQ1_M = 29,
 	BF16 = 30,
-}
-
-const ggufQuants = Object.values(GGMLQuantizationType).filter((v): v is string => typeof v === "string");
-export const GGUF_QUANT_RE = new RegExp(`(?<quant>${ggufQuants.join("|")})` + "(_(?<sizeVariation>[A-Z]+))?");
-export const GGUF_QUANT_RE_GLOBAL = new RegExp(GGUF_QUANT_RE, "g");
-
-export function parseGGUFQuantLabel(fname: string): string | undefined {
-	const quantLabel = fname.toUpperCase().match(GGUF_QUANT_RE_GLOBAL)?.at(-1); // if there is multiple quant substrings in a name, we prefer the last one
-	return quantLabel;
+	TQ1_0 = 34,
+	TQ2_0 = 35,
 }
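
To summarize the selection logic, here is a short sketch of the possible outcomes, following the implementation and the spec tests above (expected return values shown in comments):

```ts
import { findNearestQuantType, GGMLFileQuantizationType as Q } from "@huggingface/tasks";

// Exact match: the target itself is available.
findNearestQuantType(Q.Q8_0, [Q.Q8_0, Q.F16]); // Q.Q8_0

// Otherwise: the largest available quant at or below the target in GGUF_QUANT_ORDER.
findNearestQuantType(Q.Q4_K_M, [Q.Q8_0, Q.Q2_K]); // Q.Q2_K

// If everything available is larger than the target, fall back to the
// smallest available quant (the text + vision pairing case from the PR description).
findNearestQuantType(Q.IQ1_S, [Q.F16, Q.Q8_0]); // Q.Q8_0

// If no available quant appears in GGUF_QUANT_ORDER, undefined is returned.
findNearestQuantType(Q.Q4_K_M, []); // undefined
```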
