Skip to content

Commit 1140e0c

Browse files
snowyu and ngxson authored
feat: add GGMLFileQuantizationType and apply to test (huggingface#806)
@mishig25 that's it for huggingface#794 --------- Co-authored-by: Xuan Son Nguyen <[email protected]>
1 parent 6cd5358 commit 1140e0c

File tree

3 files changed

+51
-10
lines changed

3 files changed

+51
-10
lines changed

packages/gguf/src/gguf.spec.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import { beforeAll, describe, expect, it } from "vitest";
22
import type { GGUFParseOutput } from "./gguf";
3-
import { GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf";
3+
import { GGMLFileQuantizationType, GGMLQuantizationType, gguf, ggufAllShards, parseGgufShardFilename } from "./gguf";
44
import fs from "node:fs";
55

66
const URL_LLAMA = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/191239b/llama-2-7b-chat.Q2_K.gguf";
@@ -21,9 +21,11 @@ describe("gguf", () => {
2121
if (!fs.existsSync(".cache")) {
2222
fs.mkdirSync(".cache");
2323
}
24-
const res = await fetch(URL_BIG_METADATA);
25-
const arrayBuf = await res.arrayBuffer();
26-
fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf));
24+
if (!fs.existsSync(".cache/model.gguf")) {
25+
const res = await fetch(URL_BIG_METADATA);
26+
const arrayBuf = await res.arrayBuffer();
27+
fs.writeFileSync(".cache/model.gguf", Buffer.from(arrayBuf));
28+
}
2729
});
2830

2931
it("should parse a llama2 7b", async () => {
@@ -37,7 +39,7 @@ describe("gguf", () => {
3739
tensor_count: 291n,
3840
kv_count: 19n,
3941
"general.architecture": "llama",
40-
"general.file_type": 10,
42+
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q2_K,
4143
"general.name": "LLaMA v2",
4244
"general.quantization_version": 2,
4345
"llama.attention.head_count": 32,
@@ -96,7 +98,7 @@ describe("gguf", () => {
9698
tensor_count: 291n,
9799
kv_count: 24n,
98100
"general.architecture": "llama",
99-
"general.file_type": 17,
101+
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q5_K_M,
100102
"general.name": "mistralai_mistral-7b-instruct-v0.2",
101103
"general.quantization_version": 2,
102104
"llama.attention.head_count": 32,
@@ -134,7 +136,7 @@ describe("gguf", () => {
134136
tensor_count: 164n,
135137
kv_count: 21n,
136138
"general.architecture": "gemma",
137-
"general.file_type": GGMLQuantizationType.Q8_K, // 15
139+
"general.file_type": GGMLFileQuantizationType.MOSTLY_Q4_K_M,
138140
"general.name": "gemma-2b-it",
139141
"general.quantization_version": 2,
140142
"gemma.attention.head_count": 8,
@@ -171,7 +173,7 @@ describe("gguf", () => {
171173
tensor_count: 197n,
172174
kv_count: 23n,
173175
"general.architecture": "bert",
174-
"general.file_type": GGMLQuantizationType.F16,
176+
"general.file_type": GGMLFileQuantizationType.MOSTLY_F16,
175177
"general.name": "bge-small-en-v1.5",
176178
"bert.attention.causal": false,
177179
"bert.attention.head_count": 12,

packages/gguf/src/gguf.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { isBackend } from "./utils/isBackend";
44
import { promisesQueue } from "./utils/promisesQueue";
55

66
export type { MetadataBaseValue, MetadataValue, Version, GGUFMetadata, GGUFTensorInfo, GGUFParseOutput } from "./types";
7-
export { GGUFValueType, GGMLQuantizationType, Architecture } from "./types";
7+
export { GGUFValueType, GGMLFileQuantizationType, GGMLQuantizationType, Architecture } from "./types";
88
export { GGUF_QUANT_DESCRIPTIONS } from "./quant-descriptions";
99

1010
export const RE_GGUF_FILE = /\.gguf$/;

packages/gguf/src/types.ts

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,45 @@ export type MetadataValue = MetadataBaseValue | MetadataBaseValue[] | MetadataVa
66

77
export type Version = 1 | 2 | 3;
88

9+
export enum GGMLFileQuantizationType {
10+
MOSTLY_F32 = 0,
11+
MOSTLY_F16 = 1,
12+
MOSTLY_Q4_0 = 2,
13+
MOSTLY_Q4_1 = 3,
14+
MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
15+
// MOSTLY_Q4_2 = 5, // support has been removed
16+
// MOSTLY_Q4_3 = 6, // support has been removed
17+
MOSTLY_Q8_0 = 7,
18+
MOSTLY_Q5_0 = 8,
19+
MOSTLY_Q5_1 = 9,
20+
MOSTLY_Q2_K = 10,
21+
MOSTLY_Q3_K_S = 11,
22+
MOSTLY_Q3_K_M = 12,
23+
MOSTLY_Q3_K_L = 13,
24+
MOSTLY_Q4_K_S = 14,
25+
MOSTLY_Q4_K_M = 15,
26+
MOSTLY_Q5_K_S = 16,
27+
MOSTLY_Q5_K_M = 17,
28+
MOSTLY_Q6_K = 18,
29+
MOSTLY_IQ2_XXS = 19,
30+
MOSTLY_IQ2_XS = 20,
31+
MOSTLY_Q2_K_S = 21,
32+
MOSTLY_IQ3_XS = 22,
33+
MOSTLY_IQ3_XXS = 23,
34+
MOSTLY_IQ1_S = 24,
35+
MOSTLY_IQ4_NL = 25,
36+
MOSTLY_IQ3_S = 26,
37+
MOSTLY_IQ3_M = 27,
38+
MOSTLY_IQ2_S = 28,
39+
MOSTLY_IQ2_M = 29,
40+
MOSTLY_IQ4_XS = 30,
41+
MOSTLY_IQ1_M = 31,
42+
MOSTLY_BF16 = 32,
43+
MOSTLY_Q4_0_4_4 = 33,
44+
MOSTLY_Q4_0_4_8 = 34,
45+
MOSTLY_Q4_0_8_8 = 35,
46+
}
47+
948
export enum GGMLQuantizationType {
1049
F32 = 0,
1150
F16 = 1,
@@ -60,7 +99,7 @@ export type Architecture = (typeof ARCHITECTURES)[number];
6099
export interface GGUFGeneralInfo<TArchitecture extends Architecture> {
61100
"general.architecture": TArchitecture;
62101
"general.name"?: string;
63-
"general.file_type"?: number;
102+
"general.file_type"?: GGMLFileQuantizationType;
64103
"general.quantization_version"?: number;
65104
}
66105

0 commit comments

Comments (0)