Skip to content

Commit 91c0941

Browse files
committed
gguf: add memory calculator to CLI
1 parent 9abb7f5 commit 91c0941

File tree

2 files changed

+150
-22
lines changed

2 files changed

+150
-22
lines changed

packages/gguf/src/cli.ts

Lines changed: 110 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env node
22

3-
import { GGMLQuantizationType, gguf } from ".";
3+
import { GGMLQuantizationType, gguf, GGUFParseOutput } from ".";
4+
import { GGML_QUANT_SIZES } from "./quant-descriptions";
45

56
interface PrintColumnHeader {
67
name: string;
@@ -11,7 +12,21 @@ interface PrintColumnHeader {
1112
const mapDtypeToName = Object.fromEntries(Object.entries(GGMLQuantizationType).map(([name, value]) => [value, name]));
1213

1314
async function main() {
14-
const ggufPath = process.argv[2];
15+
let ggufPath = "";
16+
let showTensors = false;
17+
for (let i = 2; i < process.argv.length; i++) {
18+
if (process.argv[i] === "--show-tensor") {
19+
showTensors = true;
20+
} else {
21+
ggufPath = process.argv[i];
22+
}
23+
}
24+
25+
if (!ggufPath.length) {
26+
console.error("Usage: gguf-view [--show-tensor] <path/to/gguf>");
27+
process.exit(1);
28+
}
29+
1530
const { metadata, tensorInfos } = await gguf(ggufPath, {
1631
allowLocalFile: true,
1732
});
@@ -43,29 +58,102 @@ async function main() {
4358
);
4459

4560
console.log();
46-
console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
61+
console.log(`* Memory usage estimation`);
62+
const kvUsage = calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], 4096);
63+
let modelWeightInBytes = 0;
64+
for (const tensorInfo of tensorInfos) {
65+
const nElem = Number(tensorInfo.shape.reduce((a, b) => a * b, 1n));
66+
const tensorSizeInBytes = nElem * (GGML_QUANT_SIZES[tensorInfo.dtype] / 8);
67+
modelWeightInBytes += tensorSizeInBytes;
68+
}
69+
const overhead =
70+
calcMemoryUsage(metadata as GGUFParseOutput<{ strict: false }>["metadata"], 256).totalBytes +
71+
modelWeightInBytes * 0.05;
72+
const totalMemoryUsage = kvUsage.totalBytes + overhead + modelWeightInBytes;
4773
printTable(
74+
[{ name: "Item" }, { name: "Memory usage", alignRight: true }],
4875
[
49-
{ name: "Idx", alignRight: true },
50-
{ name: "Num Elements", alignRight: true },
51-
{ name: "Shape" },
52-
{ name: "Data Type" },
53-
{ name: "Name" },
54-
],
55-
tensorInfos.map((tensorInfo, i) => {
56-
const shape = [1n, 1n, 1n, 1n];
57-
tensorInfo.shape.forEach((dim, i) => {
58-
shape[i] = dim;
59-
});
60-
return [
61-
(i + 1).toString(),
62-
shape.reduce((acc, n) => acc * n, 1n).toString(),
63-
shape.map((n) => n.toString().padStart(6)).join(", "),
64-
mapDtypeToName[tensorInfo.dtype],
65-
tensorInfo.name,
66-
];
67-
})
76+
["K cache", (kvUsage.totalBytesK / 1e9).toFixed(2) + " GB"],
77+
["V cache", (kvUsage.totalBytesV / 1e9).toFixed(2) + " GB"],
78+
["Weight", (modelWeightInBytes / 1e9).toFixed(2) + " GB"],
79+
["Overhead", (overhead / 1e9).toFixed(2) + " GB"],
80+
["", "---"],
81+
["TOTAL", (totalMemoryUsage / 1e9).toFixed(2) + " GB"],
82+
]
6883
);
84+
85+
if (showTensors) {
86+
console.log();
87+
console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
88+
printTable(
89+
[
90+
{ name: "Idx", alignRight: true },
91+
{ name: "Num Elements", alignRight: true },
92+
{ name: "Shape" },
93+
{ name: "Data Type" },
94+
{ name: "Name" },
95+
],
96+
tensorInfos.map((tensorInfo, i) => {
97+
const shape = [1n, 1n, 1n, 1n];
98+
tensorInfo.shape.forEach((dim, i) => {
99+
shape[i] = dim;
100+
});
101+
return [
102+
(i + 1).toString(),
103+
shape.reduce((acc, n) => acc * n, 1n).toString(),
104+
shape.map((n) => n.toString().padStart(6)).join(", "),
105+
mapDtypeToName[tensorInfo.dtype],
106+
tensorInfo.name,
107+
];
108+
})
109+
);
110+
} else {
111+
console.log();
112+
console.log(`* Use --show-tensor to display tensor information`);
113+
}
114+
}
115+
116+
function calcMemoryUsage(
117+
metadata: GGUFParseOutput<{ strict: false }>["metadata"],
118+
kvSize: number,
119+
kvTypeK: GGMLQuantizationType = GGMLQuantizationType.F16,
120+
kvTypeV: GGMLQuantizationType = GGMLQuantizationType.F16
121+
) {
122+
const arch = metadata["general.architecture"] ?? "unknown";
123+
const n_embd = (metadata[`${arch}.embedding_length`] as number) ?? 0;
124+
const n_head = (metadata[`${arch}.attention.head_count`] as number) ?? 0;
125+
const n_embd_head_k = (metadata[`${arch}.attention.key_length`] as number) ?? n_embd / n_head;
126+
const n_embd_head_v = (metadata[`${arch}.attention.value_length`] as number) ?? n_embd / n_head;
127+
const n_head_kv = (metadata[`${arch}.attention.head_count_kv`] as number[] | number) ?? [];
128+
const n_layer = (metadata[`${arch}.block_count`] as number) ?? 0;
129+
130+
const n_head_kv_arr = Array(n_layer).fill(n_head);
131+
if (Array.isArray(n_head_kv)) {
132+
for (let i = 0; i < n_layer; i++) {
133+
if (n_head_kv[i]) {
134+
n_head_kv_arr[i] = n_head_kv[i];
135+
}
136+
}
137+
} else {
138+
for (let i = 0; i < n_layer; i++) {
139+
n_head_kv_arr[i] = n_head_kv;
140+
}
141+
}
142+
143+
let totalElemsK = 0;
144+
let totalElemsV = 0;
145+
for (let i = 0; i < n_layer; i++) {
146+
const n_embd_k_gqa = n_embd_head_k * n_head_kv_arr[i];
147+
const n_embd_v_gqa = n_embd_head_v * n_head_kv_arr[i];
148+
totalElemsK += n_embd_k_gqa * kvSize;
149+
totalElemsV += n_embd_v_gqa * kvSize;
150+
}
151+
152+
return {
153+
totalBytesK: totalElemsK * (GGML_QUANT_SIZES[kvTypeK] / 8),
154+
totalBytesV: totalElemsV * (GGML_QUANT_SIZES[kvTypeV] / 8),
155+
totalBytes: (totalElemsK + totalElemsV) * (GGML_QUANT_SIZES[kvTypeV] / 8),
156+
};
69157
}
70158

71159
function printTable(header: PrintColumnHeader[], rows: string[][], leftPad = 2) {

packages/gguf/src/quant-descriptions.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,3 +125,43 @@ export const GGUF_QUANT_DESCRIPTIONS: Record<GGMLQuantizationType, { txt: string
125125
src_url: "https://en.wikipedia.org/wiki/Bfloat16_floating-point_format",
126126
},
127127
};
128+
129+
const QK_K = 256;
130+
const calcBPW = (blockSize: number, typeSize: number) => {
131+
return (typeSize * 8) / blockSize;
132+
};
133+
134+
// map quantization type to element size in bits per weight (example: Q4_K -> 4.5 bpw)
135+
export const GGML_QUANT_SIZES = {
136+
[GGMLQuantizationType.F32]: calcBPW(1, 4),
137+
[GGMLQuantizationType.F16]: calcBPW(1, 2),
138+
[GGMLQuantizationType.Q4_0]: calcBPW(32, 2 + 16),
139+
[GGMLQuantizationType.Q4_1]: calcBPW(32, 2 + 2 + 16),
140+
[GGMLQuantizationType.Q5_0]: calcBPW(32, 2 + 4 + 16),
141+
[GGMLQuantizationType.Q5_1]: calcBPW(32, 2 + 2 + 4 + 16),
142+
[GGMLQuantizationType.Q8_0]: calcBPW(32, 2 + 32),
143+
[GGMLQuantizationType.Q8_1]: calcBPW(32, 4 + 4 + 32),
144+
[GGMLQuantizationType.Q2_K]: calcBPW(256, 2 + 2 + QK_K / 16 + QK_K / 4),
145+
[GGMLQuantizationType.Q3_K]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + 12),
146+
[GGMLQuantizationType.Q4_K]: calcBPW(256, 2 + 2 + QK_K / 2 + 12),
147+
[GGMLQuantizationType.Q5_K]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 8 + 12),
148+
[GGMLQuantizationType.Q6_K]: calcBPW(256, 2 + QK_K / 2 + QK_K / 4 + QK_K / 16),
149+
[GGMLQuantizationType.Q8_K]: calcBPW(256, 4 + QK_K + QK_K / 8),
150+
[GGMLQuantizationType.IQ2_XXS]: calcBPW(256, 2 + QK_K / 4),
151+
[GGMLQuantizationType.IQ2_XS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 32),
152+
[GGMLQuantizationType.IQ3_XXS]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8),
153+
[GGMLQuantizationType.IQ1_S]: calcBPW(256, 2 + QK_K / 8 + QK_K / 16),
154+
[GGMLQuantizationType.IQ4_NL]: calcBPW(32, 2 + 16),
155+
[GGMLQuantizationType.IQ3_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 8 + QK_K / 32 + 4),
156+
[GGMLQuantizationType.IQ2_S]: calcBPW(256, 2 + QK_K / 4 + QK_K / 16),
157+
[GGMLQuantizationType.IQ4_XS]: calcBPW(256, 2 + 2 + QK_K / 2 + QK_K / 64),
158+
[GGMLQuantizationType.I8]: calcBPW(1, 1),
159+
[GGMLQuantizationType.I16]: calcBPW(1, 2),
160+
[GGMLQuantizationType.I32]: calcBPW(1, 4),
161+
[GGMLQuantizationType.I64]: calcBPW(1, 8),
162+
[GGMLQuantizationType.F64]: calcBPW(1, 8),
163+
[GGMLQuantizationType.IQ1_M]: calcBPW(256, QK_K / 8 + QK_K / 16 + QK_K / 32),
164+
[GGMLQuantizationType.BF16]: calcBPW(1, 2),
165+
// [GGMLQuantizationType.TQ1_0]: calcBPW(256, 2 + 4 * 13),
166+
// [GGMLQuantizationType.TQ2_0]: calcBPW(256, 2 + 64),
167+
};

0 commit comments

Comments (0)