
Commit f0518b3

gguf: add CLI (#1221)
Ref discussion: https://huggingface.slack.com/archives/C02CLHA19TL/p1740399079674399?thread_ts=1739968558.574099&cid=C02CLHA19TL

I'm trying with this command:

```bash
pnpm run build && npx . ~/work/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
```

Output:

```
* Dumping 36 key/value pair(s)
  Idx | Count  | Value
  ----|--------|----------------------------------------------------------------------------------
    1 |      1 | version = 3
    2 |      1 | tensor_count = 292
    3 |      1 | kv_count = 33
    4 |      1 | general.architecture = "llama"
    5 |      1 | general.type = "model"
    6 |      1 | general.name = "Meta Llama 3.1 8B Instruct"
    7 |      1 | general.finetune = "Instruct"
    8 |      1 | general.basename = "Meta-Llama-3.1"
    9 |      1 | general.size_label = "8B"
   10 |      1 | general.license = "llama3.1"
   11 |      6 | general.tags = ["facebook","meta","pytorch","llama","llama-3","te...
   12 |      8 | general.languages = ["en","de","fr","it","pt","hi","es","th"]
   13 |      1 | llama.block_count = 32
   14 |      1 | llama.context_length = 131072
   15 |      1 | llama.embedding_length = 4096
   16 |      1 | llama.feed_forward_length = 14336
   17 |      1 | llama.attention.head_count = 32
   18 |      1 | llama.attention.head_count_kv = 8
   19 |      1 | llama.rope.freq_base = 500000
   20 |      1 | llama.attention.layer_norm_rms_epsilon = 0.000009999999747378752
   21 |      1 | general.file_type = 15
   22 |      1 | llama.vocab_size = 128256
   23 |      1 | llama.rope.dimension_count = 128
   24 |      1 | tokenizer.ggml.model = "gpt2"
   25 |      1 | tokenizer.ggml.pre = "llama-bpe"
   26 | 128256 | tokenizer.ggml.tokens = ["!","\"","#","$","%","&","'","(",")","*","+",",",...
   27 | 128256 | tokenizer.ggml.token_type = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1...
   28 | 280147 | tokenizer.ggml.merges = ["Ġ Ġ","Ġ ĠĠĠ","ĠĠ ĠĠ","ĠĠĠ Ġ","i n","Ġ t","Ġ ĠĠĠĠ...
   29 |      1 | tokenizer.ggml.bos_token_id = 128000
   30 |      1 | tokenizer.ggml.eos_token_id = 128009
   31 |      1 | tokenizer.chat_template = "{{- bos_token }}\n{%- if custom_tools is defined ...
   32 |      1 | general.quantization_version = 2
   33 |      1 | quantize.imatrix.file = "/models_out/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-...
   34 |      1 | quantize.imatrix.dataset = "/training_dir/calibration_datav3.txt"
   35 |      1 | quantize.imatrix.entries_count = 224
   36 |      1 | quantize.imatrix.chunks_count = 125

* Dumping 292 tensor(s)
  Idx | Num Elements | Shape                          | Data Type | Name
  ----|--------------|--------------------------------|-----------|--------------------------
    1 |           64 |     64,      1,      1,      1 | F32       | rope_freqs.weight
    2 |    525336576 |   4096, 128256,      1,      1 | Q4_K      | token_embd.weight
    3 |         4096 |   4096,      1,      1,      1 | F32       | blk.0.attn_norm.weight
    4 |     58720256 |  14336,   4096,      1,      1 | Q6_K      | blk.0.ffn_down.weight

...(truncated)
```

---

For reference, here is the output of `gguf_dump.py`:

```
$ python gguf_dump.py ~/work/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
INFO:gguf-dump:* Loading: /Users/ngxson/work/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
* File is LITTLE endian, script is running on a LITTLE endian host.
* Dumping 36 key/value pair(s)
      1: UINT32     |        1 | GGUF.version = 3
      2: UINT64     |        1 | GGUF.tensor_count = 292
      3: UINT64     |        1 | GGUF.kv_count = 33
      4: STRING     |        1 | general.architecture = 'llama'
      5: STRING     |        1 | general.type = 'model'
      6: STRING     |        1 | general.name = 'Meta Llama 3.1 8B Instruct'
      7: STRING     |        1 | general.finetune = 'Instruct'
      8: STRING     |        1 | general.basename = 'Meta-Llama-3.1'
      9: STRING     |        1 | general.size_label = '8B'
     10: STRING     |        1 | general.license = 'llama3.1'
     11: [STRING]   |        6 | general.tags
     12: [STRING]   |        8 | general.languages
     13: UINT32     |        1 | llama.block_count = 32
     14: UINT32     |        1 | llama.context_length = 131072
     15: UINT32     |        1 | llama.embedding_length = 4096
     16: UINT32     |        1 | llama.feed_forward_length = 14336
     17: UINT32     |        1 | llama.attention.head_count = 32
     18: UINT32     |        1 | llama.attention.head_count_kv = 8
     19: FLOAT32    |        1 | llama.rope.freq_base = 500000.0
     20: FLOAT32    |        1 | llama.attention.layer_norm_rms_epsilon = 9.999999747378752e-06
     21: UINT32     |        1 | general.file_type = 15
     22: UINT32     |        1 | llama.vocab_size = 128256
     23: UINT32     |        1 | llama.rope.dimension_count = 128
     24: STRING     |        1 | tokenizer.ggml.model = 'gpt2'
     25: STRING     |        1 | tokenizer.ggml.pre = 'llama-bpe'
     26: [STRING]   |   128256 | tokenizer.ggml.tokens
     27: [INT32]    |   128256 | tokenizer.ggml.token_type
     28: [STRING]   |   280147 | tokenizer.ggml.merges
     29: UINT32     |        1 | tokenizer.ggml.bos_token_id = 128000
     30: UINT32     |        1 | tokenizer.ggml.eos_token_id = 128009
     31: STRING     |        1 | tokenizer.chat_template = '{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- s'
     32: UINT32     |        1 | general.quantization_version = 2
     33: STRING     |        1 | quantize.imatrix.file = '/models_out/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8'
     34: STRING     |        1 | quantize.imatrix.dataset = '/training_dir/calibration_datav3.txt'
     35: INT32      |        1 | quantize.imatrix.entries_count = 224
     36: INT32      |        1 | quantize.imatrix.chunks_count = 125
* Dumping 292 tensor(s)
      1:         64 |     64,      1,      1,      1 | F32     | rope_freqs.weight
      2:  525336576 |   4096, 128256,      1,      1 | Q4_K    | token_embd.weight
      3:       4096 |   4096,      1,      1,      1 | F32     | blk.0.attn_norm.weight
      4:   58720256 |  14336,   4096,      1,      1 | Q6_K    | blk.0.ffn_down.weight
      5:   58720256 |   4096,  14336,      1,      1 | Q4_K    | blk.0.ffn_gate.weight
      6:   58720256 |   4096,  14336,      1,      1 | Q4_K    | blk.0.ffn_up.weight
      7:       4096 |   4096,      1,      1,      1 | F32     | blk.0.ffn_norm.weight
      8:    4194304 |   4096,   1024,      1,      1 | Q4_K    | blk.0.attn_k.weight
```
Parent: 5e4beab

File tree: 3 files changed, +153 −1 lines

packages/gguf/README.md

Lines changed: 46 additions & 0 deletions
````diff
@@ -96,6 +96,52 @@ In case you want to use your own GGUF metadata structure, you can disable strict
 const { metadata, tensorInfos }: GGUFParseOutput<{ strict: false }> = await gguf(URL_LLAMA);
 ```
 
+## Command line interface
+
+This package provides a CLI equivalent to the [`gguf_dump.py`](https://github.com/ggml-org/llama.cpp/blob/7a2c913e66353362d7f28d612fd3c9d51a831eda/gguf-py/gguf/scripts/gguf_dump.py) script. You can dump GGUF metadata and the list of tensors with this command:
+
+```bash
+npx @huggingface/gguf my_model.gguf
+
+# or, with a remote GGUF file:
+# npx @huggingface/gguf https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf
+```
+
+Example output:
+
+```
+* Dumping 36 key/value pair(s)
+  Idx | Count  | Value
+  ----|--------|----------------------------------------------------------------------------------
+    1 |      1 | version = 3
+    2 |      1 | tensor_count = 292
+    3 |      1 | kv_count = 33
+    4 |      1 | general.architecture = "llama"
+    5 |      1 | general.type = "model"
+    6 |      1 | general.name = "Meta Llama 3.1 8B Instruct"
+    7 |      1 | general.finetune = "Instruct"
+    8 |      1 | general.basename = "Meta-Llama-3.1"
+
+[truncated]
+
+* Dumping 292 tensor(s)
+  Idx | Num Elements | Shape                          | Data Type | Name
+  ----|--------------|--------------------------------|-----------|--------------------------
+    1 |           64 |     64,      1,      1,      1 | F32       | rope_freqs.weight
+    2 |    525336576 |   4096, 128256,      1,      1 | Q4_K      | token_embd.weight
+    3 |         4096 |   4096,      1,      1,      1 | F32       | blk.0.attn_norm.weight
+    4 |     58720256 |  14336,   4096,      1,      1 | Q6_K      | blk.0.ffn_down.weight
+
+[truncated]
+```
+
+Alternatively, you can install this package globally, which provides the `gguf-view` command:
+
+```bash
+npm i -g @huggingface/gguf
+gguf-view my_model.gguf
+```
+
 ## Hugging Face Hub
 
 The Hub supports all file formats and has built-in features for GGUF format.
````

packages/gguf/package.json

Lines changed: 5 additions & 1 deletion
```diff
@@ -10,6 +10,9 @@
 	"main": "./dist/index.js",
 	"module": "./dist/index.mjs",
 	"types": "./dist/index.d.ts",
+	"bin": {
+		"gguf-view": "./dist/cli.js"
+	},
 	"exports": {
 		".": {
 			"types": "./dist/index.d.ts",
@@ -18,6 +21,7 @@
 		}
 	},
 	"browser": {
+		"./src/cli.ts": false,
 		"./src/utils/FileBlob.ts": false,
 		"./dist/index.js": "./dist/browser/index.js",
 		"./dist/index.mjs": "./dist/browser/index.mjs"
@@ -32,7 +36,7 @@
 		"format": "prettier --write .",
 		"format:check": "prettier --check .",
 		"prepublishOnly": "pnpm run build",
-		"build": "tsup src/index.ts --format cjs,esm --clean && tsc --emitDeclarationOnly --declaration",
+		"build": "tsup src/index.ts src/cli.ts --format cjs,esm --clean && tsc --emitDeclarationOnly --declaration",
 		"build:llm": "tsx scripts/generate-llm.ts && pnpm run format",
 		"test": "vitest run",
 		"check": "tsc"
```

packages/gguf/src/cli.ts

Lines changed: 102 additions & 0 deletions
New file:

```ts
#!/usr/bin/env node

import { GGMLQuantizationType, gguf } from ".";

interface PrintColumnHeader {
	name: string;
	maxWidth?: number;
	alignRight?: boolean;
}

// Reverse mapping of the GGMLQuantizationType enum: numeric dtype value -> name (e.g. "Q4_K")
const mapDtypeToName = Object.fromEntries(Object.entries(GGMLQuantizationType).map(([name, value]) => [value, name]));

async function main() {
	const ggufPath = process.argv[2];
	const { metadata, tensorInfos } = await gguf(ggufPath, {
		allowLocalFile: true,
	});

	// TODO: print info about endianness
	console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
	printTable(
		[
			{ name: "Idx", alignRight: true },
			// { name: 'Type' }, // TODO: support this
			{ name: "Count", alignRight: true },
			{ name: "Value" },
		],
		Object.entries(metadata).map(([key, value], i) => {
			const MAX_LEN = 50;
			let strVal = "";
			let count = 1;
			if (Array.isArray(value)) {
				strVal = JSON.stringify(value);
				count = value.length;
			} else if (value instanceof String || typeof value === "string") {
				strVal = JSON.stringify(value);
			} else {
				strVal = value.toString();
			}
			strVal = strVal.length > MAX_LEN ? strVal.slice(0, MAX_LEN) + "..." : strVal;
			return [(i + 1).toString(), count.toString(), `${key} = ${strVal}`];
		})
	);

	console.log();
	console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
	printTable(
		[
			{ name: "Idx", alignRight: true },
			{ name: "Num Elements", alignRight: true },
			{ name: "Shape" },
			{ name: "Data Type" },
			{ name: "Name" },
		],
		tensorInfos.map((tensorInfo, i) => {
			// Pad the shape out to 4 dimensions for a uniform display
			const shape = [1n, 1n, 1n, 1n];
			tensorInfo.shape.forEach((dim, i) => {
				shape[i] = dim;
			});
			return [
				(i + 1).toString(),
				shape.reduce((acc, n) => acc * n, 1n).toString(),
				shape.map((n) => n.toString().padStart(6)).join(", "),
				mapDtypeToName[tensorInfo.dtype],
				tensorInfo.name,
			];
		})
	);
}

function printTable(header: PrintColumnHeader[], rows: string[][], leftPad = 2) {
	const leftPadStr = " ".repeat(leftPad);

	// Calculate column widths
	const columnWidths = header.map((h, i) => {
		const maxContentWidth = Math.max(h.name.length, ...rows.map((row) => (row[i] || "").length));
		return h.maxWidth ? Math.min(maxContentWidth, h.maxWidth) : maxContentWidth;
	});

	// Print header
	const headerLine = header
		.map((h, i) => {
			return h.name.padEnd(columnWidths[i]);
		})
		.join(" | ");
	console.log(leftPadStr + headerLine);

	// Print separator
	console.log(leftPadStr + columnWidths.map((w) => "-".repeat(w)).join("-|-"));

	// Print rows
	for (const row of rows) {
		const line = header
			.map((h, i) => {
				return h.alignRight ? (row[i] || "").padStart(columnWidths[i]) : (row[i] || "").padEnd(columnWidths[i]);
			})
			.join(" | ");
		console.log(leftPadStr + line);
	}
}

main();
```
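
The CLI is a thin wrapper around the package's `gguf()` parser, so the same information is available programmatically. A minimal sketch, assuming `@huggingface/gguf` is installed as a dependency and run as an ES module (`my_model.gguf` is a placeholder):

```ts
// Sketch: reproduce the CLI's two summary lines via the public API.
// gguf() with allowLocalFile: true reads a local path, exactly as cli.ts does above.
import { gguf } from "@huggingface/gguf";

const { metadata, tensorInfos } = await gguf("my_model.gguf", { allowLocalFile: true });
console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
```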
