
Commit f0518b3

gguf: add CLI (#1221)
Ref discussion: https://huggingface.slack.com/archives/C02CLHA19TL/p1740399079674399?thread_ts=1739968558.574099&cid=C02CLHA19TL

I'm trying with this command:

```bash
pnpm run build && npx . ~/work/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
```

Output:

```
* Dumping 36 key/value pair(s)
  Idx | Count  | Value
  ----|--------|----------------------------------------------------------------------------------
    1 |      1 | version = 3
    2 |      1 | tensor_count = 292
    3 |      1 | kv_count = 33
    4 |      1 | general.architecture = "llama"
    5 |      1 | general.type = "model"
    6 |      1 | general.name = "Meta Llama 3.1 8B Instruct"
    7 |      1 | general.finetune = "Instruct"
    8 |      1 | general.basename = "Meta-Llama-3.1"
    9 |      1 | general.size_label = "8B"
   10 |      1 | general.license = "llama3.1"
   11 |      6 | general.tags = ["facebook","meta","pytorch","llama","llama-3","te...
   12 |      8 | general.languages = ["en","de","fr","it","pt","hi","es","th"]
   13 |      1 | llama.block_count = 32
   14 |      1 | llama.context_length = 131072
   15 |      1 | llama.embedding_length = 4096
   16 |      1 | llama.feed_forward_length = 14336
   17 |      1 | llama.attention.head_count = 32
   18 |      1 | llama.attention.head_count_kv = 8
   19 |      1 | llama.rope.freq_base = 500000
   20 |      1 | llama.attention.layer_norm_rms_epsilon = 0.000009999999747378752
   21 |      1 | general.file_type = 15
   22 |      1 | llama.vocab_size = 128256
   23 |      1 | llama.rope.dimension_count = 128
   24 |      1 | tokenizer.ggml.model = "gpt2"
   25 |      1 | tokenizer.ggml.pre = "llama-bpe"
   26 | 128256 | tokenizer.ggml.tokens = ["!","\"","#","$","%","&","'","(",")","*","+",",",...
   27 | 128256 | tokenizer.ggml.token_type = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1...
   28 | 280147 | tokenizer.ggml.merges = ["Ġ Ġ","Ġ ĠĠĠ","ĠĠ ĠĠ","ĠĠĠ Ġ","i n","Ġ t","Ġ ĠĠĠĠ...
   29 |      1 | tokenizer.ggml.bos_token_id = 128000
   30 |      1 | tokenizer.ggml.eos_token_id = 128009
   31 |      1 | tokenizer.chat_template = "{{- bos_token }}\n{%- if custom_tools is defined ...
   32 |      1 | general.quantization_version = 2
   33 |      1 | quantize.imatrix.file = "/models_out/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-...
   34 |      1 | quantize.imatrix.dataset = "/training_dir/calibration_datav3.txt"
   35 |      1 | quantize.imatrix.entries_count = 224
   36 |      1 | quantize.imatrix.chunks_count = 125

* Dumping 292 tensor(s)
  Idx | Num Elements | Shape                          | Data Type | Name
  ----|--------------|--------------------------------|-----------|--------------------------
    1 |           64 |     64,      1,      1,      1 | F32       | rope_freqs.weight
    2 |    525336576 |   4096, 128256,      1,      1 | Q4_K      | token_embd.weight
    3 |         4096 |   4096,      1,      1,      1 | F32       | blk.0.attn_norm.weight
    4 |     58720256 |  14336,   4096,      1,      1 | Q6_K      | blk.0.ffn_down.weight

...(truncated)
```

---

For reference, here is the output of `gguf_dump.py`:

```
$ python gguf_dump.py ~/work/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
INFO:gguf-dump:* Loading: /Users/ngxson/work/models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf
* File is LITTLE endian, script is running on a LITTLE endian host.
* Dumping 36 key/value pair(s)
      1: UINT32     |        1 | GGUF.version = 3
      2: UINT64     |        1 | GGUF.tensor_count = 292
      3: UINT64     |        1 | GGUF.kv_count = 33
      4: STRING     |        1 | general.architecture = 'llama'
      5: STRING     |        1 | general.type = 'model'
      6: STRING     |        1 | general.name = 'Meta Llama 3.1 8B Instruct'
      7: STRING     |        1 | general.finetune = 'Instruct'
      8: STRING     |        1 | general.basename = 'Meta-Llama-3.1'
      9: STRING     |        1 | general.size_label = '8B'
     10: STRING     |        1 | general.license = 'llama3.1'
     11: [STRING]   |        6 | general.tags
     12: [STRING]   |        8 | general.languages
     13: UINT32     |        1 | llama.block_count = 32
     14: UINT32     |        1 | llama.context_length = 131072
     15: UINT32     |        1 | llama.embedding_length = 4096
     16: UINT32     |        1 | llama.feed_forward_length = 14336
     17: UINT32     |        1 | llama.attention.head_count = 32
     18: UINT32     |        1 | llama.attention.head_count_kv = 8
     19: FLOAT32    |        1 | llama.rope.freq_base = 500000.0
     20: FLOAT32    |        1 | llama.attention.layer_norm_rms_epsilon = 9.999999747378752e-06
     21: UINT32     |        1 | general.file_type = 15
     22: UINT32     |        1 | llama.vocab_size = 128256
     23: UINT32     |        1 | llama.rope.dimension_count = 128
     24: STRING     |        1 | tokenizer.ggml.model = 'gpt2'
     25: STRING     |        1 | tokenizer.ggml.pre = 'llama-bpe'
     26: [STRING]   |   128256 | tokenizer.ggml.tokens
     27: [INT32]    |   128256 | tokenizer.ggml.token_type
     28: [STRING]   |   280147 | tokenizer.ggml.merges
     29: UINT32     |        1 | tokenizer.ggml.bos_token_id = 128000
     30: UINT32     |        1 | tokenizer.ggml.eos_token_id = 128009
     31: STRING     |        1 | tokenizer.chat_template = '{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- s'
     32: UINT32     |        1 | general.quantization_version = 2
     33: STRING     |        1 | quantize.imatrix.file = '/models_out/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8'
     34: STRING     |        1 | quantize.imatrix.dataset = '/training_dir/calibration_datav3.txt'
     35: INT32      |        1 | quantize.imatrix.entries_count = 224
     36: INT32      |        1 | quantize.imatrix.chunks_count = 125
* Dumping 292 tensor(s)
      1:         64 |     64,      1,      1,      1 | F32     | rope_freqs.weight
      2:  525336576 |   4096, 128256,      1,      1 | Q4_K    | token_embd.weight
      3:       4096 |   4096,      1,      1,      1 | F32     | blk.0.attn_norm.weight
      4:   58720256 |  14336,   4096,      1,      1 | Q6_K    | blk.0.ffn_down.weight
      5:   58720256 |   4096,  14336,      1,      1 | Q4_K    | blk.0.ffn_gate.weight
      6:   58720256 |   4096,  14336,      1,      1 | Q4_K    | blk.0.ffn_up.weight
      7:       4096 |   4096,      1,      1,      1 | F32     | blk.0.ffn_norm.weight
      8:    4194304 |   4096,   1024,      1,      1 | Q4_K    | blk.0.attn_k.weight
```
Parent: 5e4beab

File tree: 3 files changed, +153 −1 lines

packages/gguf/README.md

Lines changed: 46 additions & 0 deletions
````diff
@@ -96,6 +96,52 @@ In case you want to use your own GGUF metadata structure, you can disable strict
 const { metadata, tensorInfos }: GGUFParseOutput<{ strict: false }> = await gguf(URL_LLAMA);
 ```
 
+## Command line interface
+
+This package provides a CLI equivalent to the [`gguf_dump.py`](https://github.com/ggml-org/llama.cpp/blob/7a2c913e66353362d7f28d612fd3c9d51a831eda/gguf-py/gguf/scripts/gguf_dump.py) script. You can dump GGUF metadata and the list of tensors with this command:
+
+```bash
+npx @huggingface/gguf my_model.gguf
+
+# or, with a remote GGUF file:
+# npx @huggingface/gguf https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf
+```
+
+Example output:
+
+```
+* Dumping 36 key/value pair(s)
+  Idx | Count  | Value
+  ----|--------|----------------------------------------------------------------------------------
+    1 |      1 | version = 3
+    2 |      1 | tensor_count = 292
+    3 |      1 | kv_count = 33
+    4 |      1 | general.architecture = "llama"
+    5 |      1 | general.type = "model"
+    6 |      1 | general.name = "Meta Llama 3.1 8B Instruct"
+    7 |      1 | general.finetune = "Instruct"
+    8 |      1 | general.basename = "Meta-Llama-3.1"
+
+[truncated]
+
+* Dumping 292 tensor(s)
+  Idx | Num Elements | Shape                          | Data Type | Name
+  ----|--------------|--------------------------------|-----------|--------------------------
+    1 |           64 |     64,      1,      1,      1 | F32       | rope_freqs.weight
+    2 |    525336576 |   4096, 128256,      1,      1 | Q4_K      | token_embd.weight
+    3 |         4096 |   4096,      1,      1,      1 | F32       | blk.0.attn_norm.weight
+    4 |     58720256 |  14336,   4096,      1,      1 | Q6_K      | blk.0.ffn_down.weight
+
+[truncated]
+```
+
+Alternatively, you can install this package globally, which provides the `gguf-view` command:
+
+```bash
+npm i -g @huggingface/gguf
+gguf-view my_model.gguf
+```
+
 ## Hugging Face Hub
 
 The Hub supports all file formats and has built-in features for GGUF format.
````

packages/gguf/package.json

Lines changed: 5 additions & 1 deletion
```diff
@@ -10,6 +10,9 @@
 	"main": "./dist/index.js",
 	"module": "./dist/index.mjs",
 	"types": "./dist/index.d.ts",
+	"bin": {
+		"gguf-view": "./dist/cli.js"
+	},
 	"exports": {
 		".": {
 			"types": "./dist/index.d.ts",
@@ -18,6 +21,7 @@
 		}
 	},
 	"browser": {
+		"./src/cli.ts": false,
 		"./src/utils/FileBlob.ts": false,
 		"./dist/index.js": "./dist/browser/index.js",
 		"./dist/index.mjs": "./dist/browser/index.mjs"
@@ -32,7 +36,7 @@
 		"format": "prettier --write .",
 		"format:check": "prettier --check .",
 		"prepublishOnly": "pnpm run build",
-		"build": "tsup src/index.ts --format cjs,esm --clean && tsc --emitDeclarationOnly --declaration",
+		"build": "tsup src/index.ts src/cli.ts --format cjs,esm --clean && tsc --emitDeclarationOnly --declaration",
 		"build:llm": "tsx scripts/generate-llm.ts && pnpm run format",
 		"test": "vitest run",
 		"check": "tsc"
```

packages/gguf/src/cli.ts

Lines changed: 102 additions & 0 deletions
New file:

```ts
#!/usr/bin/env node

import { GGMLQuantizationType, gguf } from ".";

interface PrintColumnHeader {
	name: string;
	maxWidth?: number;
	alignRight?: boolean;
}

// Reverse mapping of the GGMLQuantizationType enum: numeric dtype value -> name (e.g. "Q4_K")
const mapDtypeToName = Object.fromEntries(Object.entries(GGMLQuantizationType).map(([name, value]) => [value, name]));

async function main() {
	const ggufPath = process.argv[2];
	const { metadata, tensorInfos } = await gguf(ggufPath, {
		allowLocalFile: true,
	});

	// TODO: print info about endianness
	console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
	printTable(
		[
			{ name: "Idx", alignRight: true },
			// { name: 'Type' }, // TODO: support this
			{ name: "Count", alignRight: true },
			{ name: "Value" },
		],
		Object.entries(metadata).map(([key, value], i) => {
			const MAX_LEN = 50;
			let strVal = "";
			let count = 1;
			if (Array.isArray(value)) {
				strVal = JSON.stringify(value);
				count = value.length;
			} else if (value instanceof String || typeof value === "string") {
				strVal = JSON.stringify(value);
			} else {
				strVal = value.toString();
			}
			strVal = strVal.length > MAX_LEN ? strVal.slice(0, MAX_LEN) + "..." : strVal;
			return [(i + 1).toString(), count.toString(), `${key} = ${strVal}`];
		})
	);

	console.log();
	console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
	printTable(
		[
			{ name: "Idx", alignRight: true },
			{ name: "Num Elements", alignRight: true },
			{ name: "Shape" },
			{ name: "Data Type" },
			{ name: "Name" },
		],
		tensorInfos.map((tensorInfo, i) => {
			// Pad the shape out to 4 dimensions for a uniform display
			const shape = [1n, 1n, 1n, 1n];
			tensorInfo.shape.forEach((dim, i) => {
				shape[i] = dim;
			});
			return [
				(i + 1).toString(),
				shape.reduce((acc, n) => acc * n, 1n).toString(),
				shape.map((n) => n.toString().padStart(6)).join(", "),
				mapDtypeToName[tensorInfo.dtype],
				tensorInfo.name,
			];
		})
	);
}

function printTable(header: PrintColumnHeader[], rows: string[][], leftPad = 2) {
	const leftPadStr = " ".repeat(leftPad);

	// Calculate column widths
	const columnWidths = header.map((h, i) => {
		const maxContentWidth = Math.max(h.name.length, ...rows.map((row) => (row[i] || "").length));
		return h.maxWidth ? Math.min(maxContentWidth, h.maxWidth) : maxContentWidth;
	});

	// Print header
	const headerLine = header
		.map((h, i) => {
			return h.name.padEnd(columnWidths[i]);
		})
		.join(" | ");
	console.log(leftPadStr + headerLine);

	// Print separator
	console.log(leftPadStr + columnWidths.map((w) => "-".repeat(w)).join("-|-"));

	// Print rows
	for (const row of rows) {
		const line = header
			.map((h, i) => {
				return h.alignRight ? (row[i] || "").padStart(columnWidths[i]) : (row[i] || "").padEnd(columnWidths[i]);
			})
			.join(" | ");
		console.log(leftPadStr + line);
	}
}

main();
```
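
The CLI is a thin wrapper around the package's `gguf()` parser, so the same information is available programmatically. A minimal sketch, assuming `@huggingface/gguf` is installed as a dependency and run as an ES module (`my_model.gguf` is a placeholder):

```ts
// Sketch: reproduce the CLI's two summary lines via the public API.
// gguf() with allowLocalFile: true reads a local path, exactly as cli.ts does above.
import { gguf } from "@huggingface/gguf";

const { metadata, tensorInfos } = await gguf("my_model.gguf", { allowLocalFile: true });
console.log(`* Dumping ${Object.keys(metadata).length} key/value pair(s)`);
console.log(`* Dumping ${tensorInfos.length} tensor(s)`);
```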
