
Commit 0a28796

Update README.md
1 parent 56f5295 commit 0a28796


README.md

Lines changed: 19 additions & 14 deletions
@@ -8,7 +8,7 @@ The default behavior for CPU only operations is unchanged. When a GPU is present

## llama-bench
### No AMX
-
+```
numactl -N 2 -m 2 llama-bench -m /Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -t 32 --numa numactl -ngl 10 -nopo 1 -b 512 -ub 512 -pg 512,512 --repetitions 3
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
@@ -20,9 +20,10 @@ ggml_cuda_init: found 1 CUDA devices:
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | pp512 | 214.45 ± 0.11 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | tg128 | 45.67 ± 0.03 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | pp512+tg512 | 65.27 ± 0.13 |
+```

### With AMX
-
+```
numactl -N 2 -m 2 llama-bench -m /Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -t 32 --numa numactl -ngl 10 --amx -nopo 1 -b 512 -ub 512 -pg 512,512 --repetitions 3
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
@@ -34,15 +35,16 @@ ggml_cuda_init: found 1 CUDA devices:
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | 1 | pp512 | 284.08 ± 0.26 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | 1 | tg128 | 55.55 ± 0.26 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | 1 | pp512+tg512 | 77.62 ± 0.26 |
+```
+**PP512: +69.62 t/s (+32.47%)**
+**TG128: +9.88 t/s (+21.63%)**
+**PP512+TG512: +12.35 t/s (+18.92%)**

-### PP512 + 69.62 t/s (+32.47%)
-### TG128 + 9.88 t/s (+21.63%)
-### PP512+TG512 + 12.35 t/s (+18.92%)
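
These gains are straight arithmetic on the two llama-bench tables; a quick sanity check (a sketch, with the t/s values copied from the tables above; the results match the quoted figures to within ±0.01 of rounding):

```
# Recompute the llama-bench deltas from the table values above
awk 'BEGIN {
  n[1]="pp512";       base[1]=214.45; amx[1]=284.08;
  n[2]="tg128";       base[2]=45.67;  amx[2]=55.55;
  n[3]="pp512+tg512"; base[3]=65.27;  amx[3]=77.62;
  for (i = 1; i <= 3; i++)
    printf "%-12s +%.2f t/s (+%.2f%%)\n", n[i], amx[i]-base[i], (amx[i]/base[i]-1)*100;
}'
```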

## CLI performance:

### No AMX
-
+```
numactl -N 2 -m 2 /llama-cli -m /Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -ngl 10 -t 32 -b 4096 -c 4096 -n 512 --numa numactl -p "10 facts about birds" -no-cnv

llama_perf_sampler_print: sampling time = 62.16 ms / 517 runs ( 0.12 ms per token, 8316.84 tokens per second)
@@ -51,10 +53,10 @@ llama_perf_context_print: prompt eval time = 58.17 ms / 5 tokens ( 11
llama_perf_context_print: eval time = 12675.00 ms / 511 runs ( 24.80 ms per token, 40.32 tokens per second)
llama_perf_context_print: total time = 13012.05 ms / 516 tokens
llama_perf_context_print: graphs reused = 508
-
+```

### With AMX
-
+```
numactl -N 2 -m 2 /llama-cli -m /Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -ngl 10 --amx -t 32 -b 4096 -c 4096 -n 512 --numa numactl -p "10 facts about birds" -no-cnv

llama_perf_sampler_print: sampling time = 56.16 ms / 517 runs ( 0.11 ms per token, 9205.18 tokens per second)
@@ -63,17 +65,17 @@ llama_perf_context_print: prompt eval time = 51.53 ms / 5 tokens ( 10
llama_perf_context_print: eval time = 10416.81 ms / 511 runs ( 20.39 ms per token, 49.06 tokens per second)
llama_perf_context_print: total time = 10670.73 ms / 516 tokens
llama_perf_context_print: graphs reused = 508
-
-### Decode (generation): +8.74 t/s (+21.68%)
-### Prompt (prefill): +11.07 t/s (+12.88%)
-### Overall throughput: + 8.77 t/s (+21.64%)
+```
+**Decode (generation): +8.74 t/s (+21.68%)**
+**Prompt (prefill): +11.07 t/s (+12.88%)**
+**Overall throughput: +8.77 t/s (+21.64%)**
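
The decode and prefill gains follow the same arithmetic on the perf lines (a sketch; 85.96 and 97.03 t/s are the prefill rates implied by the quoted prompt-eval times of 5 tokens in 58.17 ms and 51.53 ms):

```
# Recompute decode/prefill gains from the llama-cli perf output above
awk 'BEGIN {
  printf "decode:  +%.2f t/s (+%.2f%%)\n", 49.06 - 40.32, (49.06/40.32 - 1)*100;
  # prefill t/s: 5 tokens / 58.17 ms = 85.96; 5 tokens / 51.53 ms = 97.03
  printf "prefill: +%.2f t/s (+%.2f%%)\n", 97.03 - 85.96, (97.03/85.96 - 1)*100;
}'
```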

## Instructions:

Build with all the normal AMX flags (unchanged from upstream), then add the new flag "--amx" to your run commands. "--amx" works with all executables, including llama-bench.
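
For example (an illustrative sketch: the model path and the surrounding flags are placeholders; only "--amx" is new):

```
# Hypothetical invocations; everything except --amx is a placeholder
./build/bin/llama-cli    -m ./models/Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -ngl 10 --amx -p "hello"
./build/bin/llama-server -m ./models/Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -ngl 10 --amx
```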

-## Copy and paste / pull and build (bash):
+### Copy and paste / pull and build (bash):

```
set -euo pipefail
@@ -103,7 +105,7 @@ cmake -S . -B build -G Ninja \

cmake --build build -j"$(nproc)"
```
-# Example Commands
+## Example Commands
```
# Bench (hybrid GPU+CPU AMX, no warmup)
./build/bin/llama-bench \
@@ -124,6 +126,9 @@ cmake --build build -j"$(nproc)"

## Thanks for helping me test!

+
+
+
---

# llama.cpp
