
## llama-bench

## No AMX

```
numactl -N 2 -m 2 llama-bench -m /Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -t 32 --numa numactl -ngl 10 -nopo 1 -b 512 -ub 512 -pg 512,512 --repetitions 3
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 5090, compute capability 12.0, VMM: yes
```

| model | size | params | backend | ngl | threads | n_batch | nopo | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | ------: | ---: | --------------: | -------------------: |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | pp512 | 214.45 ± 0.11 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | tg128 | 45.67 ± 0.03 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | pp512+tg512 | 65.27 ± 0.13 |

## With AMX

```
numactl -N 2 -m 2 llama-bench -m /Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -t 32 --numa numactl -ngl 10 --amx -nopo 1 -b 512 -ub 512 -pg 512,512 --repetitions 3
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 5090, compute capability 12.0, VMM: yes
```

| model | size | params | backend | ngl | threads | n_batch | amx | nopo | test | t/s |
| ------------------------------ | ---------: | ---------: | ---------- | --: | ------: | ------: | --------: | ---: | --------------: | -------------------: |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | 1 | pp512 | 284.08 ± 0.26 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | 1 | tg128 | 55.55 ± 0.26 |
| qwen3moe 30B.A3B Q4_0 | 16.18 GiB | 30.53 B | CUDA | 10 | 32 | 512 | 1 | 1 | pp512+tg512 | 77.62 ± 0.26 |

## PP512: +69.62 t/s (+32.47%)
## TG128: +9.88 t/s (+21.63%)
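
The deltas above are simple differences between the two tables; a quick sketch to recompute them from the table means (a ±0.01 drift against the summary is possible, since the table values are already rounded):

```bash
# Recompute the pp512 / tg128 deltas from the two llama-bench tables above.
awk 'BEGIN {
  pp_base = 214.45; pp_amx = 284.08;   # pp512 t/s, no AMX vs. AMX
  tg_base = 45.67;  tg_amx = 55.55;    # tg128 t/s, no AMX vs. AMX
  printf "pp512: +%.2f t/s (+%.2f%%)\n", pp_amx - pp_base, (pp_amx - pp_base) / pp_base * 100
  printf "tg128: +%.2f t/s (+%.2f%%)\n", tg_amx - tg_base, (tg_amx - tg_base) / tg_base * 100
}'
```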

## llama-cli

## No AMX

```
numactl -N 2 -m 2 /llama-cli -m /Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -ngl 10 -t 32 -b 4096 -c 4096 -n 512 --numa numactl -p "10 facts about birds" -no-cnv

llama_perf_sampler_print: sampling time = 62.16 ms / 517 runs ( 0.12 ms per token, 8316.84 tokens per second)
llama_perf_context_print: prompt eval time = 58.17 ms / 5 tokens ( 11.63 ms per token, 85.96 tokens per second)
llama_perf_context_print: eval time = 12675.00 ms / 511 runs ( 24.80 ms per token, 40.32 tokens per second)
llama_perf_context_print: total time = 13012.05 ms / 516 tokens
llama_perf_context_print: graphs reused = 508
```

## With AMX

```
numactl -N 2 -m 2 /llama-cli -m /Qwen3-30B-A3B-Thinking-2507-Q4_0.gguf -ngl 10 --amx -t 32 -b 4096 -c 4096 -n 512 --numa numactl -p "10 facts about birds" -no-cnv

llama_perf_sampler_print: sampling time = 56.16 ms / 517 runs ( 0.11 ms per token, 9205.18 tokens per second)
llama_perf_context_print: prompt eval time = 51.53 ms / 5 tokens ( 10.31 ms per token, 97.03 tokens per second)
llama_perf_context_print: eval time = 10416.81 ms / 511 runs ( 20.39 ms per token, 49.06 tokens per second)
llama_perf_context_print: total time = 10670.73 ms / 516 tokens
llama_perf_context_print: graphs reused = 508
```

## Decode (generation): +8.74 t/s (+21.68%)
## Prompt (prefill): +11.07 t/s (+12.88%)
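
These figures follow directly from the perf lines above (decode = eval tokens per second; prefill = 5 prompt tokens over the prompt eval time); a small sketch to reproduce them:

```bash
# Recompute the decode/prefill deltas from the llama_perf lines above.
# Prefill t/s as printed by llama.cpp: 5 tokens / 58.17 ms -> 85.96 t/s, 5 tokens / 51.53 ms -> 97.03 t/s.
awk 'BEGIN {
  dec_base = 40.32; dec_amx = 49.06;   # eval (decode) tokens/s, no AMX vs. AMX
  pre_base = 85.96; pre_amx = 97.03;   # prompt eval (prefill) tokens/s, no AMX vs. AMX
  printf "decode:  +%.2f t/s (+%.2f%%)\n", dec_amx - dec_base, (dec_amx - dec_base) / dec_base * 100
  printf "prefill: +%.2f t/s (+%.2f%%)\n", pre_amx - pre_base, (pre_amx - pre_base) / pre_base * 100
}'
```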

## Build

Build with all the normal AMX flags (unchanged from upstream); then use the new `--amx` runtime flag shown in the commands above.

```bash
set -euo pipefail

# 1) System packages (compiler toolchain, cmake, Ninja optional, perf tools, Python venv)
sudo apt-get update
sudo apt-get install -y \
  build-essential cmake ninja-build git pkg-config \
  python3-venv python3-pip python3-dev \
  linux-tools-common linux-tools-$(uname -r)

# 2) Python virtual environment
mkdir -p ~/venvs
python3 -m venv ~/venvs/amxllama
source ~/venvs/amxllama/bin/activate
python -m pip install -U pip

# 3) Clone this fork
mkdir -p ~/src
git clone https://github.com/Gadflyii/llama.cpp.git ~/src/amx-llama.cpp
cd ~/src/amx-llama.cpp

# 4) Configure CMake (AMX on, CUDA on)
#    - GGML_NATIVE=ON             : enable host-specific CPU optimizations
#    - GGML_CUDA=ON               : enable CUDA backend (requires CUDA/cuBLAS installed)
#    - GGML_AMX_TILE/INT8/BF16=ON : enable AMX paths
cmake -S . -B build -G Ninja \
  -DCMAKE_BUILD_TYPE=Release \
  -DGGML_NATIVE=ON \
  -DGGML_CUDA=ON \
  -DGGML_AMX_TILE=ON \
  -DGGML_AMX_INT8=ON \
  -DGGML_AMX_BF16=ON

# 5) Build
cmake --build build -j"$(nproc)"
```
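
Before benchmarking on a new machine, it can be worth confirming that the CPU actually exposes AMX and checking the NUMA layout you will pin to. Neither step is part of the build itself; on a typical Linux host, for example:

```bash
# List the AMX-related CPU flags; expect amx_tile, amx_int8 and amx_bf16 on AMX-capable parts.
grep -o 'amx[a-z0-9_]*' /proc/cpuinfo | sort -u

# Show NUMA nodes with their CPUs and memory, to choose values for numactl -N/-m and -t.
numactl --hardware
```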

## Example Commands

Recommended: run with numactl and adjust the thread count to match your NUMA node.

```bash
# Bench (hybrid GPU+CPU AMX, no warmup)
./build/bin/llama-bench \