Skip to content

Conversation

SavicStefan
Copy link
Contributor

This PR adds support for cache_a and cache_b to load an additional vec2, and increases BK to 32 for the non-CM mul_mm.comp

Performance Comparison (Without coopmat and coopmat2) NVIDIA GeForce RTX 4060 Ti
Kernel Before(us/run) After(us/run) Δ %
MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5767.79 5176.01 +10.26%
MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5355.88 4105.95 +23.34%
MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5219.90 5432.22 -4.07%
MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2722.40 2732.62 -0.38%
MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2743.99 2753.02 -0.33%
MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2843.99 2850.78 -0.24%
MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2840.88 2841.73 -0.03%
MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2853.15 2857.24 -0.14%
MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4327.78 4334.87 -0.16%
MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4306.28 4289.52 +0.39%
MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4751.79 4781.23 -0.62%
MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4748.76 4785.89 -0.78%
MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5155.43 5164.14 -0.17%
MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4900.78 4914.74 -0.28%
MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4318.07 4371.76 -1.24%
MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4643.73 4815.24 -3.69%
MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5250.76 5015.61 +4.48%
MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4348.33 4388.21 -0.92%
MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4821.34 4570.77 +5.20%
MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5646.37 5633.01 +0.24%
MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4229.37 4240.83 -0.27%
MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4339.20 4358.97 -0.46%
MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4724.33 4779.14 -1.16%
Performance Comparison (Without coopmat and coopmat2) AMD Radeon RX 7800 XT
Kernel Before(us/run) After(us/run) Δ %
MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 8873.61 5853.29 +34.04%
MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 6458.76 5747.87 +11.01%
MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 7124.22 7401.83 -3.90%
MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3289.51 3318.63 -0.89%
MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3499.61 3527.61 -0.80%
MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3424.27 3446.08 -0.64%
MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3707.70 3732.88 -0.68%
MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3747.02 3767.69 -0.55%
MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 6160.74 6393.07 -3.77%
MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5936.61 6047.77 -1.87%
MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 7717.80 7037.06 +8.82%
MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 8219.73 8849.61 -7.66%
MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 7289.05 7447.10 -2.17%
MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 7668.33 6923.90 +9.71%
MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5797.82 5618.78 +3.09%
MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5764.74 5403.05 +6.27%
MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5695.78 5998.68 -5.32%
MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 6074.55 5980.28 +1.55%
MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5571.36 5367.69 +3.66%
MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5704.28 5651.10 +0.93%
MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 6416.39 5307.34 +17.28%
MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5968.62 5845.84 +2.06%
MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 8289.75 7982.64 +3.70%

@SavicStefan SavicStefan requested a review from 0cc4m as a code owner October 17, 2025 15:52
@github-actions github-actions bot added Vulkan Issues specific to the Vulkan backend ggml changes relating to the ggml tensor library for machine learning labels Oct 17, 2025
@0cc4m
Copy link
Collaborator

0cc4m commented Oct 18, 2025

This mostly affects non-quantized tests, some positive, some negative for Nvidia and Intel, no difference on (older) AMD. Any theories on why it behaves so differently based on input type?

Actual (quantized) model performance seems mostly unaffected.

RTX 3090 (without coopmat or integer dot)
Test Before (TFLOPS) After (TFLOPS) Δ%
MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 15.170 14.390 -5.14%
MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 15.400 18.970 +23.18%
MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 13.920 13.500 -3.02%
MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 14.530 13.750 -5.37%
MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 14.070 16.080 +14.29%
MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 13.460 13.650 +1.41%
MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 14.150 15.930 +12.58%
MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 13.660 13.950 +2.12%
MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 13.100 15.460 +18.02%
MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 15.610 12.920 -17.23%
MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 13.570 14.890 +9.73%
MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 12.570 13.540 +7.72%
MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 17.950 18.000 +0.28%
MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 14.160 17.730 +25.21%
MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 13.420 16.060 +19.67%
MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 17.000 18.780 +10.47%
MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 14.020 16.040 +14.41%
MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 12.330 13.460 +9.16%
MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 15.630 18.370 +17.53%
MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 18.100 14.020 -22.54%
MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 12.070 11.530 -4.47%
MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 12.860 14.110 +9.72%
MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 18.540 18.240 -1.62%
model size params backend ngl fa test t/s (before) t/s (after) diff
llama 8B IQ1_S - 1.5625 bpw 1.87 GiB 8.03 B Vulkan 99 0 pp512 1428.13 ± 4.99 1420.59 ± 2.74 -0.5%
llama 8B IQ1_S - 1.5625 bpw 1.87 GiB 8.03 B Vulkan 99 1 pp512 1408.76 ± 3.46 1394.17 ± 3.05 -1.0%
llama 8B IQ2_M - 2.7 bpw 2.74 GiB 8.03 B Vulkan 99 0 pp512 1328.26 ± 4.41 1320.42 ± 2.91 -0.6%
llama 8B IQ2_M - 2.7 bpw 2.74 GiB 8.03 B Vulkan 99 1 pp512 1319.03 ± 0.92 1295.03 ± 0.88 -1.8%
llama 8B IQ4_XS - 4.25 bpw 4.13 GiB 8.03 B Vulkan 99 0 pp512 1249.23 ± 3.43 1224.25 ± 2.11 -2.0%
llama 8B IQ4_XS - 4.25 bpw 4.13 GiB 8.03 B Vulkan 99 1 pp512 1232.04 ± 2.15 1204.84 ± 2.13 -2.2%
llama 8B Q4_K - Small 4.36 GiB 8.03 B Vulkan 99 0 pp512 1245.91 ± 3.63 1227.11 ± 2.81 -1.5%
llama 8B Q4_K - Small 4.36 GiB 8.03 B Vulkan 99 1 pp512 1227.12 ± 4.74 1207.00 ± 3.60 -1.6%
llama 8B Q4_0 4.33 GiB 8.03 B Vulkan 99 0 pp512 1417.78 ± 7.29 1403.64 ± 5.81 -1.0%
llama 8B Q4_0 4.33 GiB 8.03 B Vulkan 99 1 pp512 1394.82 ± 4.26 1376.73 ± 3.74 -1.3%
llama 8B Q4_1 4.77 GiB 8.03 B Vulkan 99 0 pp512 1392.13 ± 6.83 1391.03 ± 7.94 -0.1%
llama 8B Q4_1 4.77 GiB 8.03 B Vulkan 99 1 pp512 1370.81 ± 4.58 1362.14 ± 3.75 -0.6%
llama 8B Q8_0 7.95 GiB 8.03 B Vulkan 99 0 pp512 1389.96 ± 5.16 1363.92 ± 5.72 -1.9%
llama 8B Q8_0 7.95 GiB 8.03 B Vulkan 99 1 pp512 1372.17 ± 6.30 1337.70 ± 6.51 -2.5%
qwen3moe 30B.A3B Q2_K - Medium 10.48 GiB 30.53 B Vulkan 99 0 pp512 1168.93 ± 4.88 1155.29 ± 7.86 -1.2%
qwen3moe 30B.A3B Q2_K - Medium 10.48 GiB 30.53 B Vulkan 99 1 pp512 1153.62 ± 8.97 1131.69 ± 9.52 -1.9%
gpt-oss 20B Q8_0 11.27 GiB 20.91 B Vulkan 99 0 pp512 1372.82 ± 6.87 1360.07 ± 4.77 -0.9%
gpt-oss 20B Q8_0 11.27 GiB 20.91 B Vulkan 99 1 pp512 1350.96 ± 9.15 1336.27 ± 6.46 -1.1%
Radeon Pro VII (without integer dot)
Test Before (TFLOPS) After (TFLOPS) Δ%
MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5.300 5.270 -0.57%
MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.270 4.370 +2.34%
MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5.070 5.160 +1.78%
MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.660 4.670 +0.21%
MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.710 4.720 +0.21%
MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.620 4.620 +0.00%
MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.700 4.720 +0.43%
MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.690 4.720 +0.64%
MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.570 4.570 +0.00%
MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.550 4.550 +0.00%
MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.740 4.770 +0.63%
MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.340 4.340 +0.00%
MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.760 4.770 +0.21%
MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.430 4.460 +0.68%
MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.100 4.110 +0.24%
MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.770 4.800 +0.63%
MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.800 4.820 +0.42%
MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.140 4.150 +0.24%
MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.520 4.540 +0.44%
MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.580 4.620 +0.87%
MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.920 3.960 +1.02%
MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.200 4.220 +0.48%
MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.660 4.690 +0.64%
model size params backend ngl fa test t/s (before) t/s (after) diff
llama 8B IQ1_S - 1.5625 bpw 1.87 GiB 8.03 B Vulkan 99 0 pp512 333.56 ± 1.54 334.33 ± 0.57 +0.2%
llama 8B IQ1_S - 1.5625 bpw 1.87 GiB 8.03 B Vulkan 99 1 pp512 318.91 ± 0.29 317.94 ± 0.71 -0.3%
llama 8B IQ2_M - 2.7 bpw 2.74 GiB 8.03 B Vulkan 99 0 pp512 325.10 ± 0.31 324.61 ± 0.72 -0.2%
llama 8B IQ2_M - 2.7 bpw 2.74 GiB 8.03 B Vulkan 99 1 pp512 311.45 ± 0.37 309.19 ± 1.08 -0.7%
llama 8B IQ4_XS - 4.25 bpw 4.13 GiB 8.03 B Vulkan 99 0 pp512 308.37 ± 0.35 306.52 ± 1.13 -0.6%
llama 8B IQ4_XS - 4.25 bpw 4.13 GiB 8.03 B Vulkan 99 1 pp512 295.37 ± 0.38 292.86 ± 0.32 -0.8%
llama 8B Q4_K - Small 4.36 GiB 8.03 B Vulkan 99 0 pp512 296.69 ± 0.64 292.75 ± 0.97 -1.3%
llama 8B Q4_K - Small 4.36 GiB 8.03 B Vulkan 99 1 pp512 284.44 ± 0.28 281.94 ± 0.69 -0.9%
llama 8B Q4_0 4.33 GiB 8.03 B Vulkan 99 0 pp512 343.14 ± 0.33 338.06 ± 0.96 -1.5%
llama 8B Q4_0 4.33 GiB 8.03 B Vulkan 99 1 pp512 327.28 ± 0.25 324.34 ± 0.76 -0.9%
llama 8B Q4_1 4.77 GiB 8.03 B Vulkan 99 0 pp512 344.91 ± 0.86 340.55 ± 0.82 -1.3%
llama 8B Q4_1 4.77 GiB 8.03 B Vulkan 99 1 pp512 327.73 ± 1.39 326.89 ± 0.55 -0.3%
llama 8B Q8_0 7.95 GiB 8.03 B Vulkan 99 0 pp512 334.02 ± 0.62 332.73 ± 0.63 -0.4%
llama 8B Q8_0 7.95 GiB 8.03 B Vulkan 99 1 pp512 319.21 ± 0.35 317.35 ± 0.27 -0.6%
qwen3moe 30B.A3B Q2_K - Medium 10.48 GiB 30.53 B Vulkan 99 0 pp512 383.00 ± 4.64 380.22 ± 5.54 -0.7%
qwen3moe 30B.A3B Q2_K - Medium 10.48 GiB 30.53 B Vulkan 99 1 pp512 359.23 ± 2.87 353.52 ± 2.94 -1.6%
gpt-oss 20B Q8_0 11.27 GiB 20.91 B Vulkan 99 0 pp512 538.00 ± 3.01 529.22 ± 5.06 -1.6%
gpt-oss 20B Q8_0 11.27 GiB 20.91 B Vulkan 99 1 pp512 520.04 ± 2.56 514.61 ± 1.92 -1.0%
Intel A770 (without integer dot)
Test Before (TFLOPS) After (TFLOPS) Δ%
MUL_MAT(type_a=bf16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 5.110 3.520 -31.12%
MUL_MAT(type_a=f16,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.260 3.560 +9.20%
MUL_MAT(type_a=f32,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 4.770 4.000 -16.14%
MUL_MAT(type_a=iq1_m,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.910 3.840 -1.79%
MUL_MAT(type_a=iq1_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.940 3.850 -2.28%
MUL_MAT(type_a=iq2_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2.940 2.900 -1.36%
MUL_MAT(type_a=iq2_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.930 3.920 -0.25%
MUL_MAT(type_a=iq2_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.510 3.480 -0.85%
MUL_MAT(type_a=iq3_s,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.380 3.360 -0.59%
MUL_MAT(type_a=iq3_xxs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.320 3.270 -1.51%
MUL_MAT(type_a=iq4_nl,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.150 3.150 +0.00%
MUL_MAT(type_a=iq4_xs,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2.900 2.890 -0.34%
MUL_MAT(type_a=mxfp4,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.100 3.110 +0.32%
MUL_MAT(type_a=q2_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2.870 3.000 +4.53%
MUL_MAT(type_a=q3_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2.850 2.850 +0.00%
MUL_MAT(type_a=q4_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.580 3.440 -3.91%
MUL_MAT(type_a=q4_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.650 3.570 -2.19%
MUL_MAT(type_a=q4_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2.840 2.830 -0.35%
MUL_MAT(type_a=q5_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.030 2.820 -6.93%
MUL_MAT(type_a=q5_1,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.210 3.140 -2.18%
MUL_MAT(type_a=q5_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2.760 2.760 +0.00%
MUL_MAT(type_a=q6_K,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 2.840 2.660 -6.34%
MUL_MAT(type_a=q8_0,type_b=f32,m=4096,n=512,k=14336,bs=[1,1],nr=[1,1],per=[0,1,2,3],v=0,o=1) 3.320 3.250 -2.11%
model size params backend ngl fa test t/s (before) t/s (after) diff
llama 8B IQ1_S - 1.5625 bpw 1.87 GiB 8.03 B Vulkan 99 0 pp512 302.41 ± 0.36 293.98 ± 0.48 -2.8%
llama 8B IQ1_S - 1.5625 bpw 1.87 GiB 8.03 B Vulkan 99 1 pp512 96.27 ± 0.10 95.83 ± 0.05 -0.5%
llama 8B IQ2_M - 2.7 bpw 2.74 GiB 8.03 B Vulkan 99 0 pp512 229.93 ± 0.20 227.57 ± 0.27 -1.0%
llama 8B IQ2_M - 2.7 bpw 2.74 GiB 8.03 B Vulkan 99 1 pp512 111.17 ± 0.08 110.85 ± 0.04 -0.3%
llama 8B IQ4_XS - 4.25 bpw 4.13 GiB 8.03 B Vulkan 99 0 pp512 236.70 ± 0.14 234.64 ± 0.16 -0.9%
llama 8B IQ4_XS - 4.25 bpw 4.13 GiB 8.03 B Vulkan 99 1 pp512 104.64 ± 0.06 104.79 ± 0.06 +0.1%
llama 8B Q4_K - Small 4.36 GiB 8.03 B Vulkan 99 0 pp512 225.13 ± 0.15 225.52 ± 0.14 +0.2%
llama 8B Q4_K - Small 4.36 GiB 8.03 B Vulkan 99 1 pp512 102.14 ± 0.01 102.36 ± 0.04 +0.2%
llama 8B Q4_0 4.33 GiB 8.03 B Vulkan 99 0 pp512 287.46 ± 0.59 285.18 ± 0.48 -0.8%
llama 8B Q4_0 4.33 GiB 8.03 B Vulkan 99 1 pp512 113.45 ± 0.11 113.47 ± 0.11 +0.0%
llama 8B Q4_1 4.77 GiB 8.03 B Vulkan 99 0 pp512 287.89 ± 0.57 282.68 ± 0.25 -1.8%
llama 8B Q4_1 4.77 GiB 8.03 B Vulkan 99 1 pp512 113.53 ± 0.09 112.86 ± 0.08 -0.6%
llama 8B Q8_0 7.95 GiB 8.03 B Vulkan 99 0 pp512 266.09 ± 0.51 262.90 ± 0.52 -1.2%
llama 8B Q8_0 7.95 GiB 8.03 B Vulkan 99 1 pp512 112.69 ± 0.06 112.58 ± 0.02 -0.1%
qwen3moe 30B.A3B Q2_K - Medium 10.48 GiB 30.53 B Vulkan 99 0 pp512 300.48 ± 1.18 299.92 ± 0.53 -0.2%
qwen3moe 30B.A3B Q2_K - Medium 10.48 GiB 30.53 B Vulkan 99 1 pp512 119.97 ± 0.36 120.08 ± 0.36 +0.1%
gpt-oss 20B Q8_0 11.27 GiB 20.91 B Vulkan 99 0 pp512 425.55 ± 2.05 426.73 ± 1.24 +0.3%
gpt-oss 20B Q8_0 11.27 GiB 20.91 B Vulkan 99 1 pp512 396.40 ± 1.59 398.89 ± 2.10 +0.6%

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

ggml changes relating to the ggml tensor library for machine learning Vulkan Issues specific to the Vulkan backend

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants