@@ -7879,8 +7879,8 @@ kernel void kernel_mul_mm(
78797879 ushort tiitg[[thread_index_in_threadgroup]],
78807880 ushort sgitg[[simdgroup_index_in_threadgroup]]) {
78817881
7882- threadgroup T * sa = (threadgroup T *)(shmem);
7883- threadgroup float * sb = (threadgroup float *)(shmem + 4096 );
7882+ threadgroup T * sa = (threadgroup T *)(shmem);
7883+ threadgroup half * sb = (threadgroup half *)(shmem + 4096 );
78847884
78857885 const int r0 = tgpig.y ;
78867886 const int r1 = tgpig.x ;
@@ -7895,7 +7895,7 @@ kernel void kernel_mul_mm(
78957895 const short thread_col = ((short )tiitg/THREAD_PER_COL) < n_cols ? ((short )tiitg/THREAD_PER_COL) : n_cols - 1 ;
78967896
78977897 simdgroup_T8x8 ma[4 ];
7898- simdgroup_float8x8 mb[2 ];
7898+ simdgroup_half8x8 mb[2 ];
78997899 simdgroup_float8x8 mc[8 ];
79007900
79017901 for (short i = 0 ; i < 8 ; i++){
@@ -7933,7 +7933,7 @@ kernel void kernel_mul_mm(
79337933 + (tiitg/THREAD_PER_ROW)%8 + (i&7 )*8 ) = temp_a[i/4 ][i%4 ];
79347934 }
79357935
7936- *(threadgroup float2x4 *)(sb + 32 *8 *(tiitg%THREAD_PER_COL) + 8 *(tiitg/THREAD_PER_COL)) = (float2x4 )(*((device U2x4 *) y));
7936+ *(threadgroup half2x4 *)(sb + 32 *8 *(tiitg%THREAD_PER_COL) + 8 *(tiitg/THREAD_PER_COL)) = (half2x4 )(*((device U2x4 *) y));
79377937
79387938 il = (il + 2 < nl) ? il + 2 : il % 2 ;
79397939 x = (il < 2 ) ? x + (2 + nl - 1 )/nl : x;
@@ -7942,8 +7942,8 @@ kernel void kernel_mul_mm(
79427942 threadgroup_barrier (mem_flags::mem_threadgroup);
79437943
79447944 // load matrices from threadgroup memory and conduct outer products
7945- threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2 ));
7946- threadgroup const float * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2 ));
7945+ threadgroup const T * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2 ));
7946+ threadgroup const half * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2 ));
79477947
79487948 #pragma unroll(4)
79497949 for (short ik = 0 ; ik < BLOCK_SIZE_K/8 ; ik++) {
@@ -8076,7 +8076,7 @@ template [[host_name("kernel_mul_mm_id_map0_ne20_8" )]] kernel kernel_mul_mm_id_
80768076template [[host_name(" kernel_mul_mm_id_map0_ne20_10" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<10 >;
80778077template [[host_name(" kernel_mul_mm_id_map0_ne20_16" )]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<16 >;
80788078
8079- template <typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short , thread T4x4 &)>
8079+ template <typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short , thread T4x4 &), typename U, typename U2x4 >
80808080kernel void kernel_mul_mm_id (
80818081 constant ggml_metal_kargs_mul_mm_id & args,
80828082 device const char * src0,
@@ -8136,7 +8136,7 @@ kernel void kernel_mul_mm_id(
81368136 device const block_q * x = (device const block_q *)(src0
81378137 + args.nb01 *(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;
81388138
8139- device const float * y = (device const float *)(src1
8139+ device const U * y = (device const U *)(src1
81408140 + args.nb13 *i13
81418141 + args.nb12 *i12
81428142 + args.nb11 *i11
@@ -8156,7 +8156,7 @@ kernel void kernel_mul_mm_id(
81568156 + (tiitg/THREAD_PER_ROW)%8 + (i&7 )*8 ) = temp_a[i/4 ][i%4 ];
81578157 }
81588158
8159- *(threadgroup half2x4 *)(sb + 32 *8 *(tiitg%THREAD_PER_COL) + 8 *(tiitg/THREAD_PER_COL)) = (half2x4)(*((device float2x4 *) y));
8159+ *(threadgroup half2x4 *)(sb + 32 *8 *(tiitg%THREAD_PER_COL) + 8 *(tiitg/THREAD_PER_COL)) = (half2x4)(*((device U2x4 *) y));
81608160
81618161 il = (il + 2 < nl) ? il + 2 : il % 2 ;
81628162 x = (il < 2 ) ? x + (2 + nl - 1 )/nl : x;
@@ -8357,34 +8357,59 @@ template [[host_name("kernel_mul_mm_iq4_xs_f16")]] kernel mul_mm_t kernel_mul_m
83578357// indirect matrix-matrix multiplication
83588358//
83598359
8360- typedef decltype (kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1 , dequantize_f32>) mul_mm_id;
8360+ typedef decltype (kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1 , dequantize_f32, float , float2x4 >) mul_mm_id;
83618361
8362- template [[host_name(" kernel_mul_mm_id_f32_f16 " )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1 , dequantize_f32>;
8363- template [[host_name(" kernel_mul_mm_id_f16_f16 " )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half4x4, 1 , dequantize_f16>;
8362+ template [[host_name(" kernel_mul_mm_id_f32_f32 " )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1 , dequantize_f32, float , float2x4 >;
8363+ template [[host_name(" kernel_mul_mm_id_f16_f32 " )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half4x4, 1 , dequantize_f16, float , float2x4 >;
83648364#if defined(GGML_METAL_HAS_BF16)
8365- template [[host_name(" kernel_mul_mm_id_bf16_f16 " )]] kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1 , dequantize_bf16>;
8365+ template [[host_name(" kernel_mul_mm_id_bf16_f32 " )]] kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1 , dequantize_bf16, float , float2x4 >;
83668366#endif
8367- template [[host_name(" kernel_mul_mm_id_q4_0_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_0, 2 , dequantize_q4_0>;
8368- template [[host_name(" kernel_mul_mm_id_q4_1_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_1, 2 , dequantize_q4_1>;
8369- template [[host_name(" kernel_mul_mm_id_q5_0_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_0, 2 , dequantize_q5_0>;
8370- template [[host_name(" kernel_mul_mm_id_q5_1_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_1, 2 , dequantize_q5_1>;
8371- template [[host_name(" kernel_mul_mm_id_q8_0_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q8_0, 2 , dequantize_q8_0>;
8372- template [[host_name(" kernel_mul_mm_id_mxfp4_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_mxfp4, 2 , dequantize_mxfp4>;
8373- template [[host_name(" kernel_mul_mm_id_q2_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K>;
8374- template [[host_name(" kernel_mul_mm_id_q3_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K>;
8375- template [[host_name(" kernel_mul_mm_id_q4_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K>;
8376- template [[host_name(" kernel_mul_mm_id_q5_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K>;
8377- template [[host_name(" kernel_mul_mm_id_q6_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K>;
8378- template [[host_name(" kernel_mul_mm_id_iq2_xxs_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
8379- template [[host_name(" kernel_mul_mm_id_iq2_xs_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs>;
8380- template [[host_name(" kernel_mul_mm_id_iq3_xxs_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
8381- template [[host_name(" kernel_mul_mm_id_iq3_s_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s>;
8382- template [[host_name(" kernel_mul_mm_id_iq2_s_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s>;
8383- template [[host_name(" kernel_mul_mm_id_iq1_s_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s>;
8384- template [[host_name(" kernel_mul_mm_id_iq1_m_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m>;
8385- template [[host_name(" kernel_mul_mm_id_iq4_nl_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2 , dequantize_iq4_nl>;
8386- template [[host_name(" kernel_mul_mm_id_iq4_xs_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs>;
8387-
8367+ template [[host_name(" kernel_mul_mm_id_q4_0_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_0, 2 , dequantize_q4_0, float , float2x4>;
8368+ template [[host_name(" kernel_mul_mm_id_q4_1_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_1, 2 , dequantize_q4_1, float , float2x4>;
8369+ template [[host_name(" kernel_mul_mm_id_q5_0_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_0, 2 , dequantize_q5_0, float , float2x4>;
8370+ template [[host_name(" kernel_mul_mm_id_q5_1_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_1, 2 , dequantize_q5_1, float , float2x4>;
8371+ template [[host_name(" kernel_mul_mm_id_q8_0_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q8_0, 2 , dequantize_q8_0, float , float2x4>;
8372+ template [[host_name(" kernel_mul_mm_id_mxfp4_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_mxfp4, 2 , dequantize_mxfp4, float , float2x4>;
8373+ template [[host_name(" kernel_mul_mm_id_q2_K_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K, float , float2x4>;
8374+ template [[host_name(" kernel_mul_mm_id_q3_K_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K, float , float2x4>;
8375+ template [[host_name(" kernel_mul_mm_id_q4_K_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K, float , float2x4>;
8376+ template [[host_name(" kernel_mul_mm_id_q5_K_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K, float , float2x4>;
8377+ template [[host_name(" kernel_mul_mm_id_q6_K_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K, float , float2x4>;
8378+ template [[host_name(" kernel_mul_mm_id_iq2_xxs_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs, float , float2x4>;
8379+ template [[host_name(" kernel_mul_mm_id_iq2_xs_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs, float , float2x4>;
8380+ template [[host_name(" kernel_mul_mm_id_iq3_xxs_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs, float , float2x4>;
8381+ template [[host_name(" kernel_mul_mm_id_iq3_s_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s, float , float2x4>;
8382+ template [[host_name(" kernel_mul_mm_id_iq2_s_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s, float , float2x4>;
8383+ template [[host_name(" kernel_mul_mm_id_iq1_s_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s, float , float2x4>;
8384+ template [[host_name(" kernel_mul_mm_id_iq1_m_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m, float , float2x4>;
8385+ template [[host_name(" kernel_mul_mm_id_iq4_nl_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2 , dequantize_iq4_nl, float , float2x4>;
8386+ template [[host_name(" kernel_mul_mm_id_iq4_xs_f32" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs, float , float2x4>;
8387+
8388+ template [[host_name(" kernel_mul_mm_id_f32_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1 , dequantize_f32, half, half2x4>;
8389+ template [[host_name(" kernel_mul_mm_id_f16_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, half4x4, 1 , dequantize_f16, half, half2x4>;
8390+ #if defined(GGML_METAL_HAS_BF16)
8391+ template [[host_name(" kernel_mul_mm_id_bf16_f16" )]] kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4, 1 , dequantize_bf16, half, half2x4>;
8392+ #endif
8393+ template [[host_name(" kernel_mul_mm_id_q4_0_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_0, 2 , dequantize_q4_0, half, half2x4>;
8394+ template [[host_name(" kernel_mul_mm_id_q4_1_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_1, 2 , dequantize_q4_1, half, half2x4>;
8395+ template [[host_name(" kernel_mul_mm_id_q5_0_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_0, 2 , dequantize_q5_0, half, half2x4>;
8396+ template [[host_name(" kernel_mul_mm_id_q5_1_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_1, 2 , dequantize_q5_1, half, half2x4>;
8397+ template [[host_name(" kernel_mul_mm_id_q8_0_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q8_0, 2 , dequantize_q8_0, half, half2x4>;
8398+ template [[host_name(" kernel_mul_mm_id_mxfp4_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_mxfp4, 2 , dequantize_mxfp4, half, half2x4>;
8399+ template [[host_name(" kernel_mul_mm_id_q2_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q2_K, QK_NL, dequantize_q2_K, half, half2x4>;
8400+ template [[host_name(" kernel_mul_mm_id_q3_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q3_K, QK_NL, dequantize_q3_K, half, half2x4>;
8401+ template [[host_name(" kernel_mul_mm_id_q4_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q4_K, QK_NL, dequantize_q4_K, half, half2x4>;
8402+ template [[host_name(" kernel_mul_mm_id_q5_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q5_K, QK_NL, dequantize_q5_K, half, half2x4>;
8403+ template [[host_name(" kernel_mul_mm_id_q6_K_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_q6_K, QK_NL, dequantize_q6_K, half, half2x4>;
8404+ template [[host_name(" kernel_mul_mm_id_iq2_xxs_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xxs, QK_NL, dequantize_iq2_xxs, half, half2x4>;
8405+ template [[host_name(" kernel_mul_mm_id_iq2_xs_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_xs, QK_NL, dequantize_iq2_xs, half, half2x4>;
8406+ template [[host_name(" kernel_mul_mm_id_iq3_xxs_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_xxs, QK_NL, dequantize_iq3_xxs, half, half2x4>;
8407+ template [[host_name(" kernel_mul_mm_id_iq3_s_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq3_s, QK_NL, dequantize_iq3_s, half, half2x4>;
8408+ template [[host_name(" kernel_mul_mm_id_iq2_s_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq2_s, QK_NL, dequantize_iq2_s, half, half2x4>;
8409+ template [[host_name(" kernel_mul_mm_id_iq1_s_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_s, QK_NL, dequantize_iq1_s, half, half2x4>;
8410+ template [[host_name(" kernel_mul_mm_id_iq1_m_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq1_m, QK_NL, dequantize_iq1_m, half, half2x4>;
8411+ template [[host_name(" kernel_mul_mm_id_iq4_nl_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_nl, 2 , dequantize_iq4_nl, half, half2x4>;
8412+ template [[host_name(" kernel_mul_mm_id_iq4_xs_f16" )]] kernel mul_mm_id kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, block_iq4_xs, QK_NL, dequantize_iq4_xs, half, half2x4>;
83888413
83898414//
83908415// matrix-vector multiplication
0 commit comments