Commit 434fc45

metal : add comments
ggml-ci
1 parent 5590160

2 files changed: +24 -8 lines

ggml/src/ggml-metal/ggml-metal.m (16 additions, 6 deletions)

@@ -2008,8 +2008,10 @@ static void ggml_metal_encode_node(
 
             // find the break-even point where the matrix-matrix kernel becomes more efficient compared
             // to the matrix-vector kernel
-            int ne11_mm_min = 4;
+            const int ne11_mm_min = 4;
 
+            // first try to use small-batch mat-mv kernels
+            // these should be efficient for BS [2, ~8]
             if (src1t == GGML_TYPE_F32 && (ne00%256 == 0) &&
                 (
                   (
@@ -2033,12 +2035,20 @@ static void ggml_metal_encode_node(
                   )
                 ) {
                 // TODO: determine the optimal parameters based on grid utilization
-                const int nsg = 2; // TODO: or 4?
-                const int nxpsg = ne11 < 3 ? 16 : 8;
-                const int nypsg = 32/nxpsg;
-                const int r0ptg = nypsg*nsg;
-                int r1ptg = 4;
+                // I still don't know why we should not always use the maximum available threads:
+                //
+                //   nsg = pipeline.maxTotalThreadsPerThreadgroup / 32
+                //
+                // my current hypothesis is that the work grid is not evenly divisible for different nsg
+                // values and there can be some tail effects when nsg is high. need to confirm this
+                //
+                const int nsg   = 2;                 // num simdgroups per threadgroup
+                const int nxpsg = ne11 < 3 ? 16 : 8; // num threads along row per simdgroup
+                const int nypsg = 32/nxpsg;          // num threads along col per simdgroup (i.e. a simdgroup processes that many src0 rows at a time)
+                const int r0ptg = nypsg*nsg;         // num src0 rows per threadgroup
+                int       r1ptg = 4;                 // num src1 rows per threadgroup
 
+                // note: not sure how optimal these are across all different hardware. there might be something cleverer
                 switch (ne11) {
                     case 2:
                         r1ptg = 2; break;
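
To make the new parameters concrete: a minimal standalone C++ sketch of the threadgroup geometry they imply. The ceil-division dispatch and the example shapes are assumptions for illustration, not the actual encoder code.

#include <cstdio>

int main() {
    const int ne01 = 4096; // src0 rows (example shape, assumed)
    const int ne11 = 4;    // src1 rows, i.e. the small batch (assumed)

    const int nsg   = 2;                 // num simdgroups per threadgroup
    const int nxpsg = ne11 < 3 ? 16 : 8; // num threads along row per simdgroup
    const int nypsg = 32/nxpsg;          // num src0 rows per simdgroup
    const int r0ptg = nypsg*nsg;         // num src0 rows per threadgroup
    const int r1ptg = 4;                 // num src1 rows per threadgroup

    // assumed dispatch: one threadgroup per r0ptg x r1ptg tile of the output
    const int tg_x = (ne01 + r0ptg - 1)/r0ptg;
    const int tg_y = (ne11 + r1ptg - 1)/r1ptg;
    printf("%d x %d threadgroups of %d threads\n", tg_x, tg_y, 32*nsg);

    // the tail effect hypothesized in the comment: the rows left idle in the
    // last threadgroup along x grow with r0ptg, and hence with nsg
    printf("idle tail rows: %d\n", tg_x*r0ptg - ne01);
    return 0;
}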

ggml/src/ggml-metal/ggml-metal.metal (8 additions, 2 deletions)

@@ -1870,6 +1870,8 @@ kernel void kernel_mul_mv_q8_0_f32(
     kernel_mul_mv_q8_0_f32_impl<constant ggml_metal_kargs_mul_mv &>(args, src0, src1, dst, nullptr, tgpig, tiisg, sgitg);
 }
 
+// mat-vec kernel processing in chunks of float4
+// chpb - chunks per quantization block
 template<short nxpsg, short r1ptg, typename q_t, short chpb, void (*deq_t4)(device const q_t *, short, thread float4 &) >
 void kernel_mul_mv_ext_q4_f32_impl(
         constant ggml_metal_kargs_mul_mv_ext & args,
@@ -1879,7 +1881,7 @@ void kernel_mul_mv_ext_q4_f32_impl(
         uint3  tgpig[[threadgroup_position_in_grid]],
         ushort tiisg[[thread_index_in_simdgroup]],
         ushort sgitg[[simdgroup_index_in_threadgroup]]) {
-    const short chpt = 4;
+    const short chpt = 4; // chunks per thread
 
   //const short nxpsg = (32);
     const short nypsg = (32/nxpsg);
@@ -1907,7 +1909,7 @@ void kernel_mul_mv_ext_q4_f32_impl(
 
     float sumf[r1ptg] = { [ 0 ... r1ptg - 1 ] = 0.0f };
 
-    short cch = tx%chpb;
+    short cch = tx%chpb; // current chunk index
 
     for (int ich = tx; 4*ich < args.ne00; ich += chpt*nxpsg) {
         float4 lx[chpt];
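
As an aside, the chunk bookkeeping above is easy to check on the host. A hypothetical C++ walk-through, assuming Q8_0's 32-element blocks (so chpb = 32/4 = 8 float4 chunks per block):

#include <cstdio>

int main() {
    const int epb  = 32;    // elements per quantization block (Q8_0, assumed)
    const int chpb = epb/4; // chunks per quantization block = 8

    // a global float4 chunk index ich splits into a block and an in-block
    // chunk; cch = tx%chpb in the kernel is the in-block chunk a thread
    // starts at
    for (int ich = 0; ich < 24; ich += 5) {
        const int ib  = ich/chpb; // quantization block index
        const int cch = ich%chpb; // current chunk index within the block
        printf("ich=%2d -> block %d, chunk %d (elements %d..%d)\n",
               ich, ib, cch, 4*ich, 4*ich + 3);
    }
    return 0;
}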
@@ -1938,6 +1940,7 @@ void kernel_mul_mv_ext_q4_f32_impl(
         }
     }
 
+    // reduce only the threads in each row
     for (short ir1 = 0; ir1 < r1ptg; ++ir1) {
         if (nxpsg >= 32) {
             sumf[ir1] += simd_shuffle_down(sumf[ir1], 16);
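
The new comment marks a shuffle-down ladder that starts at offset nxpsg/2, so each row of nxpsg threads reduces independently inside the 32-lane simdgroup. A plain C++ emulation of that pattern (not Metal code; the snapshot array stands in for the lockstep shuffle):

#include <cstdio>

int main() {
    const int nxpsg = 8; // num threads along row per simdgroup (example)

    float v[32]; // one simdgroup's partial sums; lane i holds i for checking
    for (int i = 0; i < 32; ++i) v[i] = (float) i;

    // the kernel's `if (nxpsg >= 32) ... shuffle 16; if (nxpsg >= 16) ...`
    // chain applies offsets nxpsg/2, nxpsg/4, ..., 1
    for (int off = nxpsg/2; off >= 1; off /= 2) {
        float prev[32]; // shuffle reads everyone's pre-step value
        for (int i = 0; i < 32; ++i) prev[i] = v[i];
        for (int lane = 0; lane + off < 32; ++lane) {
            v[lane] = prev[lane] + prev[lane + off]; // simd_shuffle_down(v, off)
        }
    }

    // lane 0 of each row of nxpsg threads now holds that row's total
    for (int row = 0; row < 32/nxpsg; ++row) {
        printf("row %d sum = %g\n", row, v[row*nxpsg]);
    }
    return 0;
}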
@@ -1969,6 +1972,7 @@ void kernel_mul_mv_ext_q4_f32_impl(
         }
     }
 
+// mat-vec kernel processing in chunks of float4x4
 template<short nxpsg, short r1ptg, typename q_t, short chpb, void (*deq_t4x4)(device const q_t *, short, thread float4x4 &) >
 void kernel_mul_mv_ext_q4x4_f32_impl(
         constant ggml_metal_kargs_mul_mv_ext & args,
@@ -2072,6 +2076,8 @@ void kernel_mul_mv_ext_q4x4_f32_impl(
     }
 }
 
+// dispatchers needed for compile-time nxpsg
+// epb - elements per quantization block
 template<short r1ptg, typename q_t, short epb, void (*deq_t4)(device const q_t *, short, thread float4 &)>
 kernel void kernel_mul_mv_ext_q4_f32_disp(
         constant ggml_metal_kargs_mul_mv_ext & args,
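
Since nxpsg is a template parameter, the _disp kernels presumably map a runtime value onto a fixed set of instantiations. A minimal C++ sketch of that runtime-to-compile-time pattern; the function names and the candidate set {4, 8, 16, 32} are assumptions, not the actual dispatcher:

#include <cstdio>

template<short nxpsg>
void mul_mv_impl() { // hypothetical stand-in for the templated kernel body
    printf("instantiation with nxpsg = %d\n", (int) nxpsg);
}

void mul_mv_disp(short nxpsg) {
    // switching on the runtime value selects a compile-time instantiation,
    // letting the compiler specialize the per-lane loops on nxpsg
    switch (nxpsg) {
        case 4:  mul_mv_impl<4>();  break;
        case 8:  mul_mv_impl<8>();  break;
        case 16: mul_mv_impl<16>(); break;
        default: mul_mv_impl<32>(); break;
    }
}

int main() {
    mul_mv_disp(8);
    return 0;
}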
