Update limits based on benchmarking the SME code on Apple M4

martin-frbg · web-flow · commit 47a66aef0f97 · 2025-10-08T14:36:17.000+02:00
diff --git a/kernel/arm64/sgemm_direct_performant.c b/kernel/arm64/sgemm_direct_performant.c
@@ -1,30 +1,14 @@
 #include "common.h"
-/* helper for the direct sgemm code written by Arjan van der Ven */
-
-
-
+/* helper for the direct sgemm code adapted from Arjan van der Ven's x86_64 version: returns 1 if the direct kernel is expected to be performant for an M x N x K problem, 0 otherwise */
 
 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K)
 {
-if (M<3 || M%2==1) return 0;
+	if (M < 3) return 0;
 	unsigned long long mnk = M * N * K;
-	/* large matrixes -> not performant */
-	if (mnk >= 28 * 512 * 512)
-		return 0;
-
-	/*
-	 * if the B matrix is not a nice multiple if 4 we get many unaligned accesses,
-	 * and the regular sgemm copy/realignment of data pays off much quicker
-	 */
-	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
-		return 0;
-
-#ifdef SMP
-	/* if we can run multithreaded, the threading changes the based threshold */
-	if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1)
+	/* benchmark performance on M4 peaks around 512 and crosses the graph of the NEON SGEMM at about 3100; the ULL suffixes keep 3100^3 (about 2.98e10) from overflowing int */
+	if (mnk >= 3100ULL * 3100ULL * 3100ULL)
 		return 0;
-#endif
-
+
 	return 1;
 }