@@ -221,171 +221,214 @@ Loops
221221 :linenos:
222222
223223 ...
224- matmul_16_6_64:
225- ...
226- mov x6, x1 // Store the initial value of x1, to be restored in the next loop iteration
227- mov x7, x2 // Store the initial value of x2, to be restored in the next loop iteration
 224+ // Scale the leading dimensions by the size of a float
225+ lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
226+ lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
227+ lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)
228228
229- mov x9, #64 // x9 iterator for K loop
230- matmul_loop_over_K:
231- sub x9, x9, #1
229+ mov x6, x1 // Store the initial value of x1, to be restored in the next loop iteration
230+ mov x7, x2 // Store the initial value of x2, to be restored after the loop
232231
233- ... logic of matmul_16_6_1 - neon_2_unrolled.s ...
232+ // Load first column from the 16x6 matrix c
233+ ld1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x2], x5
234+ // Load second column from the 16x6 matrix c
235+ ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x2], x5
236+ // Load third column from the 16x6 matrix c
237+ ld1 {v21.4s, v22.4s, v23.4s, v24.4s}, [x2], x5
238+ // Load fourth column from the 16x6 matrix c
239+ ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [x2], x5
240+ // Load fifth column from the 16x6 matrix c
241+ ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x2], x5
242+ // Load sixth column from the 16x6 matrix c
243+ ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [x2], x5
244+
245+ mov x9, #64 // x9 iterator for K loop
246+ matmul_loop_over_K:
247+ sub x9, x9, #1
248+
249+ // Load first column data from the 16x1 matrix a
250+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x3
251+
252+ // run the known matmul_16_6_1_unrolled kernel
253+ // Load first element from the 1x6 matrix b
254+ ldr s4, [x1]
255+ add x1, x1, x4
234256
235- // offset x6 to the next element in the column
236- add x6, x6, #4 // #4 = sizeof(float)
257+ // Calculate first column of c
258+ fmla v25.4s, v0.4s, v4.s[0]
259+ fmla v26.4s, v1.4s, v4.s[0]
260+ fmla v27.4s, v2.4s, v4.s[0]
261+ fmla v28.4s, v3.4s, v4.s[0]
237262
238- // Restore x1 and x2 to be incremented again
239- mov x1, x6
240- mov x2, x7
241263
242- // Loop back
243- cbnz x9, matmul_loop_over_K
264+ // Load second element from the 1x6 matrix b
265+ ldr s4, [x1]
266+ add x1, x1, x4
244267
245- ret
246- ...
268+ // Calculate second column of c
269+ fmla v17.4s, v0.4s, v4.s[0]
270+ fmla v18.4s, v1.4s, v4.s[0]
271+ fmla v19.4s, v2.4s, v4.s[0]
272+ fmla v20.4s, v3.4s, v4.s[0]
247273
274+
275+ // Load third element from the 1x6 matrix b
276+ ldr s4, [x1]
277+ add x1, x1, x4
248278
249- 2. Loop over M: Implement a kernel that computes C+=AB for M=64, N=6 and K=64.
250- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
251- - File ``submissions/submission_25_05_01/neon_3_2.s ``
 279+ // Calculate third column of c
280+ fmla v21.4s, v0.4s, v4.s[0]
281+ fmla v22.4s, v1.4s, v4.s[0]
282+ fmla v23.4s, v2.4s, v4.s[0]
283+ fmla v24.4s, v3.4s, v4.s[0]
252284
253- .. code-block :: asm
254- :linenos:
255285
256- ...
257- matmul_64_6_64:
258- ...
286+ // Load fourth element from the 1x6 matrix b
287+ ldr s4, [x1]
288+ add x1, x1, x4
259289
260- mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
261- mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
290+ // Calculate fourth column of c
291+ fmla v5.4s, v0.4s, v4.s[0]
292+ fmla v6.4s, v1.4s, v4.s[0]
293+ fmla v7.4s, v2.4s, v4.s[0]
294+ fmla v8.4s, v3.4s, v4.s[0]
262295
263- mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
264- mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
265296
266- mov x16, #4 // x16 iterator for M loop
267- matmul_loop_over_M:
268- sub x16, x16, #1
297+ // Load fifth element from the 1x6 matrix b
298+ ldr s4, [x1]
299+ add x1, x1, x4
269300
270- mov x15, #64 // x15 iterator for K loop
271- matmul_loop_over_K:
272- sub x15, x15, #1
301+ // Calculate fifth column of c
302+ fmla v9.4s, v0.4s, v4.s[0]
303+ fmla v10.4s, v1.4s, v4.s[0]
304+ fmla v11.4s, v2.4s, v4.s[0]
305+ fmla v12.4s, v3.4s, v4.s[0]
273306
274- ... logic of matmul_16_6_1 - neon_2_unrolled.s ...
307+
308+ // Load sixth element from the 1x6 matrix b
309+ ldr s4, [x1]
310+ add x1, x1, x4
275311
276- // offset x6 to the next element in the column
277- add x6, x6, #4 // #4 = sizeof(float)
 312+ // Calculate sixth column of c
313+ fmla v13.4s, v0.4s, v4.s[0]
314+ fmla v14.4s, v1.4s, v4.s[0]
315+ fmla v15.4s, v2.4s, v4.s[0]
316+ fmla v16.4s, v3.4s, v4.s[0]
278317
279- // Restore x1 and x2 to be incremented again
280- mov x1, x6
281- mov x2, x7
282318
283- // Loop back to K
284- cbnz x15, matmul_loop_over_K
319+ // offset x6 to the next element in the column
320+ add x6, x6, #4 // #4 = sizeof(float)
285321
286- // next M iteration on the matrix c and matrix a, both need offset about 16 values
287- // also matrix b needs to start at the initial location again
288- // Updates for the matrix c
289- add x7, x7, #16*4 // column height * sizeof(float)
290- mov x2, x7 // also apply offset to x2
322+ // Restore x1 to be incremented again
323+ mov x1, x6
291324
292- // Updates for the matrix a
293- add x8, x8, #16*4 // column height * sizeof(float)
294- mov x0, x8 // also apply offset to x0
325+ // Loop back
326+ cbnz x9, matmul_loop_over_K
295327
296- // Updates for the matrix b
297- mov x6, x9 // Update the restore register for x1 for the K loop
298- mov x1, x9 // Update the x1 register itself
328+ // Restore initial value of x2 that was changed by the loads
329+ mov x2, x7
299330
300- // Loop back to M
301- cbnz x16, matmul_loop_over_M
331+ // Store first column back to memory
332+ st1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x2], x5
333+ // Store second column back to memory
334+ st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x2], x5
335+ // Store third column back to memory
336+ st1 {v21.4s, v22.4s, v23.4s, v24.4s}, [x2], x5
337+ // Store fourth column back to memory
338+ st1 {v5.4s, v6.4s, v7.4s, v8.4s}, [x2], x5
339+ // Store fifth column back to memory
340+ st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x2], x5
341+ // Store sixth column back to memory
342+ st1 {v13.4s, v14.4s, v15.4s, v16.4s}, [x2], x5
302343
303- ret
304- ...
305344
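For reference, the following plain-C sketch (not part of the submission) spells out the computation this loop-over-K kernel performs, assuming the column-major convention suggested by the register usage above: x0 = A, x1 = B, x2 = C, and x3/x4/x5 the leading dimensions in elements. The name ``matmul_16_6_64_ref`` is illustrative only.

.. code-block:: c

   #include <stdint.h>

   // Illustrative C reference for the 16x6x64 kernel (name is hypothetical):
   // C += A * B with A (16x64), B (64x6) and C (16x6), all column-major,
   // leading dimensions given in elements (the asm scales them to bytes).
   void matmul_16_6_64_ref(const float *a, const float *b, float *c,
                           int64_t lda, int64_t ldb, int64_t ldc) {
       for (int n = 0; n < 6; n++) {
           for (int k = 0; k < 64; k++) {
               for (int m = 0; m < 16; m++) {
                   c[m + n * ldc] += a[m + k * lda] * b[k + n * ldb];
               }
           }
       }
   }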
306- 3 . Loop over N : Implement a kernel that computes C+=AB for M=64, N=48 and K=64.
307- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
308- - File ``submissions/submission_25_05_01/neon_3_3 .s ``
 345+ 2. Loop over M: Implement a kernel that computes C+=AB for M=64, N=6 and K=64.
346+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 347+ - File ``submissions/submission_25_05_01/neon_3_2.s``
309348
310349.. code-block :: asm
311350 :linenos:
312351
313- ...
314- matmul_64_48_64:
315- ...
 352+ // Scale the leading dimensions by the size of a float
353+ lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
354+ lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
355+ lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)
356+
357+ mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
358+ mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
359+
360+ mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
361+ mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
362+
363+ mov x16, #4 // x16 iterator for M loop
364+ matmul_loop_over_M:
365+ sub x16, x16, #1
366+
367+ ... <logic of loop over K - neon_3_1>
316368
317- mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
318- mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
 369+ // Next M iteration: matrix c and matrix a both need an offset of 16 values (one 16-row block)
 370+ // and matrix b needs to start at the initial location again
371+ // Updates for the matrix c
372+ add x7, x7, #16*4 // column height * sizeof(float)
373+ mov x2, x7 // also apply offset to x2
319374
320- mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
321- mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
375+ // Updates for the matrix a
376+ add x8, x8, #16*4 // column height * sizeof(float)
377+ mov x0, x8 // also apply offset to x0
322378
323- mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
324- mov x11, x2 // Store the initial value of x2, to bes restored in the N loop iteration
325- mov x12, #6 // hold the size of N that are processed in one loop, needed for offset calculation
379+ // Updates for the matrix b
380+ mov x6, x9 // Update the restore register for x1 for the K loop
381+ mov x1, x9 // Update the x1 register itself
326382
327- mov x17, #8 // x17 iterator for N loop
328- matmul_loop_over_N:
329- sub x17, x17, #1
383+ // Loop back to M
384+ cbnz x16, matmul_loop_over_M
330385
331- mov x16, #4 // x16 iterator for M loop
332- matmul_loop_over_M:
333- sub x16, x16, #1
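Conceptually, the M loop applies the 16x6x64 kernel to four 16-row blocks of A and C while B is reset for every block. A minimal C sketch of that blocking, reusing the illustrative ``matmul_16_6_64_ref`` from above:

.. code-block:: c

   // Sketch of the M blocking in the 64x6x64 kernel (names are illustrative):
   // A and C advance by one 16-row block per iteration (16 * sizeof(float)
   // bytes in the asm), while B restarts from its initial position each time.
   void matmul_64_6_64_ref(const float *a, const float *b, float *c,
                           int64_t lda, int64_t ldb, int64_t ldc) {
       for (int block = 0; block < 4; block++) {
           matmul_16_6_64_ref(a + block * 16, b, c + block * 16, lda, ldb, ldc);
       }
   }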
386+ 3. Loop over N: Implement a kernel that computes C+=AB for M=64, N=48 and K=64.
387+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 388+ - File ``submissions/submission_25_05_01/neon_3_3.s``
334389
335- mov x15, #64 // x15 iterator for K loop
336- matmul_loop_over_K:
337- sub x15, x15, #1
390+ .. code-block :: asm
391+ :linenos:
392+
 393+ // Scale the leading dimensions by the size of a float
394+ lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
395+ lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
396+ lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)
338397
339- ... logic of matmul_16_6_1 - neon_2_unrolled.s ...
398+ mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
399+ mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
340400
341- // offset x6 to the next element in the column
342- add x6, x6, #4 // #4 = sizeof(float)
401+ mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
402+ mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
343403
344- // Restore x1 and x2 to be incremented again
345- mov x1, x6
346- mov x2, x7
404+ mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
 405+ mov x11, x2 // Store the initial value of x2, to be restored in the N loop iteration
 406+ mov x12, #6 // Number of columns of N processed per iteration, needed for the offset calculations
347407
348- // Loop back to K
349- cbnz x15, matmul_loop_over_K
408+ mov x17, #8 // x17 iterator for N loop
409+ matmul_loop_over_N:
410+ sub x17, x17, #1
350411
351- // next M iteration on the matrix c and matrix a, both need offset about 16 values
352- // also matrix b needs to start at the initial location again
353- // Updates for the matrix a
354- add x8, x8, #16*4 // column height * sizeof(float)
355- mov x0, x8 // also apply offset to x0
412+ ... <logic of loop over M - neon_3_2>
356413
357- // Updates for the matrix c
358- add x7, x7, #16*4 // column height * sizeof(float)
359- mov x2, x7 // also apply offset to x2
 414+ // Next N iteration: matrix b and matrix c both need an offset of 6*ldb / 6*ldc values
 415+ // and matrix a needs to start at the initial location again
416+ // Update for the matrix a
417+ mov x8, x10 // Update the restore register for x0 for the M loop
418+ mov x0, x10 // Update the x0 register itself
360419
361- // Updates for the matrix b
362- mov x6, x9 // Update the restore register for x1 for the K loop
363- mov x1, x9 // Update the x1 register itself
420+ // Updates for the matrix b
421+ madd x9, x4, x12, x9 // ldb * 6 + initial position
422+ mov x6, x9 // Update the restore register of x1 for the K loop
423+ mov x1, x9 // Update the x1 register itself
364424
365- // Loop back to M
366- cbnz x16, matmul_loop_over_M
367-
368- // next M iteration on the matrix b and matrix c, both need offset about 6*ldb/ldc values
369- // also matrix a needs to start at the initial location again
370- // Update for the matrix a
371- mov x8, x10 // Update the restore register for x0 for the M loop
372- mov x0, x10 // Update the x0 register itself
373-
374- // Updates for the matrix b
375- madd x9, x4, x12, x9 // ldb * 6 + initial position
376- mov x6, x9 // Update the restore register of x1 for the K loop
377- mov x1, x9 // Update the x1 register itself
378-
379- // Updates for the matrix c
380- madd x11, x5, x12, x11 // ldc * 6 + initial position
381- mov x7, x11 // Update the restore register of x2 for the K loop
382- mov x2, x11 // Update the x2 register itself
383-
384- // Loop back to N
385- cbnz x17, matmul_loop_over_N
386-
387- ret
388- ...
425+ // Updates for the matrix c
426+ madd x11, x5, x12, x11 // ldc * 6 + initial position
427+ mov x7, x11 // Update the restore register of x2 for the K loop
428+ mov x2, x11 // Update the x2 register itself
429+
430+ // Loop back to N
431+ cbnz x17, matmul_loop_over_N
389432
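The N loop wraps the M loop analogously: eight panels of six columns, where B and C advance by ``6*ldb`` and ``6*ldc`` elements per panel (the two ``madd`` instructions) while A is reset. A minimal C sketch, again with illustrative names:

.. code-block:: c

   // Sketch of the N blocking in the 64x48x64 kernel (names are illustrative):
   // B and C move 6 columns to the right per panel, A is reset for every panel.
   void matmul_64_48_64_ref(const float *a, const float *b, float *c,
                            int64_t lda, int64_t ldb, int64_t ldc) {
       for (int panel = 0; panel < 8; panel++) {
           matmul_64_6_64_ref(a, b + panel * 6 * ldb, c + panel * 6 * ldc,
                              lda, ldb, ldc);
       }
   }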
390433 4. Test and optimize the kernels. Report your performance in GFLOPS.
391434^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -412,20 +455,20 @@ We run the benchmark with the following command:
412455 ----------------------------------------------------------------------------------------------------------------------------------
413456 Benchmark Time CPU Iterations FLOPS
414457 ----------------------------------------------------------------------------------------------------------------------------------
415- GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_mean 396 ns 396 ns 10 31.0266G /s
416- GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_median 396 ns 396 ns 10 31.0281G /s
417- GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_stddev 0.069 ns 0.057 ns 10 4.50274M /s
418- GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_cv 0.02 % 0.01 % 10 0.01 %
419- GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_mean 1728 ns 1728 ns 10 28.4438G /s
420- GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_median 1728 ns 1728 ns 10 28.4445G /s
421- GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_stddev 0.115 ns 0.106 ns 10 1.7484M /s
422- GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_cv 0.01 % 0.01 % 10 0.01 %
423- GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_mean 13078 ns 13077 ns 10 22.5524G /s
424- GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_median 13078 ns 13077 ns 10 22.552G /s
425- GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_stddev 1.83 ns 1.60 ns 10 2.76464M /s
426- GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_cv 0.01 % 0.01 % 10 0.01 %
427-
428-
429- - Mean FLOPS for loop over K: **31.0 GFLOPS **.
430- - Mean FLOPS for loop over M: **28.4 GFLOPS **.
431- - Mean FLOPS for loop over N: **22.6 GFLOPS **.
458+ GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_mean 97.8 ns 97.4 ns 10 126.12G /s
459+ GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_median 97.7 ns 97.3 ns 10 126.245G /s
460+ GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_stddev 0.581 ns 0.563 ns 10 720.109M /s
461+ GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_cv 0.59 % 0.58 % 10 0.57 %
462+ GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_mean 386 ns 385 ns 10 127.812G /s
463+ GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_median 385 ns 384 ns 10 127.95G /s
464+ GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_stddev 2.16 ns 2.11 ns 10 693.069M /s
465+ GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_cv 0.56 % 0.55 % 10 0.54 %
466+ GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_mean 3103 ns 3092 ns 10 95.3736G /s
467+ GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_median 3097 ns 3087 ns 10 95.5363G /s
468+ GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_stddev 16.0 ns 15.6 ns 10 475.851M /s
469+ GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_cv 0.52 % 0.50 % 10 0.50 %
470+
471+
 472+ - Mean FLOPS for loop over K: **126.1 GFLOPS**.
 473+ - Mean FLOPS for loop over M: **127.8 GFLOPS**.
 474+ - Mean FLOPS for loop over N: **95.4 GFLOPS**.
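As a quick cross-check of the numbers above (assuming the fixture counts 2*M*N*K floating-point operations per kernel call), the 16x6x64 kernel performs 2 * 16 * 6 * 64 = 12288 FLOPs, which at the measured mean time of roughly 97.4 ns works out to about 126 GFLOPS:

.. code-block:: c

   #include <stdio.h>

   // Hedged sanity check: FLOPs per call divided by the mean CPU time from the
   // table above (in nanoseconds) yields GFLOPS directly (FLOPs/ns == GFLOPS).
   int main(void) {
       double flops   = 2.0 * 16 * 6 * 64; // 12288 FLOPs per 16x6x64 kernel call
       double time_ns = 97.4;              // mean CPU time of the K-loop kernel
       printf("%.1f GFLOPS\n", flops / time_ns); // prints ~126.2
       return 0;
   }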