    Gemm16x6x1Fixture/BM_matmul_16_6_1_unrolled/min_warmup_time:1.000_cv               0.90 %          0.88 %           10        0.87%

We see that the simple first implementation of our matmul kernel gets about **32.7 GFLOPS**.
The optimized unrolled version gains about 0.8 GFLOPS, resulting in **33.5 GFLOPS**.


Loops
-----

1. Loop over K: Implement a kernel that computes C+=AB for M=16, N=6 and K=64.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- File ``submissions/submission_25_05_01/neon_3_1.s``

.. code-block:: asm
    :linenos:

    ...

    matmul_16_6_64:

        ...

        mov x6, x1                  // Store the initial value of x1, to be restored in the next loop iteration
        mov x7, x2                  // Store the initial value of x2, to be restored in the next loop iteration

        mov x9, #64                 // x9: iterator for the K loop
    matmul_loop_over_K:
        sub x9, x9, #1

        [logic of matmul_16_6_1 - neon_2_1_unrolled.s]

        // Offset x6 to the next element in the column of B
        add x6, x6, #4              // #4 = sizeof(float)

        // Restore x1 and x2 to be incremented again
        mov x1, x6
        mov x2, x7

        // Loop back
        cbnz x9, matmul_loop_over_K

        ret

    ...

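To make explicit what the kernel above computes, the following scalar C++ sketch performs the same operation. This is an illustration only, not the assembly itself: the function name ``matmul_16_6_64_reference`` is made up for this sketch, and the column-major layout with leading dimensions ``lda``, ``ldb`` and ``ldc`` is an assumption based on the pointer and stride registers mentioned in the comments above.

.. code-block:: cpp

    // Scalar reference (sketch) of the computation done by matmul_16_6_64.
    // Column-major storage is assumed: element (i, j) of A lives at a[i + j * lda].
    void matmul_16_6_64_reference( float const * a,
                                   float const * b,
                                   float       * c,
                                   long          lda,
                                   long          ldb,
                                   long          ldc ) {
      for ( long k = 0; k < 64; ++k ) {        // K loop, mirrored by matmul_loop_over_K
        for ( long n = 0; n < 6; ++n ) {       // 6 columns of C
          for ( long m = 0; m < 16; ++m ) {    // 16 rows of C
            c[ m + n * ldc ] += a[ m + k * lda ] * b[ k + n * ldb ];
          }
        }
      }
    }
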
2. Loop over M: Implement a kernel that computes C+=AB for M=64, N=6 and K=64.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- File ``submissions/submission_25_05_01/neon_3_2.s``

.. code-block:: asm
    :linenos:

    ...

    matmul_64_6_64:

        ...

        mov x6, x1                  // Store the initial value of x1, to be restored in each K loop iteration
        mov x7, x2                  // Store the initial value of x2, to be restored in each K loop iteration

        mov x8, x0                  // Store the initial value of x0, to be restored in each M loop iteration
        mov x9, x1                  // Store the initial value of x1, to be restored in each M loop iteration

        mov x16, #4                 // x16: iterator for the M loop (64 / 16 = 4 blocks)
    matmul_loop_over_M:
        sub x16, x16, #1

        mov x15, #64                // x15: iterator for the K loop
    matmul_loop_over_K:
        sub x15, x15, #1

        [logic of matmul_16_6_1 - neon_2_1_unrolled.s]

        // Offset x6 to the next element in the column of B
        add x6, x6, #4              // #4 = sizeof(float)

        // Restore x1 and x2 to be incremented again
        mov x1, x6
        mov x2, x7

        // Loop back to K
        cbnz x15, matmul_loop_over_K

        // Next M iteration: matrices C and A both need an offset of 16 values,
        // and matrix B needs to start at its initial location again.
        // Updates for matrix C
        add x7, x7, #16*4           // block height (16) * sizeof(float)
        mov x2, x7                  // also apply the offset to x2

        // Updates for matrix A
        add x8, x8, #16*4           // block height (16) * sizeof(float)
        mov x0, x8                  // also apply the offset to x0

        // Updates for matrix B
        mov x6, x9                  // Update the restore register of x1 for the K loop
        mov x1, x9                  // Update the x1 register itself

        // Loop back to M
        cbnz x16, matmul_loop_over_M

        ret

    ...

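Conceptually, the M loop only tiles the 64 rows into four blocks of 16 and applies the 16x6x64 building block with shifted A and C pointers, exactly as the ``add ..., #16*4`` updates above suggest. The following scalar C++ sketch of this blocking reuses the hypothetical reference function from the previous task:

.. code-block:: cpp

    // Declaration of the scalar 16x6x64 reference sketched in the previous task (illustrative only).
    void matmul_16_6_64_reference( float const * a, float const * b, float * c,
                                   long lda, long ldb, long ldc );

    // Sketch of the M blocking used by matmul_64_6_64: four blocks of 16 rows.
    void matmul_64_6_64_reference( float const * a,
                                   float const * b,
                                   float       * c,
                                   long          lda,
                                   long          ldb,
                                   long          ldc ) {
      for ( long mb = 0; mb < 4; ++mb ) {         // mirrored by matmul_loop_over_M
        matmul_16_6_64_reference( a + mb * 16,    // A: 16 rows further down
                                  b,              // B: restarted at its initial location
                                  c + mb * 16,    // C: 16 rows further down
                                  lda, ldb, ldc );
      }
    }
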
3. Loop over N: Implement a kernel that computes C+=AB for M=64, N=48 and K=64.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- File ``submissions/submission_25_05_01/neon_3_3.s``

.. code-block:: asm
    :linenos:

    ...

    matmul_64_48_64:

        ...

        mov x6, x1                  // Store the initial value of x1, to be restored in each K loop iteration
        mov x7, x2                  // Store the initial value of x2, to be restored in each K loop iteration

        mov x8, x0                  // Store the initial value of x0, to be restored in each M loop iteration
        mov x9, x1                  // Store the initial value of x1, to be restored in each M loop iteration

        mov x10, x0                 // Store the initial value of x0, to be restored in each N loop iteration
        mov x11, x2                 // Store the initial value of x2, to be restored in each N loop iteration
        mov x12, #6                 // Number of columns of N processed per loop iteration, needed for the offset calculation

        mov x17, #8                 // x17: iterator for the N loop (48 / 6 = 8 blocks)
    matmul_loop_over_N:
        sub x17, x17, #1

        mov x16, #4                 // x16: iterator for the M loop (64 / 16 = 4 blocks)
    matmul_loop_over_M:
        sub x16, x16, #1

        mov x15, #64                // x15: iterator for the K loop
    matmul_loop_over_K:
        sub x15, x15, #1

        [logic of matmul_16_6_1 - neon_2_1_unrolled.s]

        // Offset x6 to the next element in the column of B
        add x6, x6, #4              // #4 = sizeof(float)

        // Restore x1 and x2 to be incremented again
        mov x1, x6
        mov x2, x7

        // Loop back to K
        cbnz x15, matmul_loop_over_K

        // Next M iteration: matrices C and A both need an offset of 16 values,
        // and matrix B needs to start at its initial location again.
        // Updates for matrix A
        add x8, x8, #16*4           // block height (16) * sizeof(float)
        mov x0, x8                  // also apply the offset to x0

        // Updates for matrix C
        add x7, x7, #16*4           // block height (16) * sizeof(float)
        mov x2, x7                  // also apply the offset to x2

        // Updates for matrix B
        mov x6, x9                  // Update the restore register of x1 for the K loop
        mov x1, x9                  // Update the x1 register itself

        // Loop back to M
        cbnz x16, matmul_loop_over_M

        // Next N iteration: matrices B and C need an offset of 6*ldb and 6*ldc values,
        // and matrix A needs to start at its initial location again.
        // Update for matrix A
        mov x8, x10                 // Update the restore register of x0 for the M loop
        mov x0, x10                 // Update the x0 register itself

        // Updates for matrix B
        madd x9, x4, x12, x9        // ldb * 6 + initial position
        mov x6, x9                  // Update the restore register of x1 for the K loop
        mov x1, x9                  // Update the x1 register itself

        // Updates for matrix C
        madd x11, x5, x12, x11      // ldc * 6 + initial position
        mov x7, x11                 // Update the restore register of x2 for the K loop
        mov x2, x11                 // Update the x2 register itself

        // Loop back to N
        cbnz x17, matmul_loop_over_N

        ret

    ...

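The N loop adds one more level of blocking: eight blocks of six columns, where the B and C pointers advance by ``6 * ldb`` and ``6 * ldc`` elements per block (the ``madd`` instructions above) while A restarts at its initial location. Again as a scalar C++ sketch, built on the hypothetical helper from the previous task:

.. code-block:: cpp

    // Declaration of the scalar 64x6x64 reference sketched in the previous task (illustrative only).
    void matmul_64_6_64_reference( float const * a, float const * b, float * c,
                                   long lda, long ldb, long ldc );

    // Sketch of the N blocking used by matmul_64_48_64: eight blocks of six columns.
    void matmul_64_48_64_reference( float const * a,
                                    float const * b,
                                    float       * c,
                                    long          lda,
                                    long          ldb,
                                    long          ldc ) {
      for ( long nb = 0; nb < 8; ++nb ) {              // mirrored by matmul_loop_over_N
        matmul_64_6_64_reference( a,                   // A: restarted at its initial location
                                  b + nb * 6 * ldb,    // B: six columns further right (madd: ldb * 6)
                                  c + nb * 6 * ldc,    // C: six columns further right (madd: ldc * 6)
                                  lda, ldb, ldc );
      }
    }
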
4. Test and optimize the kernels. Report your performance in GFLOPS.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- File ``submissions/submission_25_05_01/neon_3.h``
- Tests ``submissions/submission_25_05_01/neon_3.test.cpp``
- Benchmarks ``submissions/submission_25_05_01/neon_3.bench.cpp``

Optimization
############

All three loop kernels reuse the already optimized ``matmul_16_6_1`` micro-kernel from task 2 (``neon_2_1_unrolled.s``) as their inner building block.

Benchmarks
##########

We run the benchmark with the following command:

.. code-block::

    ./benchmarks --benchmark_counters_tabular=true --benchmark_repetitions=10 --benchmark_report_aggregates_only=true

.. code-block::
    :emphasize-lines: 4, 8, 12

    ----------------------------------------------------------------------------------------------------------------------------------
    Benchmark                                                                             Time             CPU   Iterations        FLOPS
    ----------------------------------------------------------------------------------------------------------------------------------
    GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_mean            396 ns          396 ns           10   31.0266G/s
    GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_median          396 ns          396 ns           10   31.0281G/s
    GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_stddev         0.069 ns        0.057 ns           10   4.50274M/s
    GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_cv              0.02 %          0.01 %           10        0.01%
    GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_mean           1728 ns         1728 ns           10   28.4438G/s
    GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_median         1728 ns         1728 ns           10   28.4445G/s
    GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_stddev         0.115 ns        0.106 ns           10    1.7484M/s
    GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_cv              0.01 %          0.01 %           10        0.01%
    GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_mean        13078 ns        13077 ns           10   22.5524G/s
    GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_median      13078 ns        13077 ns           10    22.552G/s
    GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_stddev       1.83 ns         1.60 ns           10   2.76464M/s
    GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_cv            0.01 %          0.01 %           10        0.01%

- Mean FLOPS for the loop over K: **31.0 GFLOPS**.
- Mean FLOPS for the loop over M: **28.4 GFLOPS**.
- Mean FLOPS for the loop over N: **22.6 GFLOPS**.
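
These numbers follow from FLOPS = 2 * M * N * K divided by the kernel runtime: for example, the 16x6x64 kernel performs 2 * 16 * 6 * 64 = ``12 288`` floating-point operations in about 396 ns, which corresponds to the reported 31.0 GFLOPS.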