Skip to content

Commit 515c7ca

Browse files
committed
doc: neon 3
1 parent e225bfe commit 515c7ca

File tree

2 files changed

+233
-3
lines changed

2 files changed

+233
-3
lines changed

docs_sphinx/submissions/report_25_05_01.rst

Lines changed: 231 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,4 +213,234 @@ Therefore we do 10 repetitions of the benchmark which do about ``120 000 000`` i
213213
Gemm16x6x1Fixture/BM_matmul_16_6_1_unrolled/min_warmup_time:1.000_cv 0.90 % 0.88 % 10 0.87%
214214
215215
We see that the simple first implementation of our matmul kernel gets about **32.7 GFLOPS**.
216-
The optimized unrolled version gets about 0.8 GFLOPS more resulting in **33.5 GFLOPS**.
216+
The optimized unrolled version gets about 0.8 GFLOPS more resulting in **33.5 GFLOPS**.
217+
218+
219+
Loops
220+
-----
221+
222+
1. Loop over K: Implement a kernel that computes C+=AB for M=16, N=6 and K=64.
223+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
224+
- File ``submissions/submission_25_05_01/neon_3_1.s``
225+
226+
.. code-block:: asm
227+
:linenos:
228+
...
229+
230+
matmul_16_6_64:
231+
232+
...
233+
234+
mov x6, x1 // Store the initial value of x1, to be restored in the next loop iteration
235+
mov x7, x2 // Store the initial value of x2, to be restored in the next loop iteration
236+
237+
mov x9, #64 // x9 iterator for K loop
238+
matmul_loop_over_K:
239+
sub x9, x9, #1
240+
241+
[logic of matmul_16_6_1 - neon_2_1_unrolled.s]
242+
243+
// offset x6 to the next element in the column
244+
add x6, x6, #4 // #4 = sizeof(float)
245+
246+
// Restore x1 and x2 to be incremented again
247+
mov x1, x6
248+
mov x2, x7
249+
250+
// Loop back
251+
cbnz x9, matmul_loop_over_K
252+
253+
ret
254+
255+
...
256+
257+
258+
2. Loop over M: Implement a kernel that computes C+=AB for M=64, N=6 and K=64.
259+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
260+
- File ``submissions/submission_25_05_01/neon_3_2.s``
261+
262+
.. code-block:: asm
263+
:linenos:
264+
...
265+
266+
matmul_64_6_64:
267+
268+
...
269+
270+
mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
271+
mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
272+
273+
mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
274+
mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
275+
276+
mov x16, #4 // x16 iterator for M loop
277+
matmul_loop_over_M:
278+
sub x16, x16, #1
279+
280+
mov x15, #64 // x15 iterator for K loop
281+
matmul_loop_over_K:
282+
sub x15, x15, #1
283+
284+
[logic of matmul_16_6_1 - neon_2_1_unrolled.s]
285+
286+
// offset x6 to the next element in the column
287+
add x6, x6, #4 // #4 = sizeof(float)
288+
289+
// Restore x1 and x2 to be incremented again
290+
mov x1, x6
291+
mov x2, x7
292+
293+
// Loop back to K
294+
cbnz x15, matmul_loop_over_K
295+
296+
// next M iteration on the matrix c and matrix a, both need an offset of 16 values
297+
// also matrix b needs to start at the initial location again
298+
// Updates for the matrix c
299+
add x7, x7, #16*4 // column height * sizeof(float)
300+
mov x2, x7 // also apply offset to x2
301+
302+
// Updates for the matrix a
303+
add x8, x8, #16*4 // column height * sizeof(float)
304+
mov x0, x8 // also apply offset to x0
305+
306+
// Updates for the matrix b
307+
mov x6, x9 // Update the restore register for x1 for the K loop
308+
mov x1, x9 // Update the x1 register itself
309+
310+
// Loop back to M
311+
cbnz x16, matmul_loop_over_M
312+
313+
ret
314+
315+
...
316+
317+
3. Loop over N: Implement a kernel that computes C+=AB for M=64, N=48 and K=64.
318+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
319+
- File ``submissions/submission_25_05_01/neon_3_3.s``
320+
321+
.. code-block:: asm
322+
:linenos:
323+
...
324+
325+
matmul_64_48_64:
326+
327+
...
328+
329+
mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
330+
mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
331+
332+
mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
333+
mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
334+
335+
mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
336+
mov x11, x2 // Store the initial value of x2, to be restored in the N loop iteration
337+
mov x12, #6 // hold the size of N that are processed in one loop, needed for offset calculation
338+
339+
mov x17, #8 // x17 iterator for N loop
340+
matmul_loop_over_N:
341+
sub x17, x17, #1
342+
343+
mov x16, #4 // x16 iterator for M loop
344+
matmul_loop_over_M:
345+
sub x16, x16, #1
346+
347+
mov x15, #64 // x15 iterator for K loop
348+
matmul_loop_over_K:
349+
sub x15, x15, #1
350+
351+
[logic of matmul_16_6_1 - neon_2_1_unrolled.s]
352+
353+
// offset x6 to the next element in the column
354+
add x6, x6, #4 // #4 = sizeof(float)
355+
356+
// Restore x1 and x2 to be incremented again
357+
mov x1, x6
358+
mov x2, x7
359+
360+
// Loop back to K
361+
cbnz x15, matmul_loop_over_K
362+
363+
// next M iteration on the matrix c and matrix a, both need an offset of 16 values
364+
// also matrix b needs to start at the initial location again
365+
// Updates for the matrix a
366+
add x8, x8, #16*4 // column height * sizeof(float)
367+
mov x0, x8 // also apply offset to x0
368+
369+
// Updates for the matrix c
370+
add x7, x7, #16*4 // column height * sizeof(float)
371+
mov x2, x7 // also apply offset to x2
372+
373+
// Updates for the matrix b
374+
mov x6, x9 // Update the restore register for x1 for the K loop
375+
mov x1, x9 // Update the x1 register itself
376+
377+
// Loop back to M
378+
cbnz x16, matmul_loop_over_M
379+
380+
// next N iteration on the matrix b and matrix c, both need an offset of 6*ldb/ldc values
381+
// also matrix a needs to start at the initial location again
382+
// Update for the matrix a
383+
mov x8, x10 // Update the restore register for x0 for the M loop
384+
mov x0, x10 // Update the x0 register itself
385+
386+
// Updates for the matrix b
387+
madd x9, x4, x12, x9 // ldb * 6 + initial position
388+
mov x6, x9 // Update the restore register of x1 for the K loop
389+
mov x1, x9 // Update the x1 register itself
390+
391+
// Updates for the matrix c
392+
madd x11, x5, x12, x11 // ldc * 6 + initial position
393+
mov x7, x11 // Update the restore register of x2 for the K loop
394+
mov x2, x11 // Update the x2 register itself
395+
396+
// Loop back to N
397+
cbnz x17, matmul_loop_over_N
398+
399+
ret
400+
401+
...
402+
403+
4. Test and optimize the kernels. Report your performance in GFLOPS.
404+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
405+
- File ``submissions/submission_25_05_01/neon_3.h``
406+
- Tests ``submissions/submission_25_05_01/neon_3.test.cpp``
407+
- Benchmarks ``submissions/submission_25_05_01/neon_3.bench.cpp``
408+
409+
Optimization
410+
############
411+
412+
Usage of the already optimized `matmul_16_6_1` from task 2.
413+
414+
Benchmarks
415+
##########
416+
417+
We run the benchmark with the following command:
418+
419+
.. code-block::
420+
421+
./benchmarks --benchmark_counters_tabular=true --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
422+
423+
424+
.. code-block::
425+
:emphasize-lines: 4, 8, 12
426+
427+
----------------------------------------------------------------------------------------------------------------------------------
428+
Benchmark Time CPU Iterations FLOPS
429+
----------------------------------------------------------------------------------------------------------------------------------
430+
GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_mean 396 ns 396 ns 10 31.0266G/s
431+
GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_median 396 ns 396 ns 10 31.0281G/s
432+
GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_stddev 0.069 ns 0.057 ns 10 4.50274M/s
433+
GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_cv 0.02 % 0.01 % 10 0.01%
434+
GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_mean 1728 ns 1728 ns 10 28.4438G/s
435+
GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_median 1728 ns 1728 ns 10 28.4445G/s
436+
GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_stddev 0.115 ns 0.106 ns 10 1.7484M/s
437+
GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_cv 0.01 % 0.01 % 10 0.01%
438+
GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_mean 13078 ns 13077 ns 10 22.5524G/s
439+
GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_median 13078 ns 13077 ns 10 22.552G/s
440+
GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_stddev 1.83 ns 1.60 ns 10 2.76464M/s
441+
GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_cv 0.01 % 0.01 % 10 0.01%
442+
443+
444+
- Mean FLOPS for loop over K: **31.0 GFLOPS**.
445+
- Mean FLOPS for loop over M: **28.4 GFLOPS**.
446+
- Mean FLOPS for loop over N: **22.6 GFLOPS**.

submissions/submission_25_05_01/neon_3_2.s

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
// using the neon_2_1_unrolled as base kernel, as it is the fastest based on benchmarks
22

33
/**
4-
* @param x0 = a pointer to column-major 16x64 matrix A.
4+
* @param x0 = a pointer to column-major 64x64 matrix A.
55
* @param x1 = b pointer to column-major 64x6 matrix B.
6-
* @param x2 = c pointer to column-major 16x6 matrix C.
6+
* @param x2 = c pointer to column-major 64x6 matrix C.
77
* @param x3 = lda leading dimension of A.
88
* @param x4 = ldb leading dimension of B.
99
* @param x5 = ldc leading dimension of C.

0 commit comments

Comments
 (0)