
Commit 86a1527

perf: fixed the neon_3 loads stores
1 parent c6eca17 commit 86a1527
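
In the old neon_3_1 kernel, the full matmul_16_6_1 logic ran inside every iteration of the K loop, presumably including its loads and stores of the 16x6 block of C. The new version in the diff below loads that block into v5-v28 once before matmul_loop_over_K, accumulates all 64 rank-1 updates in registers, and writes the block back once after the loop. As a minimal scalar C sketch of that idea (not part of the commit: the function name, the argument order x0 = A, x1 = B, x2 = C, x3/x4/x5 = lda/ldb/ldc, and the column-major layout are assumptions read off the register usage in the diff):

    #include <stddef.h>

    /* Scalar sketch of the optimized neon_3_1 kernel: C += A * B for
     * M=16, N=6, K=64, column-major, leading dimensions lda/ldb/ldc.
     * acc[][] plays the role of v5..v28: C is loaded once, updated for
     * all k, and written back once, instead of being re-read and
     * re-written in every K iteration. Names are hypothetical. */
    void matmul_16_6_64_ref(const float *a, const float *b, float *c,
                            size_t lda, size_t ldb, size_t ldc)
    {
        float acc[6][16];

        /* hoisted loads: the six ld1 {...}, [x2], x5 before matmul_loop_over_K */
        for (size_t n = 0; n < 6; ++n)
            for (size_t m = 0; m < 16; ++m)
                acc[n][m] = c[n * ldc + m];

        for (size_t k = 0; k < 64; ++k) {          /* matmul_loop_over_K, x9 = 64..1 */
            for (size_t n = 0; n < 6; ++n) {
                float b_kn = b[n * ldb + k];       /* ldr s4, [x1]; add x1, x1, x4 */
                for (size_t m = 0; m < 16; ++m)    /* the four fmla per column of c */
                    acc[n][m] += a[k * lda + m] * b_kn;
            }
        }

        /* hoisted stores: the six st1 {...}, [x2], x5 after the loop */
        for (size_t n = 0; n < 6; ++n)
            for (size_t m = 0; m < 16; ++m)
                c[n * ldc + m] = acc[n][m];
    }
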

4 files changed: +481 −195 lines


docs_sphinx/submissions/report_25_05_01.rst

Lines changed: 183 additions & 140 deletions
@@ -221,171 +221,214 @@ Loops
 :linenos:

 ...
-matmul_16_6_64:
-...
-mov x6, x1 // Store the initial value of x1, to be restored in the next loop iteration
-mov x7, x2 // Store the initial value of x2, to be restored in the next loop iteration
+// Offset the used leading dimension by the size of floats
+lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
+lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
+lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)

-mov x9, #64 // x9 iterator for K loop
-matmul_loop_over_K:
-sub x9, x9, #1
+mov x6, x1 // Store the initial value of x1, to be restored in the next loop iteration
+mov x7, x2 // Store the initial value of x2, to be restored after the loop

-... logic of matmul_16_6_1 - neon_2_unrolled.s ...
+// Load first column from the 16x6 matrix c
+ld1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x2], x5
+// Load second column from the 16x6 matrix c
+ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x2], x5
+// Load third column from the 16x6 matrix c
+ld1 {v21.4s, v22.4s, v23.4s, v24.4s}, [x2], x5
+// Load fourth column from the 16x6 matrix c
+ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [x2], x5
+// Load fifth column from the 16x6 matrix c
+ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x2], x5
+// Load sixth column from the 16x6 matrix c
+ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [x2], x5
+
+mov x9, #64 // x9 iterator for K loop
+matmul_loop_over_K:
+sub x9, x9, #1
+
+// Load first column data from the 16x1 matrix a
+ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x3
+
+// run the known matmul_16_6_1_unrolled kernel
+// Load first element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-// offset x6 to the next element in the column
-add x6, x6, #4 // #4 = sizeof(float)
+// Calculate first column of c
+fmla v25.4s, v0.4s, v4.s[0]
+fmla v26.4s, v1.4s, v4.s[0]
+fmla v27.4s, v2.4s, v4.s[0]
+fmla v28.4s, v3.4s, v4.s[0]

-// Restore x1 and x2 to be incremented again
-mov x1, x6
-mov x2, x7

-// Loop back
-cbnz x9, matmul_loop_over_K
+// Load second element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-ret
-...
+// Calculate second column of c
+fmla v17.4s, v0.4s, v4.s[0]
+fmla v18.4s, v1.4s, v4.s[0]
+fmla v19.4s, v2.4s, v4.s[0]
+fmla v20.4s, v3.4s, v4.s[0]

+
+// Load third element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-2. Loop over M: Implement a kernel that computes C+=AB for M=64, N=6 and K=64.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-- File ``submissions/submission_25_05_01/neon_3_2.s``
+// Calculated third column of c
+fmla v21.4s, v0.4s, v4.s[0]
+fmla v22.4s, v1.4s, v4.s[0]
+fmla v23.4s, v2.4s, v4.s[0]
+fmla v24.4s, v3.4s, v4.s[0]

-.. code-block:: asm
-:linenos:

-...
-matmul_64_6_64:
-...
+// Load fourth element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
-mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
+// Calculate fourth column of c
+fmla v5.4s, v0.4s, v4.s[0]
+fmla v6.4s, v1.4s, v4.s[0]
+fmla v7.4s, v2.4s, v4.s[0]
+fmla v8.4s, v3.4s, v4.s[0]

-mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
-mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration

-mov x16, #4 // x16 iterator for M loop
-matmul_loop_over_M:
-sub x16, x16, #1
+// Load fifth element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-mov x15, #64 // x15 iterator for K loop
-matmul_loop_over_K:
-sub x15, x15, #1
+// Calculate fifth column of c
+fmla v9.4s, v0.4s, v4.s[0]
+fmla v10.4s, v1.4s, v4.s[0]
+fmla v11.4s, v2.4s, v4.s[0]
+fmla v12.4s, v3.4s, v4.s[0]

-... logic of matmul_16_6_1 - neon_2_unrolled.s ...
+
+// Load sixth element from the 1x6 matrix b
+ldr s4, [x1]
+add x1, x1, x4

-// offset x6 to the next element in the column
-add x6, x6, #4 // #4 = sizeof(float)
+// Calculated sixth column of c
+fmla v13.4s, v0.4s, v4.s[0]
+fmla v14.4s, v1.4s, v4.s[0]
+fmla v15.4s, v2.4s, v4.s[0]
+fmla v16.4s, v3.4s, v4.s[0]

-// Restore x1 and x2 to be incremented again
-mov x1, x6
-mov x2, x7

-// Loop back to K
-cbnz x15, matmul_loop_over_K
+// offset x6 to the next element in the column
+add x6, x6, #4 // #4 = sizeof(float)

-// next M iteration on the matrix c and matrix a, both need offset about 16 values
-// also matrix b needs to start at the initial location again
-// Updates for the matrix c
-add x7, x7, #16*4 // column height * sizeof(float)
-mov x2, x7 // also apply offset to x2
+// Restore x1 to be incremented again
+mov x1, x6

-// Updates for the matrix a
-add x8, x8, #16*4 // column height * sizeof(float)
-mov x0, x8 // also apply offset to x0
+// Loop back
+cbnz x9, matmul_loop_over_K

-// Updates for the matrix b
-mov x6, x9 // Update the restore register for x1 for the K loop
-mov x1, x9 // Update the x1 register itself
+// Restore initial value of x2 that was changed by the loads
+mov x2, x7

-// Loop back to M
-cbnz x16, matmul_loop_over_M
+// Store first column back to memory
+st1 {v25.4s, v26.4s, v27.4s, v28.4s}, [x2], x5
+// Store second column back to memory
+st1 {v17.4s, v18.4s, v19.4s, v20.4s}, [x2], x5
+// Store third column back to memory
+st1 {v21.4s, v22.4s, v23.4s, v24.4s}, [x2], x5
+// Store fourth column back to memory
+st1 {v5.4s, v6.4s, v7.4s, v8.4s}, [x2], x5
+// Store fifth column back to memory
+st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x2], x5
+// Store sixth column back to memory
+st1 {v13.4s, v14.4s, v15.4s, v16.4s}, [x2], x5

-ret
-...

-3. Loop over N: Implement a kernel that computes C+=AB for M=64, N=48 and K=64.
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-- File ``submissions/submission_25_05_01/neon_3_3.s``
+2. Loop over M: Implement a kernel that computes C+=AB for M=64, N=6 and K=64.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- File ``submissions/submission_25_05_01/neon_3_2.s``

 .. code-block:: asm
 :linenos:

-...
-matmul_64_48_64:
-...
+// Offset the used leading dimension by the size of floats
+lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
+lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
+lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)
+
+mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
+mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
+
+mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
+mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
+
+mov x16, #4 // x16 iterator for M loop
+matmul_loop_over_M:
+sub x16, x16, #1
+
+... <logic of loop over K - neon_3_1>

-mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
-mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration
+// next M iteration on the matrix c and matrix a, both need offset about 16 values
+// also matrix b needs to start at the initial location again
+// Updates for the matrix c
+add x7, x7, #16*4 // column height * sizeof(float)
+mov x2, x7 // also apply offset to x2

-mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
-mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration
+// Updates for the matrix a
+add x8, x8, #16*4 // column height * sizeof(float)
+mov x0, x8 // also apply offset to x0

-mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
-mov x11, x2 // Store the initial value of x2, to bes restored in the N loop iteration
-mov x12, #6 // hold the size of N that are processed in one loop, needed for offset calculation
+// Updates for the matrix b
+mov x6, x9 // Update the restore register for x1 for the K loop
+mov x1, x9 // Update the x1 register itself

-mov x17, #8 // x17 iterator for N loop
-matmul_loop_over_N:
-sub x17, x17, #1
+// Loop back to M
+cbnz x16, matmul_loop_over_M

-mov x16, #4 // x16 iterator for M loop
-matmul_loop_over_M:
-sub x16, x16, #1
+3. Loop over N: Implement a kernel that computes C+=AB for M=64, N=48 and K=64.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+- File ``submissions/submission_25_05_01/neon_3_3.s``

-mov x15, #64 // x15 iterator for K loop
-matmul_loop_over_K:
-sub x15, x15, #1
+.. code-block:: asm
+:linenos:
+
+// Offset the used leading dimension by the size of floats
+lsl x3, x3, #2 // x3 * 4 = x3 * sizeof(float)
+lsl x4, x4, #2 // x4 * 4 = x4 * sizeof(float)
+lsl x5, x5, #2 // x5 * 4 = x5 * sizeof(float)

-... logic of matmul_16_6_1 - neon_2_unrolled.s ...
+mov x6, x1 // Store the initial value of x1, to be restored in the K loop iteration
+mov x7, x2 // Store the initial value of x2, to be restored in the K loop iteration

-// offset x6 to the next element in the column
-add x6, x6, #4 // #4 = sizeof(float)
+mov x8, x0 // Store the initial value of x0, to be restored in the M loop iteration
+mov x9, x1 // Store the initial value of x1, to be restored in the M loop iteration

-// Restore x1 and x2 to be incremented again
-mov x1, x6
-mov x2, x7
+mov x10, x0 // Store the initial value of x0, to be restored in the N loop iteration
+mov x11, x2 // Store the initial value of x2, to bes restored in the N loop iteration
+mov x12, #6 // hold the size of N that are processed in one loop, needed for offset calculation

-// Loop back to K
-cbnz x15, matmul_loop_over_K
+mov x17, #8 // x17 iterator for N loop
+matmul_loop_over_N:
+sub x17, x17, #1

-// next M iteration on the matrix c and matrix a, both need offset about 16 values
-// also matrix b needs to start at the initial location again
-// Updates for the matrix a
-add x8, x8, #16*4 // column height * sizeof(float)
-mov x0, x8 // also apply offset to x0
+... <logic of loop over M - neon_3_2>

-// Updates for the matrix c
-add x7, x7, #16*4 // column height * sizeof(float)
-mov x2, x7 // also apply offset to x2
+// next M iteration on the matrix b and matrix c, both need offset about 6*ldb/ldc values
+// also matrix a needs to start at the initial location again
+// Update for the matrix a
+mov x8, x10 // Update the restore register for x0 for the M loop
+mov x0, x10 // Update the x0 register itself

-// Updates for the matrix b
-mov x6, x9 // Update the restore register for x1 for the K loop
-mov x1, x9 // Update the x1 register itself
+// Updates for the matrix b
+madd x9, x4, x12, x9 // ldb * 6 + initial position
+mov x6, x9 // Update the restore register of x1 for the K loop
+mov x1, x9 // Update the x1 register itself

-// Loop back to M
-cbnz x16, matmul_loop_over_M
-
-// next M iteration on the matrix b and matrix c, both need offset about 6*ldb/ldc values
-// also matrix a needs to start at the initial location again
-// Update for the matrix a
-mov x8, x10 // Update the restore register for x0 for the M loop
-mov x0, x10 // Update the x0 register itself
-
-// Updates for the matrix b
-madd x9, x4, x12, x9 // ldb * 6 + initial position
-mov x6, x9 // Update the restore register of x1 for the K loop
-mov x1, x9 // Update the x1 register itself
-
-// Updates for the matrix c
-madd x11, x5, x12, x11 // ldc * 6 + initial position
-mov x7, x11 // Update the restore register of x2 for the K loop
-mov x2, x11 // Update the x2 register itself
-
-// Loop back to N
-cbnz x17, matmul_loop_over_N
-
-ret
-...
+// Updates for the matrix c
+madd x11, x5, x12, x11 // ldc * 6 + initial position
+mov x7, x11 // Update the restore register of x2 for the K loop
+mov x2, x11 // Update the x2 register itself
+
+// Loop back to N
+cbnz x17, matmul_loop_over_N

 4. Test and optimize the kernels. Report your performance in GFLOPS.
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
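
For orientation between the two hunks: neon_3_2 wraps the loop-over-K kernel above in an M loop (x16 counts 4 blocks of 16 rows; A and C are advanced by 16*4 bytes per step), and neon_3_3 wraps that in an N loop (x17 counts 8 panels of 6 columns; B and C are advanced by ldb*6 / ldc*6 per step). A rough C sketch of the resulting blocking, reusing the hypothetical matmul_16_6_64_ref from the note above (an illustration only, not the submitted assembly):

    #include <stddef.h>

    /* micro-kernel from the sketch after the commit header (hypothetical name) */
    void matmul_16_6_64_ref(const float *a, const float *b, float *c,
                            size_t lda, size_t ldb, size_t ldc);

    /* Sketch of neon_3_3 (matmul_64_48_64): 8 panels of 6 columns (N loop, x17)
     * times 4 blocks of 16 rows (M loop, x16), each block handled by the
     * 16x6x64 micro-kernel. neon_3_2 (matmul_64_6_64) is the inner M loop alone. */
    void matmul_64_48_64_ref(const float *a, const float *b, float *c,
                             size_t lda, size_t ldb, size_t ldc)
    {
        for (size_t n = 0; n < 8; ++n)        /* N loop: advance B and C by 6 columns */
            for (size_t m = 0; m < 4; ++m)    /* M loop: advance A and C by 16 rows   */
                matmul_16_6_64_ref(a + m * 16,
                                   b + n * 6 * ldb,
                                   c + m * 16 + n * 6 * ldc,
                                   lda, ldb, ldc);
    }
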
@@ -412,20 +455,20 @@ We run the benchmark with the following command:
 ----------------------------------------------------------------------------------------------------------------------------------
 Benchmark Time CPU Iterations FLOPS
 ----------------------------------------------------------------------------------------------------------------------------------
-GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_mean 396 ns 396 ns 10 31.0266G/s
-GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_median 396 ns 396 ns 10 31.0281G/s
-GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_stddev 0.069 ns 0.057 ns 10 4.50274M/s
-GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_cv 0.02 % 0.01 % 10 0.01%
-GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_mean 1728 ns 1728 ns 10 28.4438G/s
-GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_median 1728 ns 1728 ns 10 28.4445G/s
-GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_stddev 0.115 ns 0.106 ns 10 1.7484M/s
-GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_cv 0.01 % 0.01 % 10 0.01%
-GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_mean 13078 ns 13077 ns 10 22.5524G/s
-GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_median 13078 ns 13077 ns 10 22.552G/s
-GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_stddev 1.83 ns 1.60 ns 10 2.76464M/s
-GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_cv 0.01 % 0.01 % 10 0.01%
-
-
-- Mean FLOPS for loop over K: **31.0 GFLOPS**.
-- Mean FLOPS for loop over M: **28.4 GFLOPS**.
-- Mean FLOPS for loop over N: **22.6 GFLOPS**.
+GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_mean 97.8 ns 97.4 ns 10 126.12G/s
+GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_median 97.7 ns 97.3 ns 10 126.245G/s
+GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_stddev 0.581 ns 0.563 ns 10 720.109M/s
+GemmMxNxKFixture<16, 6, 64>/BM_matmul_16_6_64/min_warmup_time:1.000_cv 0.59 % 0.58 % 10 0.57%
+GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_mean 386 ns 385 ns 10 127.812G/s
+GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_median 385 ns 384 ns 10 127.95G/s
+GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_stddev 2.16 ns 2.11 ns 10 693.069M/s
+GemmMxNxKFixture<64, 6, 64>/BM_matmul_64_6_64/min_warmup_time:1.000_cv 0.56 % 0.55 % 10 0.54%
+GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_mean 3103 ns 3092 ns 10 95.3736G/s
+GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_median 3097 ns 3087 ns 10 95.5363G/s
+GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_stddev 16.0 ns 15.6 ns 10 475.851M/s
+GemmMxNxKFixture<64, 48, 64>/BM_matmul_64_48_64/min_warmup_time:1.000_cv 0.52 % 0.50 % 10 0.50%
+
+
+- Mean FLOPS for loop over K: **126.1 GFLOPS**.
+- Mean FLOPS for loop over M: **127.8 GFLOPS**.
+- Mean FLOPS for loop over N: **95.4 GFLOPS**.
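
As a quick sanity check on the updated numbers: for the 16x6x64 and 64x6x64 fixtures the reported rates match a 2·M·N·K operation count divided by the measured CPU time, e.g. (in LaTeX notation)

    2 \cdot 16 \cdot 6 \cdot 64 = 12\,288\ \text{FLOPs}, \qquad \frac{12\,288\ \text{FLOPs}}{97.4\ \text{ns}} \approx 126.2\ \text{GFLOPS}
    2 \cdot 64 \cdot 6 \cdot 64 = 49\,152\ \text{FLOPs}, \qquad \frac{49\,152\ \text{FLOPs}}{385\ \text{ns}} \approx 127.7\ \text{GFLOPS}

which is in line with the 126.1 G/s and 127.8 G/s means in the table, and roughly a 4x improvement over the 31.0 / 28.4 / 22.6 GFLOPS reported before this commit.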
