Skip to content

Commit eb10c12

Browse files
DavidMansell authored and gunes-arm committed
fix: a64_hgemm_8x24: Fix over-eager read ahead of operands.
Resolves: COMPMID-8303
Signed-off-by: David Mansell <[email protected]>
Change-Id: I15e1e0789e8bb456683701a834e3c727a04220ca
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/14540
Tested-by: Arm Jenkins <[email protected]>
Reviewed-by: Gunes Bayir <[email protected]>
Comments-Addressed: Arm Jenkins <[email protected]>
Benchmark: Arm Jenkins <[email protected]>
1 parent 96d6de4 commit eb10c12

File tree

1 file changed

9 additions and 9 deletions (+9 -9 lines changed)
  • src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24

1 file changed

9 additions and 9 deletions (+9 -9 lines changed)

src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2017-2018, 2024 Arm Limited.
2+
* Copyright (c) 2017-2018, 2024-2025 Arm Limited.
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -76,9 +76,7 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
7676
"movi v11.8h, #0x0\n"
7777
"ldr %q[b2], [%[b_ptr], #32]\n"
7878
"movi v12.8h, #0x0\n"
79-
"ldr %q[b0a], [%[b_ptr], #48]\n"
8079
"movi v13.8h, #0x0\n"
81-
"ldr %q[b1a], [%[b_ptr], #64]\n"
8280
"movi v14.8h, #0x0\n"
8381
ASM_PREFETCH("[%[b_ptr], #64]")
8482
"movi v15.8h, #0x0\n"
@@ -109,25 +107,27 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
109107

110108
"1:\n"
111109
"fmla v8.8h , %[b0].8h, %[a0].h[0]\n"
112-
"fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
110+
"fmla v9.8h , %[b0].8h, %[a0].h[1]\n"
113111
"ldr %q[a0a], [%[a_ptr], #16]\n"
114112
"fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
115113
"fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
116-
"ldr %q[b2a], [%[b_ptr], #80]\n"
114+
"ldr %q[b0a], [%[b_ptr], #48]\n"
117115
"fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
118116
"fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
117+
"ldr %q[b1a], [%[b_ptr], #64]\n"
119118
"fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
120119
"fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
121120
"ldr %q[b0], [%[b_ptr], #96]\n"
122121

123122
"fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
124123
"fmla v17.8h, %[b1].8h, %[a0].h[1]\n"
125-
ASM_PREFETCH("[%[a_ptr], #128]")
124+
"ldr %q[b2a], [%[b_ptr], #80]\n"
126125
"fmla v18.8h, %[b1].8h, %[a0].h[2]\n"
127126
"fmla v19.8h, %[b1].8h, %[a0].h[3]\n"
128127
"add %[b_ptr], %[b_ptr], #96\n"
129128
"fmla v20.8h, %[b1].8h, %[a0].h[4]\n"
130129
"fmla v21.8h, %[b1].8h, %[a0].h[5]\n"
130+
ASM_PREFETCH("[%[a_ptr], #128]")
131131
"fmla v22.8h, %[b1].8h, %[a0].h[6]\n"
132132
"fmla v23.8h, %[b1].8h, %[a0].h[7]\n"
133133
"ldr %q[b1], [%[b_ptr], #16]\n"
@@ -152,7 +152,6 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
152152
"fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n"
153153
"fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n"
154154
"fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n"
155-
"ldr %q[b0a], [%[b_ptr], #48]\n"
156155

157156
"fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n"
158157
"fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n"
@@ -163,7 +162,6 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
163162
"fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n"
164163
"fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n"
165164
"fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n"
166-
"ldr %q[b1a], [%[b_ptr], #64]\n"
167165

168166
"fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n"
169167
"fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n"
@@ -188,11 +186,13 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
188186
"ldr %q[a0a], [%[a_ptr], #16]\n"
189187
"fmla v10.8h, %[b0].8h, %[a0].h[2]\n"
190188
"fmla v11.8h, %[b0].8h, %[a0].h[3]\n"
191-
"ldr %q[b2a], [%[b_ptr], #80]\n"
189+
"ldr %q[b0a], [%[b_ptr], #48]\n"
192190
"fmla v12.8h, %[b0].8h, %[a0].h[4]\n"
193191
"fmla v13.8h, %[b0].8h, %[a0].h[5]\n"
192+
"ldr %q[b1a], [%[b_ptr], #64]\n"
194193
"fmla v14.8h, %[b0].8h, %[a0].h[6]\n"
195194
"fmla v15.8h, %[b0].8h, %[a0].h[7]\n"
195+
"ldr %q[b2a], [%[b_ptr], #80]\n"
196196

197197
"fmla v16.8h, %[b1].8h, %[a0].h[0]\n"
198198
"fmla v17.8h, %[b1].8h, %[a0].h[1]\n"

0 commit comments

Comments
 (0)