11/*
2- * Copyright (c) 2017-2018, 2024 Arm Limited.
2+ * Copyright (c) 2017-2018, 2024-2025 Arm Limited.
33 *
44 * SPDX-License-Identifier: MIT
55 *
@@ -76,9 +76,7 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
7676 " movi v11.8h, #0x0\n "
7777 " ldr %q[b2], [%[b_ptr], #32]\n "
7878 " movi v12.8h, #0x0\n "
79- " ldr %q[b0a], [%[b_ptr], #48]\n "
8079 " movi v13.8h, #0x0\n "
81- " ldr %q[b1a], [%[b_ptr], #64]\n "
8280 " movi v14.8h, #0x0\n "
8381 ASM_PREFETCH (" [%[b_ptr], #64]" )
8482 " movi v15.8h, #0x0\n "
@@ -109,25 +107,27 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
109107
110108 " 1:\n "
111109 " fmla v8.8h , %[b0].8h, %[a0].h[0]\n "
112- " fmla v9.8h , %[b0].8h, %[a0].h[1]\n "
110+ " fmla v9.8h , %[b0].8h, %[a0].h[1]\n "
113111 " ldr %q[a0a], [%[a_ptr], #16]\n "
114112 " fmla v10.8h, %[b0].8h, %[a0].h[2]\n "
115113 " fmla v11.8h, %[b0].8h, %[a0].h[3]\n "
116- " ldr %q[b2a ], [%[b_ptr], #80 ]\n "
114+ " ldr %q[b0a ], [%[b_ptr], #48 ]\n "
117115 " fmla v12.8h, %[b0].8h, %[a0].h[4]\n "
118116 " fmla v13.8h, %[b0].8h, %[a0].h[5]\n "
117+ " ldr %q[b1a], [%[b_ptr], #64]\n "
119118 " fmla v14.8h, %[b0].8h, %[a0].h[6]\n "
120119 " fmla v15.8h, %[b0].8h, %[a0].h[7]\n "
121120 " ldr %q[b0], [%[b_ptr], #96]\n "
122121
123122 " fmla v16.8h, %[b1].8h, %[a0].h[0]\n "
124123 " fmla v17.8h, %[b1].8h, %[a0].h[1]\n "
125- ASM_PREFETCH ( " [%[a_ptr ], #128] " )
124+ " ldr %q[b2a], [%[b_ptr ], #80] \n "
126125 " fmla v18.8h, %[b1].8h, %[a0].h[2]\n "
127126 " fmla v19.8h, %[b1].8h, %[a0].h[3]\n "
128127 " add %[b_ptr], %[b_ptr], #96\n "
129128 " fmla v20.8h, %[b1].8h, %[a0].h[4]\n "
130129 " fmla v21.8h, %[b1].8h, %[a0].h[5]\n "
130+ ASM_PREFETCH (" [%[a_ptr], #128]" )
131131 " fmla v22.8h, %[b1].8h, %[a0].h[6]\n "
132132 " fmla v23.8h, %[b1].8h, %[a0].h[7]\n "
133133 " ldr %q[b1], [%[b_ptr], #16]\n "
@@ -152,7 +152,6 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
152152 " fmla v13.8h, %[b0a].8h, %[a0a].h[5]\n "
153153 " fmla v14.8h, %[b0a].8h, %[a0a].h[6]\n "
154154 " fmla v15.8h, %[b0a].8h, %[a0a].h[7]\n "
155- " ldr %q[b0a], [%[b_ptr], #48]\n "
156155
157156 " fmla v16.8h, %[b1a].8h, %[a0a].h[0]\n "
158157 " fmla v17.8h, %[b1a].8h, %[a0a].h[1]\n "
@@ -163,7 +162,6 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
163162 " fmla v21.8h, %[b1a].8h, %[a0a].h[5]\n "
164163 " fmla v22.8h, %[b1a].8h, %[a0a].h[6]\n "
165164 " fmla v23.8h, %[b1a].8h, %[a0a].h[7]\n "
166- " ldr %q[b1a], [%[b_ptr], #64]\n "
167165
168166 " fmla v24.8h, %[b2a].8h, %[a0a].h[0]\n "
169167 " fmla v25.8h, %[b2a].8h, %[a0a].h[1]\n "
@@ -188,11 +186,13 @@ void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cp
188186 " ldr %q[a0a], [%[a_ptr], #16]\n "
189187 " fmla v10.8h, %[b0].8h, %[a0].h[2]\n "
190188 " fmla v11.8h, %[b0].8h, %[a0].h[3]\n "
191- " ldr %q[b2a ], [%[b_ptr], #80 ]\n "
189+ " ldr %q[b0a ], [%[b_ptr], #48 ]\n "
192190 " fmla v12.8h, %[b0].8h, %[a0].h[4]\n "
193191 " fmla v13.8h, %[b0].8h, %[a0].h[5]\n "
192+ " ldr %q[b1a], [%[b_ptr], #64]\n "
194193 " fmla v14.8h, %[b0].8h, %[a0].h[6]\n "
195194 " fmla v15.8h, %[b0].8h, %[a0].h[7]\n "
195+ " ldr %q[b2a], [%[b_ptr], #80]\n "
196196
197197 " fmla v16.8h, %[b1].8h, %[a0].h[0]\n "
198198 " fmla v17.8h, %[b1].8h, %[a0].h[1]\n "
0 commit comments