Skip to content

Commit 10d5f3c

Browse files
authored
Merge pull request #2602 from ashwinyes/thunderx2_develop
DAXPY Optimizations for ThunderX2
2 parents ec2dd7b + 8353cb2 commit 10d5f3c

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

kernel/arm64/daxpy_thunderx2t99.S

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
9898
add X, X, #128
9999
.endm
100100

101+
/*
102+
* No need to do software prefetches if the vector fits
103+
* into L1 cache
104+
*/
105+
/*
 * KERNEL_F16_L1CACHE: one unrolled step covering 128 bytes of X and
 * 128 bytes of Y (sixteen 64-bit lanes, going by the .2d arrangement).
 * Each Y lane is updated as y += x * v0.d[0] and written back in place,
 * then X and Y are both advanced by 128. The macro contains no prefetch
 * instructions.
 *
 * Loads for the next 32-byte chunk are issued before the fmla/stp of the
 * current chunk, so the instruction order here is intentional.
 *
 * Writes: q4-q7, q16-q27, X, Y.
 * NOTE(review): X and Y are defined outside this hunk -- presumably
 * register aliases for the two vector pointers; v0.d[0] is presumably
 * the DAXPY scalar. Confirm against the rest of the file.
 */
.macro KERNEL_F16_L1CACHE
106+
ldp q4, q5, [X] /* X bytes 0..31 */
107+
ldp q16, q17, [Y] /* Y bytes 0..31 */
108+
109+
ldp q6, q7, [X, #32] /* X bytes 32..63, loaded ahead of use */
110+
ldp q18, q19, [Y, #32] /* Y bytes 32..63, loaded ahead of use */
111+
112+
fmla v16.2d, v4.2d, v0.d[0] /* chunk 0: y += x * v0.d[0] */
113+
fmla v17.2d, v5.2d, v0.d[0]
114+
115+
stp q16, q17, [Y] /* write back Y bytes 0..31 */
116+
117+
ldp q20, q21, [X, #64] /* X bytes 64..95 */
118+
ldp q24, q25, [Y, #64] /* Y bytes 64..95 */
119+
120+
fmla v18.2d, v6.2d, v0.d[0] /* chunk 1: y += x * v0.d[0] */
121+
fmla v19.2d, v7.2d, v0.d[0]
122+
123+
stp q18, q19, [Y, #32] /* write back Y bytes 32..63 */
124+
125+
ldp q22, q23, [X, #96] /* X bytes 96..127 */
126+
ldp q26, q27, [Y, #96] /* Y bytes 96..127 */
127+
128+
fmla v24.2d, v20.2d, v0.d[0] /* chunk 2: y += x * v0.d[0] */
129+
fmla v25.2d, v21.2d, v0.d[0]
130+
131+
stp q24, q25, [Y, #64] /* write back Y bytes 64..95 */
132+
133+
fmla v26.2d, v22.2d, v0.d[0] /* chunk 3: y += x * v0.d[0] */
134+
fmla v27.2d, v23.2d, v0.d[0]
135+
136+
stp q26, q27, [Y, #96] /* write back Y bytes 96..127 */
137+
138+
add Y, Y, #128 /* advance both pointers past the 128 bytes just processed */
139+
add X, X, #128
140+
.endm
141+
101142
/*
 * KERNEL_F32: expands KERNEL_F16 twice in sequence.
 * NOTE(review): KERNEL_F16 is defined outside the visible hunk; its
 * per-expansion footprint and register usage should be checked there.
 */
.macro KERNEL_F32
102143
KERNEL_F16
103144
KERNEL_F16
104145
.endm
105146

147+
148+
/*
 * KERNEL_F32_L1CACHE: expands KERNEL_F16_L1CACHE twice in sequence.
 * Since each KERNEL_F16_L1CACHE expansion processes 128 bytes of X and
 * of Y and advances both pointers by 128, one expansion of this macro
 * processes 256 bytes of each and leaves X and Y advanced by 256.
 */
.macro KERNEL_F32_L1CACHE
149+
KERNEL_F16_L1CACHE /* first 128 bytes of X and Y */
150+
KERNEL_F16_L1CACHE /* next 128 bytes, via the already-advanced X and Y */
151+
.endm
152+
106153
.macro INIT_S
107154
lsl INC_X, INC_X, #3
108155
lsl INC_Y, INC_Y, #3
@@ -138,13 +185,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
138185
cmp I, xzr
139186
beq .Ldaxpy_kernel_F1
140187

188+
cmp N, #2048
189+
ble .Ldaxpy_kernel_F32_L1CACHE
190+
141191
.align 5
142192
.Ldaxpy_kernel_F32:
143193

144194
KERNEL_F32
145195

146196
subs I, I, #1
147197
bne .Ldaxpy_kernel_F32
198+
b .Ldaxpy_kernel_F1
199+
200+
.align 5
201+
.Ldaxpy_kernel_F32_L1CACHE:
202+
203+
KERNEL_F32_L1CACHE
204+
205+
subs I, I, #1
206+
bne .Ldaxpy_kernel_F32_L1CACHE
148207

149208
.Ldaxpy_kernel_F1:
150209

0 commit comments

Comments
 (0)