Skip to content

Commit e33bcdb

Browse files
authored
Merge pull request #3115 from martin-frbg/issue2532
Replace unoptimized OMATCOPY_RT with 4x4 blocked version
2 parents 1caa44b + 292d1af commit e33bcdb

File tree

3 files changed

+573
-25
lines changed

3 files changed

+573
-25
lines changed

kernel/arm/omatcopy_rt.c

Lines changed: 197 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/***************************************************************************
2-
Copyright (c) 2013, The OpenBLAS Project
2+
Copyright (c) 2021, The OpenBLAS Project
33
All rights reserved.
44
Redistribution and use in source and binary forms, with or without
55
modification, are permitted provided that the following conditions are
@@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727

2828
#include "common.h"
2929

30-
/*****************************************************
31-
* 2014/06/09 Saar
32-
*
33-
* Order rowMajor
34-
* Trans
35-
*
36-
******************************************************/
37-
3830
/*
 * Store the scaled transpose of one small tile of the source matrix:
 *
 *     dst[c * ldb + r] = src[r * lda + c] * alpha
 *
 * for 0 <= r < tile_rows and 0 <= c < tile_cols. Elements are visited one
 * source row at a time, left to right within the row. Callers in this file
 * only pass tile dimensions of 4, 2 or 1.
 */
static void omatcopy_rt_tile(FLOAT *src, BLASLONG lda, FLOAT *dst, BLASLONG ldb,
                             BLASLONG tile_rows, BLASLONG tile_cols, FLOAT alpha)
{
    BLASLONG r, c;

    for (r = 0; r < tile_rows; r++) {
        for (c = 0; c < tile_cols; c++) {
            dst[c * ldb + r] = src[r * lda + c] * alpha;
        }
    }
}

/*
 * Transpose-and-scale a horizontal panel of panel_rows source rows spanning
 * all cols source columns. The columns are consumed in tiles of four, then
 * an optional tile of two, then an optional single column, so every column
 * count is covered.
 */
static void omatcopy_rt_panel(FLOAT *src, BLASLONG lda, FLOAT *dst, BLASLONG ldb,
                              BLASLONG panel_rows, BLASLONG cols, FLOAT alpha)
{
    BLASLONG quads;

    for (quads = cols >> 2; quads > 0; quads--) {
        omatcopy_rt_tile(src, lda, dst, ldb, panel_rows, 4, alpha);
        src += 4;          /* next four source columns ...              */
        dst += 4 * ldb;    /* ... become the next four destination rows */
    }

    if (cols & 2) {
        omatcopy_rt_tile(src, lda, dst, ldb, panel_rows, 2, alpha);
        src += 2;
        dst += 2 * ldb;
    }

    if (cols & 1) {
        omatcopy_rt_tile(src, lda, dst, ldb, panel_rows, 1, alpha);
    }
}

/*
 * Out-of-place scaled transpose, processed in 4x4 blocks:
 *
 *     b[j * ldb + i] = alpha * a[i * lda + j]
 *
 * for 0 <= i < rows and 0 <= j < cols.
 *
 * rows, cols - dimensions of the source matrix a
 * alpha      - scale factor applied to every element
 * a, lda     - source matrix and the stride between its consecutive rows
 * b, ldb     - destination matrix and the stride between its consecutive rows
 *
 * Returns 0 in all cases; nothing is written when rows or cols is not
 * positive.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    FLOAT *src = a;    /* first row of the source panel being processed */
    FLOAT *dst = b;    /* destination column that receives that row     */
    BLASLONG quads;

    if (rows <= 0 || cols <= 0) {
        return 0;
    }

    /* Full panels of four source rows. */
    for (quads = rows >> 2; quads > 0; quads--) {
        omatcopy_rt_panel(src, lda, dst, ldb, 4, cols, alpha);
        src += 4 * lda;
        dst += 4;
    }

    /* Remaining rows: at most one panel of two followed by one single row. */
    if (rows & 2) {
        omatcopy_rt_panel(src, lda, dst, ldb, 2, cols, alpha);
        src += 2 * lda;
        dst += 2;
    }

    if (rows & 1) {
        omatcopy_rt_panel(src, lda, dst, ldb, 1, cols, alpha);
    }

    return 0;
}
62234

kernel/x86_64/KERNEL

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,3 +489,6 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S
489489

490490
SSUMKERNEL = ../arm/sum.c
491491
DSUMKERNEL = ../arm/sum.c
492+
493+
SOMATCOPY_RT = omatcopy_rt.c
494+
DOMATCOPY_RT = omatcopy_rt.c

0 commit comments

Comments
 (0)