Skip to content

Commit b1eed27

Browse files
authored
Replace naive omatcopy_rt with 4x4 blocked implementation
as suggested by MigMuc in issue 2532
1 parent 86a5f98 commit b1eed27

File tree

1 file changed

+198
-26
lines changed

1 file changed

+198
-26
lines changed

kernel/arm/omatcopy_rt.c

Lines changed: 198 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/***************************************************************************
2-
Copyright (c) 2013, The OpenBLAS Project
2+
Copyright (c) 2021, The OpenBLAS Project
33
All rights reserved.
44
Redistribution and use in source and binary forms, with or without
55
modification, are permitted provided that the following conditions are
@@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2727

2828
#include "common.h"
2929

30-
/*****************************************************
31-
* 2014/06/09 Saar
32-
*
33-
* Order rowMajor
34-
* Trans
35-
*
36-
******************************************************/
37-
3830
/*
 * Out-of-place scaled transpose:
 *
 *     b[j * ldb + i] = a[i * lda + j] * alpha     (0 <= i < rows, 0 <= j < cols)
 *
 * rows, cols : extent of the source matrix `a`; if either is <= 0 nothing
 *              is written.
 * alpha      : scale factor applied to every element as it is copied.
 * a, lda     : source matrix and the element distance between consecutive
 *              source rows.
 * b, ldb     : destination matrix and the element distance between
 *              consecutive destination rows.
 *
 * Always returns 0.
 *
 * The source is walked in tiles of 4 rows x 4 columns.  Leftover rows are
 * handled by a 2-row pass followed by a 1-row pass, and inside every pass
 * leftover columns are handled by a 2-column step followed by a 1-column
 * step.  src0..src3 point at the current source rows; dst0..dst3 point at
 * the destination rows that receive source columns 0..3 of the tile.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
    BLASLONG row_tiles, col_tiles;
    FLOAT *src, *src0, *src1, *src2, *src3;
    FLOAT *dst, *dst0, *dst1, *dst2, *dst3;

    if (rows <= 0 || cols <= 0) {
        return 0;
    }

    src = a;
    dst = b;

    /* ---- Groups of four source rows ---------------------------------- */
    for (row_tiles = rows >> 2; row_tiles > 0; row_tiles--) {
        src0 = src;
        src1 = src0 + lda;
        src2 = src1 + lda;
        src3 = src2 + lda;
        src += 4 * lda;

        dst0 = dst;
        dst1 = dst0 + ldb;
        dst2 = dst1 + ldb;
        dst3 = dst2 + ldb;
        dst += 4;   /* four source rows fill four destination columns */

        /* Full 4x4 tiles. */
        for (col_tiles = cols >> 2; col_tiles > 0; col_tiles--) {
            /* Source row 0 -> destination column 0 of the tile. */
            dst0[0] = src0[0] * alpha;
            dst1[0] = src0[1] * alpha;
            dst2[0] = src0[2] * alpha;
            dst3[0] = src0[3] * alpha;

            /* Source row 1 -> destination column 1 of the tile. */
            dst0[1] = src1[0] * alpha;
            dst1[1] = src1[1] * alpha;
            dst2[1] = src1[2] * alpha;
            dst3[1] = src1[3] * alpha;

            /* Source row 2 -> destination column 2 of the tile. */
            dst0[2] = src2[0] * alpha;
            dst1[2] = src2[1] * alpha;
            dst2[2] = src2[2] * alpha;
            dst3[2] = src2[3] * alpha;

            /* Source row 3 -> destination column 3 of the tile. */
            dst0[3] = src3[0] * alpha;
            dst1[3] = src3[1] * alpha;
            dst2[3] = src3[2] * alpha;
            dst3[3] = src3[3] * alpha;

            src0 += 4;
            src1 += 4;
            src2 += 4;
            src3 += 4;
            dst0 += 4 * ldb;
            dst1 += 4 * ldb;
            dst2 += 4 * ldb;
            dst3 += 4 * ldb;
        }

        /* Two leftover source columns (4x2 tile). */
        if (cols & 2) {
            dst0[0] = src0[0] * alpha;
            dst1[0] = src0[1] * alpha;

            dst0[1] = src1[0] * alpha;
            dst1[1] = src1[1] * alpha;

            dst0[2] = src2[0] * alpha;
            dst1[2] = src2[1] * alpha;

            dst0[3] = src3[0] * alpha;
            dst1[3] = src3[1] * alpha;

            src0 += 2;
            src1 += 2;
            src2 += 2;
            src3 += 2;

            /* Only dst0 is needed by the single-column step below. */
            dst0 += 2 * ldb;
        }

        /* One leftover source column (4x1 tile). */
        if (cols & 1) {
            dst0[0] = src0[0] * alpha;
            dst0[1] = src1[0] * alpha;
            dst0[2] = src2[0] * alpha;
            dst0[3] = src3[0] * alpha;
        }
    }

    /* ---- Two leftover source rows ------------------------------------ */
    if (rows & 2) {
        src0 = src;
        src1 = src0 + lda;
        src += 2 * lda;

        dst0 = dst;
        dst1 = dst0 + ldb;
        dst2 = dst1 + ldb;
        dst3 = dst2 + ldb;
        dst += 2;

        /* 2x4 tiles. */
        for (col_tiles = cols >> 2; col_tiles > 0; col_tiles--) {
            dst0[0] = src0[0] * alpha;
            dst1[0] = src0[1] * alpha;
            dst2[0] = src0[2] * alpha;
            dst3[0] = src0[3] * alpha;

            dst0[1] = src1[0] * alpha;
            dst1[1] = src1[1] * alpha;
            dst2[1] = src1[2] * alpha;
            dst3[1] = src1[3] * alpha;

            src0 += 4;
            src1 += 4;
            dst0 += 4 * ldb;
            dst1 += 4 * ldb;
            dst2 += 4 * ldb;
            dst3 += 4 * ldb;
        }

        /* 2x2 tile. */
        if (cols & 2) {
            dst0[0] = src0[0] * alpha;
            dst1[0] = src0[1] * alpha;

            dst0[1] = src1[0] * alpha;
            dst1[1] = src1[1] * alpha;

            src0 += 2;
            src1 += 2;
            dst0 += 2 * ldb;
        }

        /* 2x1 tile. */
        if (cols & 1) {
            dst0[0] = src0[0] * alpha;
            dst0[1] = src1[0] * alpha;
        }
    }

    /* ---- One leftover source row ------------------------------------- */
    if (rows & 1) {
        src0 = src;

        dst0 = dst;
        dst1 = dst0 + ldb;
        dst2 = dst1 + ldb;
        dst3 = dst2 + ldb;

        /* 1x4 tiles. */
        for (col_tiles = cols >> 2; col_tiles > 0; col_tiles--) {
            dst0[0] = src0[0] * alpha;
            dst1[0] = src0[1] * alpha;
            dst2[0] = src0[2] * alpha;
            dst3[0] = src0[3] * alpha;

            src0 += 4;
            dst0 += 4 * ldb;
            dst1 += 4 * ldb;
            dst2 += 4 * ldb;
            dst3 += 4 * ldb;
        }

        /* 1x2 tile. */
        if (cols & 2) {
            dst0[0] = src0[0] * alpha;
            dst1[0] = src0[1] * alpha;

            src0 += 2;
            dst0 += 2 * ldb;
        }

        /* Final single element. */
        if (cols & 1) {
            dst0[0] = src0[0] * alpha;
        }
    }

    return 0;
}
62234

0 commit comments

Comments
 (0)