Skip to content

Commit b46f680

Browse files
committed
Merge pull request #887 from ksraste/develop
STRSM optimization for MIPS P5600 and I6400 using MSA
2 parents a8fcd89 + ad9f317 commit b46f680

11 files changed

+8549
-12
lines changed

CONTRIBUTORS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,3 +160,4 @@ In chronological order:
160160

161161
* Kaustubh Raste <https://github.com/ksraste/>
162162
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
163+
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA

kernel/mips/KERNEL.P5600

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
113113
ZGEMMONCOPYOBJ = zgemm_oncopy.o
114114
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
115115

116-
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
117-
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
118-
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
119-
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
116+
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
117+
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
118+
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
119+
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
120120

121121
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
122122
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c

kernel/mips/dtrsm_kernel_LN_8x4_msa.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
11701170

11711171
for (j = (n >> 2); j--;)
11721172
{
1173-
kk = m;
1173+
kk = m + offset;
11741174

11751175
if (m & 7)
11761176
{
@@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
12331233
{
12341234
if (n & 2)
12351235
{
1236-
kk = m;
1236+
kk = m + offset;
12371237

12381238
if (m & 7)
12391239
{
@@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
12911291

12921292
if (n & 1)
12931293
{
1294-
kk = m;
1294+
kk = m + offset;
12951295

12961296
if (m & 7)
12971297
{

kernel/mips/dtrsm_kernel_LT_8x4_msa.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
11821182

11831183
for (j = (n >> 2); j--;)
11841184
{
1185-
kk = 0;
1185+
kk = offset;
11861186
aa = a;
11871187
cc = c;
11881188

@@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
12331233
{
12341234
if (n & 2)
12351235
{
1236-
kk = 0;
1236+
kk = offset;
12371237
aa = a;
12381238
cc = c;
12391239

@@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
12821282

12831283
if (n & 1)
12841284
{
1285-
kk = 0;
1285+
kk = offset;
12861286
aa = a;
12871287
cc = c;
12881288

kernel/mips/dtrsm_kernel_RN_8x4_msa.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
809809
BLASLONG i, j, kk;
810810
FLOAT *aa, *cc;
811811

812-
kk = 0;
812+
kk = -offset;
813813

814814
for (j = (n >> 2); j--;)
815815
{

kernel/mips/dtrsm_kernel_RT_8x4_msa.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
865865
BLASLONG i, j, kk;
866866
FLOAT *aa, *cc, *bb;
867867

868-
kk = n;
868+
kk = n - offset;
869869
c += n * ldc;
870870
b += n * k;
871871

kernel/mips/macros_msa.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
137137
}
138138
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
139139

140+
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                 elements in 'out1' vector
                 Valid index range for word operation is 0-3, so 'stidx'
                 itself must be in the range 0-2 because 'stidx + 1' is
                 used as well
                 'in' and 'stidx' are parenthesized inside the expansion so
                 that expression arguments are cast / incremented as a whole
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                  \
{                                                                \
    out0 = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));        \
    out1 = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));  \
}
156+
157+
/* Description : Each of the four word elements of a vector is replicated
                 to all elements of its own output vector
   Arguments   : Inputs  - in
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Word elements 0 and 1 of 'in' are splatted into 'out0' and
                 'out1', word elements 2 and 3 into 'out2' and 'out3', using
                 two SPLATI_W2 invocations
                 'in' is forwarded in parentheses so that an expression
                 argument is converted to the vector type as a whole
*/
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, (in), 0, out0, out1);            \
    SPLATI_W2(RTYPE, (in), 2, out2, out3);            \
}
/* Convenience form of SPLATI_W4 with RTYPE fixed to v4f32 */
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
163+
140164
/* Description : Transpose 4x4 block with word elements in vectors
141165
Arguments : Inputs - in0, in1, in2, in3
142166
Outputs - out0, out1, out2, out3

0 commit comments

Comments
 (0)