Skip to content

Commit d0e8366

Browse files
authored
Merge pull request #1804 from fenrus75/sgemm
Add a C+intrinsics version of the SGEMM/skylakex kernel
2 parents 065763a + d4bad73 commit d0e8366

File tree

4 files changed

+2470
-0
lines changed

4 files changed

+2470
-0
lines changed

kernel/x86_64/sgemm_beta_skylakex.c

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
/*********************************************************************/
2+
/* Copyright 2009, 2010 The University of Texas at Austin. */
3+
/* All rights reserved. */
4+
/* */
5+
/* Redistribution and use in source and binary forms, with or */
6+
/* without modification, are permitted provided that the following */
7+
/* conditions are met: */
8+
/* */
9+
/* 1. Redistributions of source code must retain the above */
10+
/* copyright notice, this list of conditions and the following */
11+
/* disclaimer. */
12+
/* */
13+
/* 2. Redistributions in binary form must reproduce the above */
14+
/* copyright notice, this list of conditions and the following */
15+
/* disclaimer in the documentation and/or other materials */
16+
/* provided with the distribution. */
17+
/* */
18+
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19+
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20+
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21+
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22+
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23+
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24+
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25+
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26+
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27+
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28+
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29+
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30+
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31+
/* POSSIBILITY OF SUCH DAMAGE. */
32+
/* */
33+
/* The views and conclusions contained in the software and */
34+
/* documentation are those of the authors and should not be */
35+
/* interpreted as representing official policies, either expressed */
36+
/* or implied, of The University of Texas at Austin. */
37+
/*********************************************************************/
38+
39+
#include "common.h"
40+
41+
#include <immintrin.h>
42+
43+
/*
 * Scale the matrix C in place by beta:  C := beta * C.
 *
 * C is walked as n columns of m consecutive FLOAT elements each, with the
 * start of consecutive columns ldc elements apart.
 *
 *   m, n    number of elements per column / number of columns
 *   beta    scale factor; when it compares equal to ZERO the elements are
 *           overwritten with zero instead of being multiplied, so NaN/Inf
 *           values already present in C are cleared
 *   c, ldc  matrix storage and distance (in elements) between columns
 *
 * dummy1 .. dummy5 are not read by this routine.
 * NOTE(review): they presumably exist to match a shared kernel signature --
 * confirm against the caller.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
	  FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
	  FLOAT *c, BLASLONG ldc){

  BLASLONG i, j;
  FLOAT *c_offset1, *c_offset;

  /* Empty matrix: nothing to touch.  This also keeps the byte count of the
     memset below from being computed out of a non-positive dimension. */
  if (m <= 0 || n <= 0) return 0;

  c_offset = c;

  if (beta == ZERO){
    __m512 z_zero;
    __m256 y_zero;

    /* Fast path: with m == ldc the columns sit back to back, so the whole
       matrix is one contiguous run that can be cleared in a single call.
       beta is compared as a floating point value on purpose; converting it
       to an integer first would also match every |beta| < 1. */
    if (m == ldc) {
      memset(c, 0, (size_t)m * (size_t)n * sizeof(FLOAT));
      return 0;
    }

    z_zero = _mm512_setzero_ps();
    y_zero = _mm256_setzero_ps();

    for (j = 0; j < n; j++) {
      c_offset1 = c_offset;
      c_offset += ldc;

      i = m;

      /* One 512-bit store covers 16 floats, so two of them clear 32. */
      while (i >= 32) {
	_mm512_storeu_ps(c_offset1,      z_zero);
	_mm512_storeu_ps(c_offset1 + 16, z_zero);
	c_offset1 += 32;
	i -= 32;
      }

      /* One 256-bit store covers exactly 8 floats. */
      while (i >= 8) {
	_mm256_storeu_ps(c_offset1, y_zero);
	c_offset1 += 8;
	i -= 8;
      }

      /* Scalar tail: at most 7 elements remain in this column. */
      while (i > 0) {
	*c_offset1 = ZERO;
	c_offset1 ++;
	i --;
      }
    }

  } else {

    for (j = 0; j < n; j++) {
      c_offset1 = c_offset;
      c_offset += ldc;

      /* Multiply every element of this column by beta. */
      for (i = 0; i < m; i++) {
	c_offset1[i] *= beta;
      }
    }

  }
  return 0;
}

0 commit comments

Comments
 (0)