Skip to content

Commit a0aeba6

Browse files
author
Chip Kerchner
committed
Merge branch 'develop' into betterPowerGEMVTail
2 parents 083faf7 + eba8615 commit a0aeba6

File tree

7 files changed

+138
-29
lines changed

7 files changed

+138
-29
lines changed

.cirrus.yml

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,20 +89,21 @@ task:
8989
type: text/plain
9090

9191
macos_instance:
92-
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
92+
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
9393
task:
9494
name: AppleM1/LLVM armv7-androidndk xbuild
9595
compile_script:
96-
- brew install android-ndk
96+
- brew install --cask android-ndk
9797
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
9898
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
99-
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
100-
- ls /System/Volumes/Data/opt/homebrew
99+
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
100+
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
101+
- ls /opt/homebrew
101102
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk
102-
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
103+
- find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
103104
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
104105
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
105-
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
106+
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
106107
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
107108
always:
108109
config_artifacts:

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ Examples:
8585
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
8686
```
8787

88+
When compiling for a more modern CPU TARGET of the same architecture (e.g. TARGET=SKYLAKEX on a HASWELL host), the option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build.
89+
8890
### Debug version
8991

9092
A debug version can be built using `make DEBUG=1`.

cpuid_x86.c

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1527,6 +1527,19 @@ int get_cpuname(void){
15271527
break;
15281528
case 10: //family 6 exmodel 10
15291529
switch (model) {
1530+
case 13: // Granite Rapids
1531+
if(support_amx_bf16())
1532+
return CPUTYPE_SAPPHIRERAPIDS;
1533+
if(support_avx512_bf16())
1534+
return CPUTYPE_COOPERLAKE;
1535+
if(support_avx512())
1536+
return CPUTYPE_SKYLAKEX;
1537+
if(support_avx2())
1538+
return CPUTYPE_HASWELL;
1539+
if(support_avx())
1540+
return CPUTYPE_SANDYBRIDGE;
1541+
else
1542+
return CPUTYPE_NEHALEM;
15301543
case 5: // Comet Lake H and S
15311544
case 6: // Comet Lake U
15321545
case 10: // Meteor Lake
@@ -2352,8 +2365,22 @@ int get_coretype(void){
23522365

23532366
case 10:
23542367
switch (model) {
2368+
case 13: // Granite Rapids
2369+
if(support_amx_bf16())
2370+
return CORE_SAPPHIRERAPIDS;
2371+
if(support_avx512_bf16())
2372+
return CORE_COOPERLAKE;
2373+
if(support_avx512())
2374+
return CORE_SKYLAKEX;
2375+
if(support_avx2())
2376+
return CORE_HASWELL;
2377+
if(support_avx())
2378+
return CORE_SANDYBRIDGE;
2379+
else
2380+
return CORE_NEHALEM;
23552381
case 5: // Comet Lake H and S
23562382
case 6: // Comet Lake U
2383+
case 10: // Meteor Lake
23572384
if(support_avx())
23582385
#ifndef NO_AVX2
23592386
return CORE_HASWELL;
@@ -2362,6 +2389,7 @@ int get_coretype(void){
23622389
#endif
23632390
else
23642391
return CORE_NEHALEM;
2392+
case 0: // Meteor Lake
23652393
case 7:// Rocket Lake
23662394
#ifndef NO_AVX512
23672395
if(support_avx512())

driver/others/blas_server.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l
10761076
main_status[cpu] = MAIN_RUNNING1;
10771077
#endif
10781078

1079+
if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2);
1080+
10791081
//For target LOONGSON3R5, applying an offset to the buffer is essential
10801082
//for minimizing cache conflicts and optimizing performance.
10811083
#if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY)

exports/gensymbol

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c
880880
# clatrs3
881881

882882
lapackobjs2d="$lapackobjs2d
883-
dgelqs
884883
dgelst
885884
dgeqp3rk
886-
dgeqrs
887885
dlaqp2rk
888886
dlaqp3rk
889887
dlarmm
@@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d
897895
# dlaqz4
898896

899897
lapackobjs2z="$lapackobjs2z
900-
zgelqs
901898
zgelst
902899
zgeqp3rk
903-
zgeqrs
904900
zlaqp2rk
905901
zlaqp3rk
906902
zlatrs3
@@ -918,6 +914,7 @@ lapack_extendedprecision_objs="
918914
"
919915

920916
lapack_deprecated_objsc="
917+
cgelqs cgeqrs
921918
cgegs cggsvd
922919
cgegv cggsvp
923920
cgelsx clahrd
@@ -926,13 +923,16 @@ lapack_deprecated_objsc="
926923
"
927924

928925
lapack_deprecated_objsd="
926+
dgelqs dgeqrs
929927
dgegs dgeqpf
930928
dgegv dggsvd
931929
dgelsx dggsvp
932930
dlahrd
933931
dlatzm dtzrqf"
934932

935933
lapack_deprecated_objss="
934+
sgelqs
935+
sgeqrs
936936
sgelsx
937937
sgegs
938938
sgegv
@@ -945,6 +945,8 @@ lapack_deprecated_objss="
945945
"
946946

947947
lapack_deprecated_objsz="
948+
zgelqs
949+
zgeqrs
948950
zgegs
949951
zgegv
950952
zgelsx

kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -131,11 +131,11 @@
131131
sd $21, 40($sp)
132132
sd $22, 48($sp)
133133

134-
ST $f24, 56($sp)
135-
ST $f25, 64($sp)
136-
ST $f26, 72($sp)
137-
ST $f27, 80($sp)
138-
ST $f28, 88($sp)
134+
sdc1 $f24, 56($sp)
135+
sdc1 $f25, 64($sp)
136+
sdc1 $f26, 72($sp)
137+
sdc1 $f27, 80($sp)
138+
sdc1 $f28, 88($sp)
139139

140140
#if defined(TRMMKERNEL)
141141
sd $23, 96($sp)
@@ -146,10 +146,10 @@
146146
#endif
147147

148148
#ifndef __64BIT__
149-
ST $f20,120($sp)
150-
ST $f21,128($sp)
151-
ST $f22,136($sp)
152-
ST $f23,144($sp)
149+
sdc1 $f20,120($sp)
150+
sdc1 $f21,128($sp)
151+
sdc1 $f22,136($sp)
152+
sdc1 $f23,144($sp)
153153
#endif
154154

155155
.align 4
@@ -4000,11 +4000,11 @@
40004000
ld $21, 40($sp)
40014001
ld $22, 48($sp)
40024002

4003-
LD $f24, 56($sp)
4004-
LD $f25, 64($sp)
4005-
LD $f26, 72($sp)
4006-
LD $f27, 80($sp)
4007-
LD $f28, 88($sp)
4003+
ldc1 $f24, 56($sp)
4004+
ldc1 $f25, 64($sp)
4005+
ldc1 $f26, 72($sp)
4006+
ldc1 $f27, 80($sp)
4007+
ldc1 $f28, 88($sp)
40084008

40094009
#if defined(TRMMKERNEL)
40104010
ld $23, 96($sp)
@@ -4013,10 +4013,10 @@
40134013
#endif
40144014

40154015
#ifndef __64BIT__
4016-
LD $f20,120($sp)
4017-
LD $f21,128($sp)
4018-
LD $f22,136($sp)
4019-
LD $f23,144($sp)
4016+
ldc1 $f20,120($sp)
4017+
ldc1 $f21,128($sp)
4018+
ldc1 $f22,136($sp)
4019+
ldc1 $f23,144($sp)
40204020
#endif
40214021

40224022
daddiu $sp,$sp,STACKSIZE

test/compare_sgemm_sbgemm.c

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2929
#include "../common.h"
3030
#define SGEMM BLASFUNC(sgemm)
3131
#define SBGEMM BLASFUNC(sbgemm)
32+
#define SGEMV BLASFUNC(sgemv)
33+
#define SBGEMV BLASFUNC(sbgemv)
3234
typedef union
3335
{
3436
unsigned short v;
@@ -187,7 +189,79 @@ main (int argc, char *argv[])
187189
free(CC);
188190
}
189191

190-
if (ret != 0)
192+
if (ret != 0) {
191193
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
194+
return ret;
195+
}
196+
197+
k = 1;
198+
for (x = 1; x <= loop; x++)
199+
{
200+
float *A = (float *)malloc(x * x * sizeof(FLOAT));
201+
float *B = (float *)malloc(x * sizeof(FLOAT));
202+
float *C = (float *)malloc(x * sizeof(FLOAT));
203+
bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits));
204+
bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits));
205+
float *DD = (float *)malloc(x * sizeof(FLOAT));
206+
float *CC = (float *)malloc(x * sizeof(FLOAT));
207+
if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) ||
208+
(DD == NULL) || (CC == NULL))
209+
return 1;
210+
bfloat16 atmp, btmp;
211+
blasint one = 1;
212+
213+
for (j = 0; j < x; j++)
214+
{
215+
for (i = 0; i < x; i++)
216+
{
217+
A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
218+
sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one);
219+
AA[j * x + i].v = atmp;
220+
}
221+
B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
222+
sbstobf16_(&one, &B[j], &one, &btmp, &one);
223+
BB[j].v = btmp;
224+
}
225+
for (y = 0; y < 2; y++)
226+
{
227+
if (y == 0) {
228+
transA = 'N';
229+
} else {
230+
transA = 'T';
231+
}
232+
233+
memset(CC, 0, x * sizeof(FLOAT));
234+
memset(DD, 0, x * sizeof(FLOAT));
235+
memset(C, 0, x * sizeof(FLOAT));
236+
237+
SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k);
238+
SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k);
239+
240+
for (j = 0; j < x; j++)
241+
for (i = 0; i < x; i++)
242+
if (transA == 'N') {
243+
DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]);
244+
} else if (transA == 'T') {
245+
DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]);
246+
}
247+
248+
for (j = 0; j < x; j++) {
249+
if (fabs (CC[j] - C[j]) > 1.0)
250+
ret++;
251+
if (fabs (CC[j] - DD[j]) > 1.0)
252+
ret++;
253+
}
254+
}
255+
free(A);
256+
free(B);
257+
free(C);
258+
free(AA);
259+
free(BB);
260+
free(DD);
261+
free(CC);
262+
}
263+
264+
if (ret != 0)
265+
fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret);
192266
return ret;
193267
}

0 commit comments

Comments
 (0)