Skip to content

Commit b87a77d

Browse files
authored
Merge pull request #79 from xianyi/develop
rebase
2 parents fb3d80c + f42e84d commit b87a77d

File tree

11 files changed

+306
-200
lines changed

11 files changed

+306
-200
lines changed

Makefile.install

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
1313
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
1414
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
1515
OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
16+
PKG_EXTRALIB := $(EXTRALIB)
17+
ifeq ($(USE_OPENMP), 1)
18+
ifeq ($(C_COMPILER), PGI)
19+
PKG_EXTRALIB += -lomp
20+
else
21+
PKG_EXTRALIB += -lgomp
22+
endif
23+
endif
1624

1725
.PHONY : install
1826
.NOTPARALLEL : install
@@ -147,7 +155,7 @@ endif
147155
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
148156
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
149157
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
150-
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
158+
@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
151159
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
152160

153161

Makefile.system

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ else ifeq ($(ARCH), powerpc)
2525
override ARCH=power
2626
else ifeq ($(ARCH), i386)
2727
override ARCH=x86
28+
else ifeq ($(ARCH), armv6)
29+
override ARCH=arm
30+
else ifeq ($(ARCH), armv7)
31+
override ARCH=arm
2832
else ifeq ($(ARCH), aarch64)
2933
override ARCH=arm64
3034
else ifeq ($(ARCH), zarch)

cmake/openblas.pc.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,5 @@ Name: OpenBLAS
77
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
88
Version: @OPENBLAS_VERSION@
99
URL: https://github.com/xianyi/OpenBLAS
10-
Libs: -L${libdir} -lopenblas${libsuffix}
10+
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
1111
Cflags: -I${includedir}

cmake/system_check.cmake

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
5454
endif()
5555
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
5656
set(X86 1)
57-
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
58-
set(ARM 1)
59-
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
57+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
6058
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
6159
set(ARM64 1)
6260
else()
6361
set(ARM 1)
6462
endif()
63+
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
64+
set(ARM 1)
6565
elseif (${CMAKE_CROSSCOMPILING})
6666
if (${TARGET} STREQUAL "CORE2")
6767
if (NOT BINARY)

cpuid_arm64.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,8 @@ int detect(void)
197197

198198

199199
}
200+
#else
201+
return CPU_ARMV8;
200202
#endif
201203

202204
return CPU_UNKNOWN;

cpuid_power.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ int detect(void){
145145
if (implementation >= 0x40000u) return CPUTYPE_POWER10;
146146
else if (implementation & 0x20000) return CPUTYPE_POWER9;
147147
else if (implementation & 0x10000) return CPUTYPE_POWER8;
148-
else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7
148+
else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7
149149
else if (implementation & 0x04000) return CPUTYPE_POWER6;
150150
else if (implementation & 0x02000) return CPUTYPE_POWER5;
151151
else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450

driver/others/blas_server_omp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
335335
break;
336336
}
337337

338-
#pragma omp parallel for schedule(OMP_SCHED)
338+
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
339339
for (i = 0; i < num; i ++) {
340340

341341
#ifndef USE_SIMPLE_THREADED_LEVEL3

kernel/power/dgemm_kernel_power10.c

Lines changed: 94 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -87,22 +87,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
8787
rowC[0] += result[1] * alpha;
8888
#endif
8989

90-
#define SET_ACC_ZERO4() \
91-
__builtin_mma_xxsetaccz (&acc0); \
92-
__builtin_mma_xxsetaccz (&acc1); \
93-
__builtin_mma_xxsetaccz (&acc2); \
94-
__builtin_mma_xxsetaccz (&acc3);
95-
96-
#define SET_ACC_ZERO8() \
97-
__builtin_mma_xxsetaccz (&acc0); \
98-
__builtin_mma_xxsetaccz (&acc1); \
99-
__builtin_mma_xxsetaccz (&acc2); \
100-
__builtin_mma_xxsetaccz (&acc3); \
101-
__builtin_mma_xxsetaccz (&acc4); \
102-
__builtin_mma_xxsetaccz (&acc5); \
103-
__builtin_mma_xxsetaccz (&acc6); \
104-
__builtin_mma_xxsetaccz (&acc7);
105-
10690
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
10791

10892
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
@@ -210,12 +194,22 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
210194
PREFETCH1 (CO + ldc + ldc, 128);
211195
PREFETCH1 (CO + ldc + ldc + ldc, 128);
212196
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
213-
SET_ACC_ZERO8 ();
214-
for (l = 0; l < temp; l++)
197+
vec_t *rowA = (vec_t *) & AO[0];
198+
__vector_pair rowB;
199+
vec_t *rb = (vec_t *) & BO[0];
200+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
201+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
202+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
203+
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
204+
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
205+
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
206+
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
207+
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
208+
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
209+
for (l = 1; l < temp; l++)
215210
{
216-
vec_t *rowA = (vec_t *) & AO[l << 4];
217-
__vector_pair rowB;
218-
vec_t *rb = (vec_t *) & BO[l << 2];
211+
rowA = (vec_t *) & AO[l << 4];
212+
rb = (vec_t *) & BO[l << 2];
219213
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
220214
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
221215
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@@ -254,13 +248,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
254248
v4sf_t *rowC;
255249
v4sf_t result[4];
256250
__vector_quad acc0, acc1, acc2, acc3;
257-
SET_ACC_ZERO4 ();
258251
BLASLONG l = 0;
259-
for (l = 0; l < temp; l++)
252+
vec_t *rowA = (vec_t *) & AO[0];
253+
__vector_pair rowB;
254+
vec_t *rb = (vec_t *) & BO[0];
255+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
256+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
257+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
258+
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
259+
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
260+
for (l = 1; l < temp; l++)
260261
{
261-
vec_t *rowA = (vec_t *) & AO[l << 3];
262-
__vector_pair rowB;
263-
vec_t *rb = (vec_t *) & BO[l << 2];
262+
rowA = (vec_t *) & AO[l << 3];
263+
rb = (vec_t *) & BO[l << 2];
264264
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
265265
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
266266
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@@ -291,14 +291,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
291291
v4sf_t *rowC;
292292
v4sf_t result[4];
293293
__vector_quad acc0, acc1;
294-
__builtin_mma_xxsetaccz (&acc0);
295-
__builtin_mma_xxsetaccz (&acc1);
296294
BLASLONG l = 0;
297-
for (l = 0; l < temp; l++)
295+
vec_t *rowA = (vec_t *) & AO[0];
296+
__vector_pair rowB;
297+
vec_t *rb = (vec_t *) & BO[0];
298+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
299+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
300+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
301+
for (l = 1; l < temp; l++)
298302
{
299-
vec_t *rowA = (vec_t *) & AO[l << 2];
300-
__vector_pair rowB;
301-
vec_t *rb = (vec_t *) & BO[l << 2];
303+
rowA = (vec_t *) & AO[l << 2];
304+
rb = (vec_t *) & BO[l << 2];
302305
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
303306
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
304307
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@@ -325,13 +328,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
325328
v4sf_t *rowC;
326329
v4sf_t result[4];
327330
__vector_quad acc0;
328-
__builtin_mma_xxsetaccz (&acc0);
329331
BLASLONG l = 0;
330-
for (l = 0; l < temp; l++)
332+
vec_t *rowA = (vec_t *) & AO[0];
333+
__vector_pair rowB;
334+
vec_t *rb = (vec_t *) & BO[0];
335+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
336+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
337+
for (l = 1; l < temp; l++)
331338
{
332-
vec_t *rowA = (vec_t *) & AO[l << 1];
333-
__vector_pair rowB;
334-
vec_t *rb = (vec_t *) & BO[l << 2];
339+
rowA = (vec_t *) & AO[l << 1];
340+
rb = (vec_t *) & BO[l << 2];
335341
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
336342
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
337343
}
@@ -414,16 +420,27 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
414420
v4sf_t *rowC;
415421
v4sf_t result[4];
416422
__vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
417-
SET_ACC_ZERO8 ();
418423
BLASLONG l = 0;
419-
for (l = 0; l < temp; l++)
424+
FLOAT t[4] = { 0, 0, 0, 0 };
425+
t[0] = BO[0], t[1] = BO[1];
426+
__vector_pair rowB;
427+
vec_t *rb = (vec_t *) & t[0];
428+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
429+
vec_t *rowA = (vec_t *) & AO[0];
430+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
431+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
432+
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
433+
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
434+
__builtin_mma_xvf64ger (&acc4, rowB, rowA[4]);
435+
__builtin_mma_xvf64ger (&acc5, rowB, rowA[5]);
436+
__builtin_mma_xvf64ger (&acc6, rowB, rowA[6]);
437+
__builtin_mma_xvf64ger (&acc7, rowB, rowA[7]);
438+
for (l = 1; l < temp; l++)
420439
{
421-
FLOAT t[4] = { 0, 0, 0, 0 };
422440
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
423-
__vector_pair rowB;
424-
vec_t *rb = (vec_t *) & t[0];
441+
rb = (vec_t *) & t[0];
425442
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
426-
vec_t *rowA = (vec_t *) & AO[l << 4];
443+
rowA = (vec_t *) & AO[l << 4];
427444
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
428445
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
429446
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@@ -461,16 +478,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
461478
v4sf_t *rowC;
462479
v4sf_t result[4];
463480
__vector_quad acc0, acc1, acc2, acc3;
464-
SET_ACC_ZERO4 ();
465481
BLASLONG l = 0;
466-
for (l = 0; l < temp; l++)
482+
FLOAT t[4] = { 0, 0, 0, 0 };
483+
t[0] = BO[0], t[1] = BO[1];
484+
__vector_pair rowB;
485+
vec_t *rb = (vec_t *) & t[0];
486+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
487+
vec_t *rowA = (vec_t *) & AO[0];
488+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
489+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
490+
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
491+
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
492+
for (l = 1; l < temp; l++)
467493
{
468-
FLOAT t[4] = { 0, 0, 0, 0 };
469494
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
470-
__vector_pair rowB;
471-
vec_t *rb = (vec_t *) & t[0];
495+
rb = (vec_t *) & t[0];
472496
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
473-
vec_t *rowA = (vec_t *) & AO[l << 3];
497+
rowA = (vec_t *) & AO[l << 3];
474498
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
475499
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
476500
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@@ -500,17 +524,21 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
500524
v4sf_t *rowC;
501525
v4sf_t result[4];
502526
__vector_quad acc0, acc1;
503-
__builtin_mma_xxsetaccz (&acc0);
504-
__builtin_mma_xxsetaccz (&acc1);
505527
BLASLONG l = 0;
506-
for (l = 0; l < temp; l++)
528+
FLOAT t[4] = { 0, 0, 0, 0 };
529+
t[0] = BO[0], t[1] = BO[1];
530+
__vector_pair rowB;
531+
vec_t *rb = (vec_t *) & t[0];
532+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
533+
vec_t *rowA = (vec_t *) & AO[0];
534+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
535+
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
536+
for (l = 1; l < temp; l++)
507537
{
508-
FLOAT t[4] = { 0, 0, 0, 0 };
509538
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
510-
__vector_pair rowB;
511-
vec_t *rb = (vec_t *) & t[0];
539+
rb = (vec_t *) & t[0];
512540
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
513-
vec_t *rowA = (vec_t *) & AO[l << 2];
541+
rowA = (vec_t *) & AO[l << 2];
514542
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
515543
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
516544
}
@@ -536,16 +564,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
536564
v4sf_t *rowC;
537565
v4sf_t result[4];
538566
__vector_quad acc0;
539-
__builtin_mma_xxsetaccz (&acc0);
540567
BLASLONG l = 0;
541-
for (l = 0; l < temp; l++)
568+
FLOAT t[4] = { 0, 0, 0, 0 };
569+
t[0] = BO[0], t[1] = BO[1];
570+
__vector_pair rowB;
571+
vec_t *rb = (vec_t *) & t[0];
572+
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
573+
vec_t *rowA = (vec_t *) & AO[0];
574+
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
575+
for (l = 1; l < temp; l++)
542576
{
543-
FLOAT t[4] = { 0, 0, 0, 0 };
544577
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
545-
__vector_pair rowB;
546-
vec_t *rb = (vec_t *) & t[0];
578+
rb = (vec_t *) & t[0];
547579
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
548-
vec_t *rowA = (vec_t *) & AO[l << 1];
580+
rowA = (vec_t *) & AO[l << 1];
549581
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
550582
}
551583
SAVE2x4_ACC (&acc0, 0);

0 commit comments

Comments
 (0)