Skip to content

Commit fc101b6

Browse files
authored
Merge pull request #23 from xianyi/develop
rebase
2 parents 0492f0f + b0239a0 commit fc101b6

File tree

13 files changed

+1293
-42
lines changed

13 files changed

+1293
-42
lines changed

Makefile.x86

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,21 @@
11
# COMPILER_PREFIX = mingw32-
22

3+
ifndef DYNAMIC_ARCH
4+
ADD_CPUFLAGS = 1
5+
else
6+
ifdef TARGET_CORE
7+
ADD_CPUFLAGS = 1
8+
endif
9+
endif
10+
11+
ifdef ADD_CPUFLAGS
312
ifdef HAVE_SSE
413
CCOMMON_OPT += -msse
14+
ifneq ($(F_COMPILER), NAG)
515
FCOMMON_OPT += -msse
616
endif
7-
17+
endif
18+
endif
819

920
ifeq ($(OSNAME), Interix)
1021
ARFLAGS = -m x86

Makefile.x86_64

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,16 @@ endif
88
endif
99
endif
1010

11+
12+
ifndef DYNAMIC_ARCH
13+
ADD_CPUFLAGS = 1
14+
else
15+
ifdef TARGET_CORE
16+
ADD_CPUFLAGS = 1
17+
endif
18+
endif
19+
20+
ifdef ADD_CPUFLAGS
1121
ifdef HAVE_SSE3
1222
CCOMMON_OPT += -msse3
1323
ifneq ($(F_COMPILER), NAG)
@@ -44,7 +54,6 @@ endif
4454
endif
4555

4656
ifeq ($(CORE), SKYLAKEX)
47-
ifndef DYNAMIC_ARCH
4857
ifndef NO_AVX512
4958
CCOMMON_OPT += -march=skylake-avx512
5059
ifneq ($(F_COMPILER), NAG)
@@ -62,10 +71,8 @@ endif
6271
endif
6372
endif
6473
endif
65-
endif
6674

6775
ifeq ($(CORE), COOPERLAKE)
68-
ifndef DYNAMIC_ARCH
6976
ifndef NO_AVX512
7077
ifeq ($(C_COMPILER), GCC)
7178
# cooperlake support was added in 10.1
@@ -88,7 +95,6 @@ endif
8895
endif
8996
endif
9097
endif
91-
endif
9298

9399
ifdef HAVE_AVX2
94100
ifndef NO_AVX2
@@ -120,6 +126,7 @@ endif
120126
endif
121127
endif
122128

129+
endif
123130

124131

125132
ifeq ($(OSNAME), Interix)

cmake/system.cmake

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -299,6 +299,10 @@ if (NO_AVX2)
299299
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
300300
endif ()
301301

302+
if (NO_AVX512)
303+
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
304+
endif ()
305+
302306
if (USE_THREAD)
303307
# USE_SIMPLE_THREADED_LEVEL3 = 1
304308
# NO_AFFINITY = 1

driver/others/dynamic_arm64.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg);
126126
#endif
127127

128128
#define get_cpu_ftr(id, var) ({ \
129-
__asm__ ("mrs %0, "#id : "=r" (var)); \
129+
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
130130
})
131131

132132
static char *corename[] = {

kernel/power/KERNEL.POWER10

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -186,7 +186,7 @@ ZSWAPKERNEL = zswap.c
186186
SGEMVNKERNEL = sgemv_n.c
187187
DGEMVNKERNEL = dgemv_n_power10.c
188188
CGEMVNKERNEL = cgemv_n.c
189-
ZGEMVNKERNEL = zgemv_n_4.c
189+
ZGEMVNKERNEL = zgemv_n_power10.c
190190
#
191191
SGEMVTKERNEL = sgemv_t.c
192192
DGEMVTKERNEL = dgemv_t_power10.c

kernel/power/dgemm_kernel_power10.c

Lines changed: 18 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -190,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
190190
__vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7;
191191
BLASLONG l = 0;
192192
vec_t *rowA = (vec_t *) & AO[0];
193-
vec_t *rb = (vec_t *) & BO[0];
194193
__vector_pair rowB, rowB1;
195-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
196-
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
194+
rowB = *((__vector_pair *)((void *)&BO[0]));
195+
rowB1 = *((__vector_pair *)((void *)&BO[4]));
197196
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
198197
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
199198
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@@ -205,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
205204
for (l = 1; l < temp; l++)
206205
{
207206
rowA = (vec_t *) & AO[l << 3];
208-
rb = (vec_t *) & BO[l << 3];
209-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
210-
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
207+
rowB = *((__vector_pair *)((void *)&BO[l << 3]));
208+
rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
211209
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
212210
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
213211
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@@ -247,19 +245,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
247245
BLASLONG l = 0;
248246
vec_t *rowA = (vec_t *) & AO[0];
249247
__vector_pair rowB, rowB1;
250-
vec_t *rb = (vec_t *) & BO[0];
251-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
252-
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
248+
rowB = *((__vector_pair *)((void *)&BO[0]));
249+
rowB1 = *((__vector_pair *)((void *)&BO[4]));
253250
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
254251
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
255252
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
256253
__builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]);
257254
for (l = 1; l < temp; l++)
258255
{
259256
rowA = (vec_t *) & AO[l << 2];
260-
rb = (vec_t *) & BO[l << 3];
261-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
262-
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
257+
rowB = *((__vector_pair *)((void *)&BO[l << 3]));
258+
rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
263259
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
264260
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
265261
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@@ -291,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
291287
BLASLONG l = 0;
292288
vec_t *rowA = (vec_t *) & AO[0];
293289
__vector_pair rowB, rowB1;
294-
vec_t *rb = (vec_t *) & BO[0];
295-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
296-
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
290+
rowB = *((__vector_pair *)((void *)&BO[0]));
291+
rowB1 = *((__vector_pair *)((void *)&BO[4]));
297292
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
298293
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
299294
for (l = 1; l < temp; l++)
300295
{
301296
rowA = (vec_t *) & AO[l << 1];
302-
rb = (vec_t *) & BO[l << 3];
303-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
304-
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
297+
rowB = *((__vector_pair *)((void *)&BO[l << 3]));
298+
rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4]));
305299
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
306300
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
307301
}
@@ -403,17 +397,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
403397
BLASLONG l = 0;
404398
vec_t *rowA = (vec_t *) & AO[0];
405399
__vector_pair rowB;
406-
vec_t *rb = (vec_t *) & BO[0];
407-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
400+
rowB = *((__vector_pair *)((void *)&BO[0]));
408401
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
409402
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
410403
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
411404
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
412405
for (l = 1; l < temp; l++)
413406
{
414407
rowA = (vec_t *) & AO[l << 3];
415-
rb = (vec_t *) & BO[l << 2];
416-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
408+
rowB = *((__vector_pair *)((void *)&BO[l << 2]));
417409
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
418410
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
419411
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@@ -445,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
445437
BLASLONG l = 0;
446438
vec_t *rowA = (vec_t *) & AO[0];
447439
__vector_pair rowB;
448-
vec_t *rb = (vec_t *) & BO[0];
449-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
440+
rowB = *((__vector_pair *)((void *)&BO[0]));
450441
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
451442
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
452443
for (l = 1; l < temp; l++)
453444
{
454445
rowA = (vec_t *) & AO[l << 2];
455-
rb = (vec_t *) & BO[l << 2];
456-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
446+
rowB = *((__vector_pair *)((void *)&BO[l << 2]));
457447
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
458448
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
459449
}
@@ -481,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
481471
BLASLONG l = 0;
482472
vec_t *rowA = (vec_t *) & AO[0];
483473
__vector_pair rowB;
484-
vec_t *rb = (vec_t *) & BO[0];
485-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
474+
rowB = *((__vector_pair *)((void *)&BO[0]));
486475
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
487476
for (l = 1; l < temp; l++)
488477
{
489478
rowA = (vec_t *) & AO[l << 1];
490-
rb = (vec_t *) & BO[l << 2];
491-
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
479+
rowB = *((__vector_pair *)((void *)&BO[l << 2]));
492480
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
493481
}
494482
SAVE_ACC (&acc0, 0);

0 commit comments

Comments
 (0)