Skip to content

Commit f1bf040

Browse files
authored
Merge pull request #2988 from xiegengxin/smp-asum
Improve the performance of dasum and sasum when SMP is defined
2 parents 613e3b2 + d6e7e05 commit f1bf040

File tree

2 files changed

+110
-15
lines changed

2 files changed

+110
-15
lines changed

kernel/x86_64/dasum.c

Lines changed: 57 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -58,25 +58,73 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1)
5858
}
5959

6060
#endif
61-
62-
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
61+
/*
 * asum_compute: single-threaded sum over n elements of x taken with
 * stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. For inc_x == 1 the work is
 * delegated to dasum_kernel(n, x). Otherwise n is scaled by inc_x and
 * ABS_K(x[i]) is accumulated for i = 0, inc_x, 2*inc_x, ... while i < n.
 * ABS_K is defined outside this view (presumably an absolute-value macro).
 *
 * NOTE(review): this span is a rendered diff hunk. A bare "NN-" / "NN+"
 * line is a gutter marker; the statement after an "NN-" marker appears to
 * be the removed pre-change text and the one after "NN+" its replacement.
 * NOTE(review): n *= inc_x could overflow BLASLONG for very large
 * n * inc_x; BLASLONG's width is not visible here, confirm.
 */
static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
6362
{
64-
BLASLONG i=0;
63+
BLASLONG i = 0;
6564
FLOAT sumf = 0.0;
65+
66+
if (n <= 0 || inc_x <= 0) return (sumf); /* empty or non-positive stride: result stays 0.0 */
6667

67-
if (n <= 0 || inc_x <= 0) return(sumf);
68-
69-
if ( inc_x == 1 ) {
68+
if (inc_x == 1) {
7069
sumf = dasum_kernel(n, x); /* unit stride: use the dedicated kernel */
71-
}
70+
}
7271
else {
7372
n *= inc_x; /* n becomes the exclusive upper bound for the index i */
74-
75-
while(i < n) {
73+
while (i < n) {
7674
sumf += ABS_K(x[i]);
7775
i += inc_x;
7876
}
7977
}
8078
return(sumf);
8179
}
8280

81+
#if defined(SMP)
/*
 * Worker callback handed to blas_level1_thread_with_return_value() by
 * CNAME. Computes asum_compute(n, x, inc_x) and stores the value in the
 * first FLOAT slot that result points to. Always returns 0.
 *
 * The dummy* parameters are not used here; presumably they only exist so
 * the signature matches what the threading helper calls (confirm there).
 */
static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
{
	FLOAT partial = asum_compute(n, x, inc_x);

	result[0] = partial;
	return 0;
}

/* Threading helper defined outside this file; prototype repeated locally. */
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
90+
91+
/*
 * Public asum entry point: returns the value asum_compute() produces for
 * the n elements of x taken with stride inc_x.
 *
 * Without SMP the call is forwarded straight to asum_compute().
 *
 * With SMP:
 *   - n <= 100000 or inc_x <= 0 keeps the work on the calling thread;
 *   - otherwise nthreads = min(num_cpu_avail(1), n / 100000), and if that
 *     is not 1 the job is handed to blas_level1_thread_with_return_value()
 *     with asum_thread_function as the per-thread callback.
 *
 * The per-thread results are then read back from `result`, one FLOAT per
 * thread at a spacing of 2 * sizeof(double) bytes, and added up in thread
 * order. (That the helper writes the slots at this spacing is assumed from
 * the read-back loop; confirm against the helper.)
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
	int nthreads;
	FLOAT dummy_alpha;
#endif
	FLOAT sumf = 0.0;

#if defined(SMP)
	int num_cpu = num_cpu_avail(1);
	if (n <= 100000 || inc_x <= 0)
		nthreads = 1;
	else
		nthreads = num_cpu < n/100000 ? num_cpu : n/100000;

	if (nthreads == 1) {
		sumf = asum_compute(n, x, inc_x);
	} else {
		int mode, i;
		/* Backing store for the per-thread partial sums. */
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		FLOAT *ptr;
#if !defined(DOUBLE)
		mode = BLAS_SINGLE | BLAS_REAL;
#else
		mode = BLAS_DOUBLE | BLAS_REAL;
#endif
		/*
		 * asum has no second vector operand. Pass NULL for it instead of
		 * the value of an uninitialized local pointer, which is undefined
		 * behaviour to read.
		 */
		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
		ptr = (FLOAT *)result;
		for (i = 0; i < nthreads; i++) {
			sumf += (*ptr);
			/* Advance by the fixed slot size, not by sizeof(FLOAT). */
			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
		}
	}
#else
	sumf = asum_compute(n, x, inc_x);
#endif
	return(sumf);
}
130+

kernel/x86_64/sasum.c

Lines changed: 53 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -67,24 +67,71 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1)
6767

6868
#endif
6969

70-
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
70+
/*
 * asum_compute: single-threaded sum over n elements of x taken with
 * stride inc_x.
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. For inc_x == 1 the work is
 * delegated to sasum_kernel(n, x). Otherwise n is scaled by inc_x and
 * ABS_K(x[i]) is accumulated for i = 0, inc_x, 2*inc_x, ... while i < n.
 * ABS_K is defined outside this view (presumably an absolute-value macro).
 *
 * NOTE(review): this span is a rendered diff hunk. A bare "NN-" / "NN+"
 * line is a gutter marker; the statement after an "NN-" marker appears to
 * be the removed pre-change text and the one after "NN+" its replacement.
 * NOTE(review): n *= inc_x could overflow BLASLONG for very large
 * n * inc_x; BLASLONG's width is not visible here, confirm.
 */
static FLOAT asum_compute(BLASLONG n, FLOAT * x, BLASLONG inc_x)
7171
{
72-
BLASLONG i=0;
72+
BLASLONG i = 0;
7373
FLOAT sumf = 0.0;
74+
75+
if (n <= 0 || inc_x <= 0) return (sumf); /* empty or non-positive stride: result stays 0.0 */
7476

75-
if (n <= 0 || inc_x <= 0) return(sumf);
76-
77-
if ( inc_x == 1 ) {
77+
if (inc_x == 1) {
7878
sumf = sasum_kernel(n, x); /* unit stride: use the dedicated kernel */
7979
}
8080
else {
81-
8281
n *= inc_x; /* n becomes the exclusive upper bound for the index i */
8382
while(i < n) {
8483
sumf += ABS_K(x[i]);
8584
i += inc_x;
8685
}
86+
}
87+
return (sumf);
88+
}
8789

90+
#if defined(SMP)
91+
/*
 * Worker callback handed to blas_level1_thread_with_return_value() by
 * CNAME. Computes asum_compute(n, x, inc_x) and stores the value in the
 * first FLOAT slot that result points to. Always returns 0.
 *
 * The dummy* parameters are not used here; presumably they only exist so
 * the signature matches what the threading helper calls (confirm there).
 */
static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5)
{
	FLOAT partial = asum_compute(n, x, inc_x);

	result[0] = partial;
	return 0;
}

/* Threading helper defined outside this file; prototype repeated locally. */
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int(*function)(), int nthreads);
#endif
99+
100+
/*
 * Public asum entry point: returns the value asum_compute() produces for
 * the n elements of x taken with stride inc_x.
 *
 * Without SMP this is a direct call to asum_compute(). With SMP, inputs
 * with n <= 100000 or inc_x <= 0 are handled on the calling thread. Larger
 * ones use min(num_cpu_avail(1), n / 100000) threads through
 * blas_level1_thread_with_return_value(), after which the per-thread
 * results are summed in thread order.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
	FLOAT unused_alpha;
	FLOAT total = 0.0;
	int workers = 1;
	int cpus = num_cpu_avail(1);

	/* Only large inputs with a positive stride get a computed thread count. */
	if (n > 100000 && inc_x > 0) {
		workers = (cpus < n / 100000) ? cpus : n / 100000;
	}

	if (workers == 1) {
		return asum_compute(n, x, inc_x);
	}

	{
		/* One slot of 2 * sizeof(double) bytes per possible thread. */
		char slots[MAX_CPU_NUMBER * sizeof(double) * 2];
		int t;
#if defined(DOUBLE)
		int mode = BLAS_DOUBLE | BLAS_REAL;
#else
		int mode = BLAS_SINGLE | BLAS_REAL;
#endif
		blas_level1_thread_with_return_value(mode, n, 0, 0, &unused_alpha, x, inc_x, NULL, 0, slots, 0, (void *)asum_thread_function, workers);

		/* Read the leading FLOAT of each slot and accumulate in order. */
		for (t = 0; t < workers; t++) {
			total += *(FLOAT *)(slots + t * (sizeof(double) * 2));
		}
	}
	return total;
#else
	return asum_compute(n, x, inc_x);
#endif
}

0 commit comments

Comments
 (0)