Skip to content

Commit e1b2502

Browse files
authored
Merge pull request #1316 from timmoon10/develop
Variable thread count for multi-threaded GEMMs
2 parents ebe8421 + a89d671 commit e1b2502

File tree

2 files changed: 87 additions and 110 deletions.

benchmark/gemm.c

Lines changed: 78 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -121,100 +121,124 @@ static void *huge_malloc(BLASLONG size){
121121
int main(int argc, char *argv[]){
122122

123123
FLOAT *a, *b, *c;
124-
FLOAT alpha[] = {1.0, 1.0};
124+
FLOAT alpha[] = {1.0, 0.0};
125125
FLOAT beta [] = {0.0, 0.0};
126-
char trans='N';
127-
blasint m, n, i, j;
126+
char transa = 'N';
127+
char transb = 'N';
128+
blasint m, n, k, i, j, lda, ldb, ldc;
128129
int loops = 1;
129-
int has_param_n=0;
130-
int l;
130+
int has_param_m = 0;
131+
int has_param_n = 0;
132+
int has_param_k = 0;
131133
char *p;
132134

133135
int from = 1;
134136
int to = 200;
135137
int step = 1;
136138

137139
struct timeval start, stop;
138-
double time1,timeg;
140+
double time1, timeg;
139141

140142
argc--;argv++;
141143

142-
if (argc > 0) { from = atol(*argv); argc--; argv++;}
143-
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
144-
if (argc > 0) { step = atol(*argv); argc--; argv++;}
144+
if (argc > 0) { from = atol(*argv); argc--; argv++; }
145+
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++; }
146+
if (argc > 0) { step = atol(*argv); argc--; argv++; }
145147

146-
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
148+
if ((p = getenv("OPENBLAS_TRANS"))) {
149+
transa=*p;
150+
transb=*p;
151+
}
152+
if ((p = getenv("OPENBLAS_TRANSA"))) {
153+
transa=*p;
154+
}
155+
if ((p = getenv("OPENBLAS_TRANSB"))) {
156+
transb=*p;
157+
}
158+
TOUPPER(transa);
159+
TOUPPER(transb);
147160

148-
fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c\n", from, to, step, trans);
161+
fprintf(stderr, "From : %3d To : %3d Step=%d : Transa=%c : Transb=%c\n", from, to, step, transa, transb);
149162

150-
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
151-
fprintf(stderr,"Out of Memory!!\n");exit(1);
163+
p = getenv("OPENBLAS_LOOPS");
164+
if ( p != NULL ) {
165+
loops = atoi(p);
152166
}
153167

154-
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
155-
fprintf(stderr,"Out of Memory!!\n");exit(1);
168+
if ((p = getenv("OPENBLAS_PARAM_M"))) {
169+
m = atoi(p);
170+
has_param_m=1;
171+
} else {
172+
m = to;
173+
}
174+
if ((p = getenv("OPENBLAS_PARAM_N"))) {
175+
n = atoi(p);
176+
has_param_n=1;
177+
} else {
178+
n = to;
179+
}
180+
if ((p = getenv("OPENBLAS_PARAM_K"))) {
181+
k = atoi(p);
182+
has_param_k=1;
183+
} else {
184+
k = to;
156185
}
157186

158-
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
187+
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) {
159188
fprintf(stderr,"Out of Memory!!\n");exit(1);
160189
}
161-
162-
p = getenv("OPENBLAS_LOOPS");
163-
if ( p != NULL )
164-
loops = atoi(p);
165-
166-
if ((p = getenv("OPENBLAS_PARAM_N"))) {
167-
n = atoi(p);
168-
has_param_n=1;
190+
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) {
191+
fprintf(stderr,"Out of Memory!!\n");exit(1);
192+
}
193+
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) {
194+
fprintf(stderr,"Out of Memory!!\n");exit(1);
169195
}
170196

171197
#ifdef linux
172198
srandom(getpid());
173199
#endif
174-
175-
for(j = 0; j < to; j++){
176-
for(i = 0; i < to * COMPSIZE; i++){
177-
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
178-
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
179-
c[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
180-
}
181-
}
182-
183-
184-
185-
fprintf(stderr, " SIZE Flops Time\n");
186200

187-
for(m = from; m <= to; m += step)
188-
{
201+
for (i = 0; i < m * k * COMPSIZE; i++) {
202+
a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
203+
}
204+
for (i = 0; i < k * n * COMPSIZE; i++) {
205+
b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
206+
}
207+
for (i = 0; i < m * n * COMPSIZE; i++) {
208+
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
209+
}
210+
211+
fprintf(stderr, " SIZE Flops Time\n");
189212

213+
for (i = from; i <= to; i += step) {
214+
190215
timeg=0;
191216

192-
if ( has_param_n == 1 && n <= m )
193-
n=n;
194-
else
195-
n=m;
217+
if (!has_param_m) { m = i; }
218+
if (!has_param_n) { n = i; }
219+
if (!has_param_k) { k = i; }
196220

221+
if (transa == 'N') { lda = m; }
222+
else { lda = k; }
223+
if (transb == 'N') { ldb = k; }
224+
else { ldb = n; }
225+
ldc = m;
197226

198-
199-
fprintf(stderr, " %6dx%d : ", (int)m, (int)n);
227+
fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k);
200228
gettimeofday( &start, (struct timezone *)0);
201229

202-
for (l=0; l<loops; l++)
203-
{
204-
205-
GEMM (&trans, &trans, &m, &n, &m, alpha, a, &m, b, &m, beta, c, &m );
206-
207-
208-
230+
for (j=0; j<loops; j++) {
231+
GEMM (&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc);
209232
}
210-
gettimeofday( &stop, (struct timezone *)0);
211-
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
233+
234+
gettimeofday( &stop, (struct timezone *)0);
235+
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
212236

213237
timeg = time1/loops;
214238
fprintf(stderr,
215239
" %10.2f MFlops %10.6f sec\n",
216240
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1);
217-
241+
218242
}
219243

220244
return 0;

driver/level3/level3_thread.c

Lines changed: 9 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -684,8 +684,6 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
684684
BLASLONG m = args -> m;
685685
BLASLONG n = args -> n;
686686
BLASLONG nthreads = args -> nthreads;
687-
BLASLONG divN, divT;
688-
int mode;
689687

690688
if (nthreads == 1) {
691689
GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
@@ -706,66 +704,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
706704
n = n_to - n_from;
707705
}
708706

709-
if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) {
707+
if ((m < 2 * SWITCH_RATIO) || (n < 2 * SWITCH_RATIO)) {
710708
GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
711709
return 0;
712710
}
713711

714-
divT = nthreads;
715-
divN = 1;
716-
717-
#if 0
718-
while ((GEMM_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
719-
do {
720-
divT --;
721-
divN = 1;
722-
while (divT * divN < nthreads) divN ++;
723-
} while ((divT * divN != nthreads) && (divT > 1));
712+
if (m < nthreads * SWITCH_RATIO) {
713+
nthreads = blas_quickdivide(m, SWITCH_RATIO);
724714
}
725-
#endif
726-
727-
// fprintf(stderr, "divN = %4ld divT = %4ld\n", divN, divT);
728-
729-
args -> nthreads = divT;
730-
731-
if (divN == 1){
732-
733-
gemm_driver(args, range_m, range_n, sa, sb, 0);
734-
} else {
735-
#ifndef COMPLEX
736-
#ifdef XDOUBLE
737-
mode = BLAS_XDOUBLE | BLAS_REAL;
738-
#elif defined(DOUBLE)
739-
mode = BLAS_DOUBLE | BLAS_REAL;
740-
#else
741-
mode = BLAS_SINGLE | BLAS_REAL;
742-
#endif
743-
#else
744-
#ifdef XDOUBLE
745-
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
746-
#elif defined(DOUBLE)
747-
mode = BLAS_DOUBLE | BLAS_COMPLEX;
748-
#else
749-
mode = BLAS_SINGLE | BLAS_COMPLEX;
750-
#endif
751-
#endif
752-
753-
#if defined(TN) || defined(TT) || defined(TR) || defined(TC) || \
754-
defined(CN) || defined(CT) || defined(CR) || defined(CC)
755-
mode |= (BLAS_TRANSA_T);
756-
#endif
757-
#if defined(NT) || defined(TT) || defined(RT) || defined(CT) || \
758-
defined(NC) || defined(TC) || defined(RC) || defined(CC)
759-
mode |= (BLAS_TRANSB_T);
760-
#endif
761-
762-
#ifdef OS_WINDOWS
763-
gemm_thread_n(mode, args, range_m, range_n, GEMM_LOCAL, sa, sb, divN);
764-
#else
765-
gemm_thread_n(mode, args, range_m, range_n, gemm_driver, sa, sb, divN);
766-
#endif
767-
715+
if (n < nthreads * SWITCH_RATIO) {
716+
nthreads = blas_quickdivide(n, SWITCH_RATIO);
768717
}
769718

719+
args -> nthreads = nthreads;
720+
721+
gemm_driver(args, range_m, range_n, sa, sb, 0);
722+
770723
return 0;
771724
}

0 commit comments

Comments
 (0)