Skip to content

Commit 31c9214

Browse files
author
Timmy
committed
mod192 dtrsm using dtrtri
1 parent ba1bbdd commit 31c9214

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed

src/library/blas/xtrsm.cc

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,8 @@ static void force_trsm_column_major(Args & args)
100100
assert(false); \
101101
}
102102

103+
#define min(x,y) ((x)<(y)?(x):(y))
104+
103105
static void makeKernel(
104106
cl_kernel *clKernel,
105107
cl_command_queue clQueue,
@@ -589,6 +591,81 @@ static clblasStatus gpu_dtrsm192(
589591

590592
diag_dtrtri192(commandQueues[0], N, uplo, diag, A, offA, InvA, lda, inner_block_size, outer_block_size, events);
591593

594+
if (transA == clblasNoTrans)
595+
{
596+
/* the non-transpose case */
597+
if (uplo == clblasLower)
598+
{
599+
/* the lower case */
600+
/* handle the first block separately with alpha */
601+
// lower is not implemented yet
602+
603+
604+
}
605+
else
606+
{
607+
/* the upper case */
608+
/* handle the first block separately with alpha */
609+
int nn = min(outer_block_size, (int)N);
610+
//DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,0), _(InvA,0,0), zero, _(X,0,0) );
611+
err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, alpha, B, offB, ldb, InvA, offInvA, ldInvA, zero, X, offX, ldX, 1, commandQueues, 0, NULL, events);
612+
CL_CHECK(err);
613+
614+
if (outer_block_size < N)
615+
{
616+
617+
//DGEMM_RIGHT( M, N-nb, nb, neg_one, _(X,0,0), _(A,0,nb), alpha, _(B,0,nb) );
618+
err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - outer_block_size, outer_block_size, neg_one, X, offX, ldX, A, offA + lda*outer_block_size, lda, alpha, B, offB + outer_block_size*ldb, ldb, 1, commandQueues, 0, NULL, events);
619+
assert(err == CL_SUCCESS);
620+
621+
/* the remaining blocks */
622+
for (i = outer_block_size; i < N; i += outer_block_size)
623+
{
624+
nn = min(outer_block_size, (int)N - i);
625+
//DGEMM_RIGHT( M, nn, nn, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) );
626+
err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, one, B, offB + i*ldb, ldb, InvA, offInvA + i*outer_block_size, ldInvA, zero, X, offX + i*ldX, ldX, 1, commandQueues, 0, NULL, events);
627+
assert(err == CL_SUCCESS);
628+
629+
if (i + outer_block_size >= N)
630+
break;
631+
632+
//DGEMM_RIGHT( M, N-i-nb, nb, neg_one, _(X,0,i), _(A,i,i+nb), one, _(B,0,i+nb) );
633+
err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - i - outer_block_size, outer_block_size, neg_one, X, offX + i*ldX, ldX, A, offA + i + (outer_block_size + i)*lda, lda, one, B, offB + (i + outer_block_size)*ldb, ldb, 1, commandQueues, 0, NULL, events);
634+
assert(err == CL_SUCCESS);
635+
}
636+
}
637+
}
638+
}
639+
else
640+
{
641+
642+
/* the transpose case */
643+
// trans is not implemented yet
644+
}
645+
646+
{
647+
size_t src_origin[3] = { 0, 0, 0 };
648+
size_t dst_origin[3] = { offB*sizeof(double), 0, 0 };
649+
size_t region[3] = { M*sizeof(double), N, 1 };
650+
651+
652+
err = clEnqueueCopyBufferRect(commandQueues[0],
653+
X,
654+
B,
655+
src_origin,
656+
dst_origin,
657+
region,
658+
ldX*sizeof(double), 0,
659+
ldb*sizeof(double), 0,
660+
0, NULL,
661+
events);
662+
CL_CHECK(err);
663+
664+
clReleaseMemObject(InvA);
665+
clReleaseMemObject(X);
666+
667+
}
668+
592669
specialCaseHandled = true;
593670
return clblasSuccess;
594671

0 commit comments

Comments
 (0)