@@ -100,6 +100,8 @@ static void force_trsm_column_major(Args & args)
100100 assert (false ); \
101101 }
102102
103+ #define min (x,y ) ((x)<(y)?(x):(y))
104+
103105static void makeKernel (
104106 cl_kernel *clKernel,
105107 cl_command_queue clQueue,
@@ -589,6 +591,81 @@ static clblasStatus gpu_dtrsm192(
589591
590592 diag_dtrtri192 (commandQueues[0 ], N, uplo, diag, A, offA, InvA, lda, inner_block_size, outer_block_size, events);
591593
594+ if (transA == clblasNoTrans)
595+ {
596+ /* the non-transpose case */
597+ if (uplo == clblasLower)
598+ {
599+ /* the lower case */
600+ /* handle the first block separately with alpha */
601+ // lower is not implemented yet
602+
603+
604+ }
605+ else
606+ {
607+ /* the upper case */
608+ /* handle the first block separately with alpha */
609+ int nn = min (outer_block_size, (int )N);
610+ // DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,0), _(InvA,0,0), zero, _(X,0,0) );
611+ err = clblasDgemm (clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, alpha, B, offB, ldb, InvA, offInvA, ldInvA, zero, X, offX, ldX, 1 , commandQueues, 0 , NULL , events);
612+ CL_CHECK (err);
613+
614+ if (outer_block_size < N)
615+ {
616+
617+ // DGEMM_RIGHT( M, N-nb, nb, neg_one, _(X,0,0), _(A,0,nb), alpha, _(B,0,nb) );
618+ err = clblasDgemm (clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - outer_block_size, outer_block_size, neg_one, X, offX, ldX, A, offA + lda*outer_block_size, lda, alpha, B, offB + outer_block_size*ldb, ldb, 1 , commandQueues, 0 , NULL , events);
619+ assert (err == CL_SUCCESS);
620+
621+ /* the remaining blocks */
622+ for (i = outer_block_size; i < N; i += outer_block_size)
623+ {
624+ nn = min (outer_block_size, (int )N - i);
625+ // DGEMM_RIGHT( M, nn, nn, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) );
626+ err = clblasDgemm (clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, one, B, offB + i*ldb, ldb, InvA, offInvA + i*outer_block_size, ldInvA, zero, X, offX + i*ldX, ldX, 1 , commandQueues, 0 , NULL , events);
627+ assert (err == CL_SUCCESS);
628+
629+ if (i + outer_block_size >= N)
630+ break ;
631+
632+ // DGEMM_RIGHT( M, N-i-nb, nb, neg_one, _(X,0,i), _(A,i,i+nb), one, _(B,0,i+nb) );
633+ err = clblasDgemm (clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - i - outer_block_size, outer_block_size, neg_one, X, offX + i*ldX, ldX, A, offA + i + (outer_block_size + i)*lda, lda, one, B, offB + (i + outer_block_size)*ldb, ldb, 1 , commandQueues, 0 , NULL , events);
634+ assert (err == CL_SUCCESS);
635+ }
636+ }
637+ }
638+ }
639+ else
640+ {
641+
642+ /* the transpose case */
643+ // trans is not implemented yet
644+ }
645+
646+ {
647+ size_t src_origin[3 ] = { 0 , 0 , 0 };
648+ size_t dst_origin[3 ] = { offB*sizeof (double ), 0 , 0 };
649+ size_t region[3 ] = { M*sizeof (double ), N, 1 };
650+
651+
652+ err = clEnqueueCopyBufferRect (commandQueues[0 ],
653+ X,
654+ B,
655+ src_origin,
656+ dst_origin,
657+ region,
658+ ldX*sizeof (double ), 0 ,
659+ ldb*sizeof (double ), 0 ,
660+ 0 , NULL ,
661+ events);
662+ CL_CHECK (err);
663+
664+ clReleaseMemObject (InvA);
665+ clReleaseMemObject (X);
666+
667+ }
668+
592669 specialCaseHandled = true ;
593670 return clblasSuccess;
594671
0 commit comments