@@ -165,7 +165,6 @@ static void makeKernel(
165165 CL_CHECK (err)
166166 }
167167 else {
168- // std::cout << kernelSource << std::endl;
169168 clProgram = clCreateProgramWithSource (
170169 clContext,
171170 1 , &kernelSource,
@@ -749,7 +748,6 @@ cl_int diag_dtrtri128(
749748 int outer_block_size,
750749 cl_event *event)
751750{
752- std::cout << " enter diag_dtrtri128 " << std::endl;
753751 const char *diag_dtrtri_kernel_upper_KernelSource = NULL ;
754752 cl_kernel *diag_dtrtri_kernel_upper_ClKernel = NULL ;
755753 size_t diag_dtrtri_kernel_upper_KernelBinarySize = 0 ;
@@ -871,6 +869,9 @@ cl_int diag_dtrtri128(
871869 err = clEnqueueNDRangeKernel (queue, *diag_dtrtri_kernel_upper_ClKernel, 1 , NULL ,
872870 globalThreads, globalLocal,
873871 0 , NULL , NULL );
872+ CL_CHECK (err);
873+ // err = clFinish(queue);
874+ // CL_CHECK(err);
874875
875876 if (err != CL_SUCCESS) {
876877 // printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err );
@@ -899,7 +900,8 @@ cl_int diag_dtrtri128(
899900 queue,
900901 A, offA, d_dinvA, i, lda, M, event);
901902 CL_CHECK (err);
902-
903+ // err = clFinish(queue);
904+ // CL_CHECK(err);
903905 break ;
904906
905907 case 32 :
@@ -915,6 +917,8 @@ cl_int diag_dtrtri128(
915917 queue,
916918 A, offA, d_dinvA, i, lda, M, event);
917919 CL_CHECK (err);
920+ // err = clFinish(queue);
921+ // CL_CHECK(err);
918922 err = call_kernel_triple_update128 (&triple_dgemm_update_128_32_PART2_R_clKernel,
919923 triple_dgemm_update_128_32_PART2_R_src,
920924 TrtriBuildOptions,
@@ -924,6 +928,8 @@ cl_int diag_dtrtri128(
924928 queue,
925929 A, offA, d_dinvA, i, lda, M, event);
926930 CL_CHECK (err);
931+ // err = clFinish(queue);
932+ // CL_CHECK(err);
927933
928934 break ;
929935
@@ -940,6 +946,8 @@ cl_int diag_dtrtri128(
940946 queue,
941947 A, offA, d_dinvA, i, lda, M, event);
942948 CL_CHECK (err);
949+ // err = clFinish(queue);
950+ // CL_CHECK(err);
943951
944952 err = call_kernel_triple_update128 (&triple_dgemm_update_128_64_PART2_R_clKernel,
945953 triple_dgemm_update_128_64_PART2_R_src,
@@ -950,6 +958,8 @@ cl_int diag_dtrtri128(
950958 queue,
951959 A, offA, d_dinvA, i, lda, M, event);
952960 CL_CHECK (err);
961+ // err = clFinish(queue);
962+ // CL_CHECK(err);
953963
954964 break ;
955965
@@ -1020,8 +1030,6 @@ static clblasStatus gpu_dtrsm128(
10201030{
10211031 if (order != clblasColumnMajor)
10221032 return clblasNotImplemented;
1023- if (M < 16 || N < 16 )
1024- return clblasNotImplemented;
10251033
10261034 // for now
10271035 if (side == clblasRight)
@@ -1076,7 +1084,7 @@ static clblasStatus gpu_dtrsm128(
10761084 err = clearBuffer (commandQueues[0 ], InvA, size_InvA);
10771085 CL_CHECK (err);
10781086
1079- err = diag_dtrtri128 (commandQueues[0 ], N , uplo, diag, A, offA, InvA, ldA, inner_block_size, outer_block_size, events);
1087+ err = diag_dtrtri128 (commandQueues[0 ], M , uplo, diag, A, offA, InvA, ldA, inner_block_size, outer_block_size, events);
10801088 CL_CHECK (err);
10811089
10821090 //
@@ -1134,23 +1142,22 @@ static clblasStatus gpu_dtrsm128(
11341142 {
11351143 /* the upper case */
11361144 /* handle the first block seperately with alpha */
1137- std::cout << " dtrtri trsm " << std::endl;
11381145 int mm = (M % outer_block_size == 0 ) ? outer_block_size : (M % outer_block_size);
11391146 i = M - mm;
1140- // DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
1147+ DGEMM_LEFT (mm, N, mm, alpha, _ (InvA, 0 , i), _ (B, i, 0 ), zero, _ (X, i, 0 ));
11411148
11421149 if (i - outer_block_size >= 0 )
11431150 {
1144- // DGEMM_LEFT(i, N, mm, neg_one, _(A, 0, i), _(X, i, 0), alpha, _(B, 0, 0));
1151+ DGEMM_LEFT (i, N, mm, neg_one, _ (A, 0 , i), _ (X, i, 0 ), alpha, _ (B, 0 , 0 ));
11451152
11461153 /* the rest blocks */
11471154 for (i = M - mm - outer_block_size; i >= 0 ; i -= outer_block_size) {
1148- // DGEMM_LEFT(outer_block_size, N, outer_block_size, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0));
1155+ DGEMM_LEFT (outer_block_size, N, outer_block_size, one, _ (InvA, 0 , i), _ (B, i, 0 ), zero, _ (X, i, 0 ));
11491156
11501157 if (i - outer_block_size < 0 )
11511158 break ;
11521159
1153- // DGEMM_LEFT(i, N, outer_block_size, neg_one, _(A, 0, i), _(X, i, 0), one, _(B, 0, 0));
1160+ DGEMM_LEFT (i, N, outer_block_size, neg_one, _ (A, 0 , i), _ (X, i, 0 ), one, _ (B, 0 , 0 ));
11541161 }
11551162 }
11561163 }
@@ -1191,7 +1198,6 @@ static clblasStatus gpu_dtrsm128(
11911198 {
11921199 /* the upper case */
11931200 /* handle the first block seperately with alpha */
1194- std::cout << " dtrtri trsm " << std::endl;
11951201 int mm = min (outer_block_size, (int )M);
11961202 DGEMM_LEFT (mm, N, mm, alpha, _ (InvA, 0 , 0 ), _ (B, 0 , 0 ), zero, _ (X, 0 , 0 ));
11971203
0 commit comments