Skip to content

Commit d32081a

Browse files
committed
Fix barriers in dtrsm specialized kernels
1 parent 9c66a77 commit d32081a

File tree

4 files changed: +10 additions, -3 deletions (lines changed)

4 files changed

+10
-3
lines changed

src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -101,6 +101,7 @@ else\n
101101
}\n
102102
}\n
103103

104+
barrier(CLK_LOCAL_MEM_FENCE);\n
104105
/*
105106
* the lower case
106107
*/
@@ -135,6 +136,7 @@ for (i = BLOCK_SIZE - 2; i >= 0; i--) {\n
135136
workspace[tx] = *(Bs + i*BLOCK_SIZE + tx); \n
136137
x = workspace + i + 1; \n
137138
y = Bs + i*BLOCK_SIZE; \n
139+
barrier(CLK_LOCAL_MEM_FENCE);\n
138140

139141
txw = (tx - i - 1); \n
140142

src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,7 @@ uint na)\n
9494
Bs[tx*BLOCK_SIZE+tx] = ONE / ( Bs[tx*BLOCK_SIZE+tx]) ;\n
9595
}\n
9696
}\n
97+
barrier(CLK_LOCAL_MEM_FENCE);\n
9798

9899
/* the upper case */
99100
for( i=0; i < BLOCK_SIZE; i++ ) {\n
@@ -110,6 +111,7 @@ uint na)\n
110111
//dtrmv
111112
workspace[tx] = *(Bs+i*BLOCK_SIZE+tx);\n
112113
y = Bs+i*BLOCK_SIZE;\n
114+
barrier(CLK_LOCAL_MEM_FENCE);\n
113115

114116
_Pragma("unroll")\n
115117
//for( j=tx; j < i; j++ )

src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -94,6 +94,7 @@ else\n
9494
Bs[tx*BLOCK_SIZE + tx] = ONE / (Bs[tx*BLOCK_SIZE + tx]); \n
9595
}\n
9696
}\n
97+
barrier(CLK_LOCAL_MEM_FENCE);\n
9798

9899

99100
/* the upper case */
@@ -111,6 +112,7 @@ for (i = 0; i < BLOCK_SIZE; i++) {\n
111112
//dtrmv
112113
workspace[tx] = *(Bs + i*BLOCK_SIZE + tx); \n
113114
y = Bs + i*BLOCK_SIZE; \n
115+
barrier(CLK_LOCAL_MEM_FENCE);\n
114116

115117
_Pragma("unroll")\n
116118
//for( j=tx; j < i; j++ )

src/library/blas/xtrsm.cc

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1683,7 +1683,7 @@ clblasDtrsm(
16831683
const cl_event *eventWaitList,
16841684
cl_event *events)
16851685
{
1686-
/*
1686+
#if 0
16871687
CHECK_QUEUES(numCommandQueues, commandQueues);
16881688
CHECK_EVENTS(numEventsInWaitList, eventWaitList);
16891689

@@ -1718,7 +1718,8 @@ clblasDtrsm(
17181718
functor->release();
17191719

17201720
return res;
1721-
*/
1721+
1722+
#else
17221723
bool specialCaseHandled = false;
17231724

17241725
//outer block size = 192
@@ -1780,7 +1781,7 @@ clblasDtrsm(
17801781
numEventsInWaitList,
17811782
eventWaitList,
17821783
events);
1783-
1784+
#endif
17841785
}
17851786

17861787
extern "C"

0 commit comments

Comments
 (0)