@@ -33,34 +33,44 @@ using namespace kernelParameters;
3333
3434
3535template <typename matrixType, typename vectorType>
36- __device__ void GaussEliminationKernel (matrixType* matrixCopy, vectorType* prod, unsigned int threadNo, matrixParameters matrixParam)
36+ __device__ void GaussEliminationKernel (matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition , matrixParameters matrixParam)
3737{
38- #define A (I, J ) matrixCopy[(I) * matrixParam.blockColSize + (J)]
38+ auto matrixIndex = [=](unsigned short x, unsigned short y){
39+ return (x * matrixParam.blockColSize + y);
40+ };
3941
40- int x = threadNo/matrixParam.blockRowSize ;
41- int y = threadNo % matrixParam.blockColSize ;
42+ auto vectorIndex = [=](unsigned short row, unsigned short elemNo){
43+ return (row * matrixParam.blockRowSize + elemNo);
44+ };
45+
46+ unsigned short x = threadNo/matrixParam.blockRowSize ;
47+ unsigned short y = threadNo % matrixParam.blockColSize ;
4248
4349 matrixType pivot, weight = 0.0 ;
4450
4551 __syncthreads ();
4652
47- for (int currentRow=0 ; currentRow < matrixParam.blockRowSize ; currentRow++)
53+ for (unsigned short currentRow=0 ; currentRow < matrixParam.blockRowSize ; currentRow++)
4854 {
49- pivot = A (currentRow, currentRow);
50-
55+ if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockColSize )){
56+ pivot = matrixCopy[matrixIndex (currentRow, currentRow)];
57+ }
5158 __syncthreads ();
5259
53- A (currentRow, y) /= pivot;
54- if (y==0 ) prod[currentRow] /= pivot;
60+ if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockColSize )){
61+
62+ matrixCopy[matrixIndex (currentRow, y)] /= pivot;
63+ if (y==0 ) prod[vectorIndex (row, currentRow)] /= pivot;
5564
56- weight = A (x, currentRow);
65+ weight = matrixCopy[ matrixIndex (x, currentRow)] ;
5766
58- if (x!=currentRow)
59- {
60- A (x, y) -= weight * A (currentRow, y);
61- if (y==0 ) prod[x] -= weight * prod[currentRow];
67+ if (x!=currentRow)
68+ {
69+ matrixCopy[matrixIndex (x, y)] -= weight * matrixCopy[matrixIndex (currentRow, y)];
70+ if (y==0 ) prod[vectorIndex (row, x)] -= weight * prod[vectorIndex (row, x)];
71+ }
6272 }
63-
73+
6474 __syncthreads ();
6575 }
6676
@@ -71,69 +81,66 @@ template<typename matrixType, typename vectorType>
7181__global__ void FirstSymmetricIterationKernel (matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long * d_partition_offsets, const unsigned long * d_row_ptr, const unsigned long * d_col_ind, const unsigned long * d_dia_ptr, matrixParameters matrixParam)
7282{
7383
74- auto matrixIndex = [=](unsigned long row, unsigned short threadNo){
75- return (d_row_ptr[row] * matrixParam.blockSize + threadNo);
76- };
77-
78- auto matrixDiagonalIndex = [=](unsigned long row, unsigned short threadNo){
79- return (d_dia_ptr[row] * matrixParam.blockSize + threadNo);
84+ auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long * ptr_array){
85+ return (ptr_array[row] * matrixParam.blockSize + threadNo);
8086 };
8187
8288 auto vectorIndex = [=](unsigned long row, unsigned short elemNo){
8389 return (row * matrixParam.blockRowSize + elemNo);
8490 };
8591
86- unsigned long row = (blockIdx .x * blockDim .x + threadIdx .x )/MVP_WARP_SIZE;
87- unsigned short localRow = row % matrixParam.rowsPerBlock ;
92+ auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){
93+ return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
94+ };
95+
96+ unsigned long origRow = (blockIdx .x * blockDim .x + threadIdx .x )/MVP_WARP_SIZE;
97+ unsigned short localRow = origRow % matrixParam.rowsPerBlock ;
8898 unsigned short threadNo = threadIdx .x % MVP_WARP_SIZE;
8999
90100 unsigned short blockCol = threadNo % matrixParam.blockColSize ;
91101
92- extern __shared__ vectorType tempBuffer[];
102+ extern __shared__ matrixType tempBuffer[];
93103
94- for (auto iPartition = 1ul ; iPartition < matrixParam.nPartition ; iPartition++)
104+ for (auto iPartition = 1ul ; iPartition < matrixParam.nPartition + 1 ; iPartition++)
95105 {
96106
97107 /* We move to the set of rows present in the current partition.*/
98- row += d_partition_offsets[iPartition-1 ];
99-
108+ unsigned long row = origRow + d_partition_offsets[iPartition-1 ];
100109 bool rowInPartition = (row < d_partition_offsets[iPartition]);
101110
102111 if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockRowSize )) {
103112
104113 prod[vectorIndex (row, threadNo)] = 0.0 ;
105- tempBuffer[vectorIndex (localRow, threadNo)] = 0.0 ;
114+ tempBuffer[matrixParam. shrdMemIndex (localRow, threadNo)] = 0.0 ;
106115 }
107116
108117 __syncthreads ();
109118
110119 if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.activeThreads )) {
111120
112- for (unsigned long index = matrixIndex (row, threadNo); index < matrixDiagonalIndex (row, 0 ); index+=matrixParam.activeThreads )
121+ for (unsigned long index = matrixIndex (row, threadNo, d_row_ptr ); index < matrixIndex (row, 0 , d_dia_ptr ); index+=matrixParam.activeThreads )
113122 {
114-
123+
115124 unsigned short blockNo = index/matrixParam.blockSize ;
116125 unsigned short blockRow = (index/matrixParam.blockColSize ) % matrixParam.blockRowSize ;
117126
118- atomicAdd (&tempBuffer[vectorIndex (localRow, blockRow)], matrix[index] * vec[(d_col_ind[ blockNo]) * matrixParam. blockColSize + blockCol]);
127+ atomicAdd (&tempBuffer[matrixParam. shrdMemIndex (localRow, blockRow)], matrix[index] * vec[vectorMultIndex ( blockNo, blockCol) ]);
119128 }
120129 }
121130
122131 __syncthreads ();
123132
124- if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockRowSize )) {
125-
126- prod[vectorIndex (row, threadNo)] = vec[vectorIndex (row, threadNo)] - tempBuffer[vectorIndex (localRow, threadNo)];
127- }
128-
129- __syncthreads ();
130-
131133 if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockSize )) {
132134
133- tempBuffer[vectorIndex (localRow, threadNo)] = matrix[matrixDiagonalIndex (row, threadNo)];
134- GaussEliminationKernel (&tempBuffer[vectorIndex (localRow, threadNo)], &prod[vectorIndex (row, threadNo/matrixParam.blockColSize )], threadNo, matrixParam);
135+ /* Write the result to the product vector.*/
136+ if (threadNo < matrixParam.blockRowSize ) prod[vectorIndex (row, threadNo)] = vec[vectorIndex (row, threadNo)] - tempBuffer[matrixParam.shrdMemIndex (localRow, threadNo)];
137+
138+ /* Reinitialize the memory to hold the matrix diagonal elements now.*/
139+ tempBuffer[matrixParam.shrdMemIndex (localRow, threadNo)] = matrix[matrixIndex (row, threadNo, d_dia_ptr)];
135140 }
136141
142+ GaussEliminationKernel (&tempBuffer[matrixParam.shrdMemIndex (localRow, 0 )], prod, row, threadNo, rowInPartition, matrixParam);
143+
137144 __syncthreads ();
138145
139146 }
@@ -144,18 +151,18 @@ template<typename matrixType, typename vectorType>
144151__global__ void SecondSymmetricIterationKernel (matrixType* matrix, vectorType* prod, const unsigned long * d_partition_offsets, const unsigned long * d_row_ptr, const unsigned long * d_col_ind, const unsigned long * d_dia_ptr, matrixParameters matrixParam)
145152{
146153
147- auto matrixIndex = [=](unsigned long row, unsigned short threadNo){
148- return (d_row_ptr[row] * matrixParam.blockSize + threadNo);
149- };
150-
151- auto matrixDiagonalIndex = [=](unsigned long row, unsigned short threadNo){
152- return (d_dia_ptr[row] * matrixParam.blockSize + threadNo);
154+ auto matrixIndex = [=](unsigned long row, unsigned short threadNo, const unsigned long * ptr_array){
155+ return (ptr_array[row] * matrixParam.blockSize + threadNo);
153156 };
154157
155158 auto vectorIndex = [=](unsigned long row, unsigned short elemNo){
156159 return (row * matrixParam.blockRowSize + elemNo);
157160 };
158161
162+ auto vectorMultIndex = [=](unsigned short blockNo, unsigned short blockCol){
163+ return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
164+ };
165+
159166 unsigned long row = (blockIdx .x * blockDim .x + threadIdx .x )/MVP_WARP_SIZE;
160167 unsigned short localRow = row % matrixParam.rowsPerBlock ;
161168 unsigned short threadNo = threadIdx .x % MVP_WARP_SIZE;
@@ -164,58 +171,60 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p
164171
165172 extern __shared__ vectorType tempBuffer[];
166173
167- for (auto iPartition = 1ul ; iPartition < matrixParam.nPartition ; iPartition++)
174+ for (auto iPartition = 1ul ; iPartition < matrixParam.nPartition + 1 ; iPartition++)
168175 {
169176
170177 /* We move to the set of rows present in the current partition.*/
178+ row = (blockIdx .x * blockDim .x + threadIdx .x )/MVP_WARP_SIZE;
171179 row += d_partition_offsets[iPartition-1 ];
172180
173181 bool rowInPartition = (row < d_partition_offsets[iPartition]);
174182
175183 if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockRowSize )) {
176184
177- tempBuffer[vectorIndex (localRow, threadNo)] = 0.0 ;
185+ tempBuffer[matrixParam. shrdMemIndex (localRow, threadNo)] = 0.0 ;
178186 }
179187
180188 __syncthreads ();
181189
182190 if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.activeThreads )) {
183191
184- for (unsigned long index = matrixDiagonalIndex (row, threadNo); index < matrixDiagonalIndex (row, 0 ) + matrixParam.blockSize ; index+=matrixParam.activeThreads )
192+ for (unsigned long index = matrixIndex (row, threadNo, d_dia_ptr ); index < matrixIndex (row, matrixParam.blockSize , d_dia_ptr) ; index+=matrixParam.activeThreads )
185193 {
186194
187195 unsigned short blockNo = index/matrixParam.blockSize ;
188196 unsigned short blockRow = (index/matrixParam.blockColSize ) % matrixParam.blockRowSize ;
189197
190- atomicAdd (&tempBuffer[vectorIndex (localRow, blockRow)], matrix[index] * prod[(d_col_ind[ blockNo]) * matrixParam. blockColSize + blockCol]);
198+ atomicAdd (&tempBuffer[matrixParam. shrdMemIndex (localRow, blockRow)], matrix[index] * prod[vectorMultIndex ( blockNo, blockCol) ]);
191199 }
192- }
193200
194- if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.activeThreads )) {
195-
196- for (unsigned long index = matrixDiagonalIndex (row, threadNo); index < matrixIndex (row+1 , 0 ); index+=matrixParam.activeThreads )
201+ for (unsigned long index = matrixIndex (row, threadNo, d_dia_ptr); index < matrixIndex (row+1 , 0 , d_row_ptr); index+=matrixParam.activeThreads )
197202 {
198203
199204 unsigned short blockNo = index/matrixParam.blockSize ;
200205 unsigned short blockRow = (index/matrixParam.blockColSize ) % matrixParam.blockRowSize ;
201206
202- atomicAdd (&tempBuffer[vectorIndex (localRow, blockRow)], -(matrix[index] * prod[(d_col_ind[ blockNo]) * matrixParam. blockColSize + blockCol]));
207+ atomicAdd (&tempBuffer[matrixParam. shrdMemIndex (localRow, blockRow)], -(matrix[index] * prod[vectorMultIndex ( blockNo, blockCol) ]));
203208 }
204209 }
205210
206211 __syncthreads ();
207212
208- if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockRowSize )) {
209-
210- prod[vectorIndex (row, threadNo)] = tempBuffer[vectorIndex (localRow, threadNo)];
213+ if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockSize )) {
214+
215+ /* Write the result to the product vector.*/
216+ if (threadNo < matrixParam.blockRowSize ) prod[vectorIndex (row, threadNo)] = tempBuffer[matrixParam.shrdMemIndex (localRow, threadNo)];
217+
218+ /* Reinitialize the memory to hold the matrix diagonal elements now.*/
219+ tempBuffer[matrixParam.shrdMemIndex (localRow, threadNo)] = matrix[matrixIndex (row, threadNo, d_dia_ptr)];
211220 }
212221
213222 __syncthreads ();
214223
215224 if (matrixParam.validParallelAccess (rowInPartition, threadNo, matrixParam.blockSize )) {
216225
217- tempBuffer[vectorIndex ( localRow, threadNo) ] = matrix[(d_dia_ptr[row] * matrixParam.blockSize ) + threadNo];
218- GaussEliminationKernel (&tempBuffer[vectorIndex ( localRow, threadNo) ], & prod[ vectorIndex ( row, threadNo/matrixParam. blockColSize )], threadNo , matrixParam);
226+ tempBuffer[localRow * matrixParam. blockSize + threadNo] = matrix[(d_dia_ptr[row] * matrixParam.blockSize ) + threadNo];
227+ GaussEliminationKernel (&tempBuffer[localRow * matrixParam. blockSize ], prod, row, threadNo, rowInPartition , matrixParam);
219228 }
220229
221230 __syncthreads ();
@@ -294,7 +303,7 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
294303 unsigned int gridx = rounded_up_division (MVP_BLOCK_SIZE, matrixParam.totalRows * MVP_WARP_SIZE);
295304 dim3 gridDim (gridx, 1 , 1 );
296305
297- MatrixVectorProductKernel<<<gridDim , blockDim >>> (d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
306+ MatrixVectorProductKernel<<<gridDim , blockDim , matrixParam.rowsPerBlock * matrixParam.blockRowSize * sizeof (ScalarType) >>> (d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
298307 gpuErrChk ( cudaPeekAtLastError () );
299308
300309 prod.DtHTransfer ();
0 commit comments