 /*!
  * \file CSysMatrixGPU.cu
  * \brief Implementations of kernels and functions for matrix operations on the GPU
+ *
+ * The kernel implementations contain many if-statements. These
+ * conditionals check whether the memory locations accessed by the
+ * threads are within bounds. Usually an entire kernel is wrapped in
+ * a single conditional for such checks, but these kernels need
+ * intermittent synchronization barriers like __syncthreads(), which
+ * cause thread-divergence problems when placed inside a conditional.
  * \author A. Raj
  * \version 8.2.0 "Harrier"
  *
 #include "../../include/geometry/CGeometry.hpp"
 #include "../../include/linear_algebra/GPUComms.cuh"
 
-using namespace kernelParameters;
-
+using namespace cudaKernelParameters;
 
 template <typename matrixType, typename vectorType>
 __device__ void DeviceGaussElimination(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam)
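
The file header above explains why bounds checks are scattered through the kernel bodies instead of one early-exit guard around each kernel: `__syncthreads()` must be reached by every thread of a block. Below is a minimal sketch of that pattern; the kernel and array names are illustrative, not SU2's actual code.

```cuda
// Sketch of the guard pattern described in the file header (assumed names).
template <typename T>
__global__ void GuardedKernel(const T* in, T* out, unsigned long nRows) {
  extern __shared__ char sharedBuf[];
  T* scratch = reinterpret_cast<T*>(sharedBuf);

  unsigned long row = blockIdx.x * blockDim.x + threadIdx.x;
  bool inBounds = (row < nRows);

  // Guard each memory access individually...
  if (inBounds) scratch[threadIdx.x] = in[row];

  // ...so the barrier stays outside any conditional and is reached by
  // every thread of the block, whether its row is in bounds or not.
  __syncthreads();

  if (inBounds) out[row] = scratch[threadIdx.x];
}
```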
@@ -96,9 +104,9 @@ __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* ve
   return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
 };
 
-  unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
-  unsigned short localRow = origRow % ROWS_PER_BLOCK;
-  unsigned short threadNo = threadIdx.x % WARP_SIZE;
+  unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x) / CUDA_WARP_SIZE;
+  unsigned short localRow = origRow % matrixParam.rowsPerBlock;
+  unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
 
   unsigned short blockCol = threadNo % matrixParam.blockColSize;
 
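
This indexing assigns one warp of CUDA_WARP_SIZE threads to each matrix row, with rowsPerBlock (now a runtime value carried in matrixParam rather than the compile-time ROWS_PER_BLOCK) warps resident per thread block. A host-side check of the arithmetic, with CUDA_WARP_SIZE = 32 and rowsPerBlock = 8 assumed for illustration:

```cuda
#include <cassert>

// Sanity check of the warp-per-row index arithmetic used in the kernels.
// The values 32 and 8 are assumed, not taken from the PR.
int main() {
  const unsigned int warpSize = 32, rowsPerBlock = 8;
  const unsigned int blockDimX = rowsPerBlock * warpSize;  // 256 threads

  const unsigned int blockIdxX = 0, threadIdxX = 70;
  unsigned long origRow = (blockIdxX * blockDimX + threadIdxX) / warpSize;
  unsigned short localRow = origRow % rowsPerBlock;
  unsigned short threadNo = threadIdxX % warpSize;

  // Thread 70 is lane 6 of the warp that owns (local) row 2.
  assert(origRow == 2 && localRow == 2 && threadNo == 6);
  return 0;
}
```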
@@ -167,9 +175,9 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p
   return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
 };
 
-  unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
-  unsigned short localRow = origRow % ROWS_PER_BLOCK;
-  unsigned short threadNo = threadIdx.x % WARP_SIZE;
+  unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x) / CUDA_WARP_SIZE;
+  unsigned short localRow = origRow % matrixParam.rowsPerBlock;
+  unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
 
   unsigned short blockCol = threadNo % matrixParam.blockColSize;
 
@@ -238,9 +246,9 @@ __global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, v
   return (row * matrixParam.blockRowSize + elemNo);
 };
 
-  unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x) / WARP_SIZE;
-  unsigned short threadNo = threadIdx.x % WARP_SIZE;
-  unsigned short localRow = row % ROWS_PER_BLOCK;
+  unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x) / CUDA_WARP_SIZE;
+  unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
+  unsigned short localRow = row % matrixParam.rowsPerBlock;
 
   unsigned short blockCol = threadNo % matrixParam.blockColSize;
 
@@ -292,13 +300,13 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
   vec.HtDTransfer();
   prod.GPUSetVal(0.0);
 
-  matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition);
+  matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block());
 
-  dim3 blockDim(BLOCK_SIZE, 1, 1);
-  unsigned int gridx = rounded_up_division(BLOCK_SIZE, matrixParam.totalRows * WARP_SIZE);
+  dim3 blockDim(config->GetCuda_Block_Size(), 1, 1);
+  unsigned int gridx = rounded_up_division(config->GetCuda_Block_Size(), matrixParam.totalRows * CUDA_WARP_SIZE);
   dim3 gridDim(gridx, 1, 1);
 
-  MatrixVectorProductKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockRowSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
+  MatrixVectorProductKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockRowSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
   gpuErrChk(cudaPeekAtLastError());
 
   prod.DtHTransfer();
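
Here the compile-time BLOCK_SIZE and ROWS_PER_BLOCK give way to the runtime values config->GetCuda_Block_Size() and config->GetRows_Per_Cuda_Block(), and the third launch parameter sizes the kernel's dynamic shared memory. A standalone sketch of the same launch arithmetic; rounded_up_division is recreated here as ceiling division with the divisor first, and all numeric values are illustrative:

```cuda
#include <cstddef>

// Assumed equivalent of SU2's helper: ceiling division, divisor first.
static unsigned int rounded_up_division(unsigned int divisor, unsigned long dividend) {
  return static_cast<unsigned int>((dividend + divisor - 1) / divisor);
}

void launchConfigSketch() {
  const unsigned int cudaBlockSize = 1024;  // stand-in for config->GetCuda_Block_Size()
  const unsigned int rowsPerBlock = 32;     // stand-in for config->GetRows_Per_Cuda_Block()
  const unsigned long totalRows = 100000;   // stand-in for matrixParam.totalRows
  const unsigned int blockRowSize = 5;      // e.g. one row of a 5x5 matrix block

  dim3 blockDim(cudaBlockSize, 1, 1);
  // One warp per row, so the grid must cover totalRows * warpSize threads.
  dim3 gridDim(rounded_up_division(cudaBlockSize, totalRows * 32ul), 1, 1);
  // Dynamic shared memory: one scratch row of blockRowSize entries for each
  // row resident in the block, matching the third launch parameter above.
  size_t sharedBytes = rowsPerBlock * blockRowSize * sizeof(double);
  (void)blockDim; (void)gridDim; (void)sharedBytes;
}
```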
@@ -316,18 +324,18 @@ void CSysMatrix<ScalarType>::GPUComputeLU_SGSPreconditioner(const CSysVector<Sca
   vec.HtDTransfer();
   prod.HtDTransfer();
 
-  matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition);
+  matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block());
 
-  dim3 blockDim(ROWS_PER_BLOCK * WARP_SIZE, 1, 1);
-  unsigned int gridx = rounded_up_division(ROWS_PER_BLOCK, geometry->maxPartitionSize);
+  dim3 blockDim(matrixParam.rowsPerBlock * CUDA_WARP_SIZE, 1, 1);
+  unsigned int gridx = rounded_up_division(matrixParam.rowsPerBlock, geometry->maxPartitionSize);
   dim3 gridDim(gridx, 1, 1);
 
   for (auto elem = geometry->chainPtr.begin(); elem != geometry->chainPtr.end() - 1; elem++)
   {
     matrixParam.nChainStart = *(elem);
     matrixParam.nChainEnd = *(elem + 1);
 
-    FirstSymmetricIterationKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
+    FirstSymmetricIterationKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
     gpuErrChk(cudaPeekAtLastError());
   }
 
@@ -336,7 +344,7 @@ void CSysMatrix<ScalarType>::GPUComputeLU_SGSPreconditioner(const CSysVector<Sca
     matrixParam.nChainStart = *(elem);
     matrixParam.nChainEnd = *(elem + 1);
 
-    SecondSymmetricIterationKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
+    SecondSymmetricIterationKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
     gpuErrChk(cudaPeekAtLastError());
   }
 
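
Both sweeps walk geometry->chainPtr, whose consecutive entries appear to delimit groups of partitions that can be processed concurrently; each loop iteration launches one kernel over the half-open range [nChainStart, nChainEnd). A host-side schematic of that traversal, with the kernel launch elided and chainPtr's contents assumed rather than taken from the PR:

```cuda
#include <vector>

// Schematic of the chain traversal in GPUComputeLU_SGSPreconditioner.
// Each consecutive pair of chainPtr entries is assumed to bound one
// dependency level whose partitions are mutually independent, so a
// single kernel launch can cover the whole level.
void chainSweepSketch(const std::vector<unsigned long>& chainPtr) {
  for (auto elem = chainPtr.begin(); elem != chainPtr.end() - 1; elem++) {
    unsigned long nChainStart = *elem;
    unsigned long nChainEnd = *(elem + 1);
    // FirstSymmetricIterationKernel<<<gridDim, blockDim, sharedBytes>>>(...)
    // would run here on partitions [nChainStart, nChainEnd).
    (void)nChainStart;
    (void)nChainEnd;
  }
}
```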