Skip to content

Commit 52b90b6

Browse files
committed
Fixed the issue with the visibility of the rowsPerBlock variable. Also edited the nvcc flags to match machine-specific architecture
1 parent 0372099 commit 52b90b6

File tree

8 files changed

+95
-66
lines changed

8 files changed

+95
-66
lines changed

Common/include/CConfig.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -632,6 +632,8 @@ class CConfig {
632632
unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
633633
unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
634634
unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill=in level. */
635+
unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks
636+
that are deployed by the CUDA Kernels. */
635637
su2double SemiSpan; /*!< \brief Wing Semi span. */
636638
su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */
637639
su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */
@@ -4204,6 +4206,18 @@ class CConfig {
42044206
*/
42054207
su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }
42064208

4209+
/*!
4210+
* \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels.
4211+
* \return Thread block dimensions (X-axis) being used by the CUDA Kernels.
4212+
*/
4213+
unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }
4214+
4215+
/*!
4216+
* \brief Get the number of matrix rows assigned per CUDA Block.
4217+
* \return The number of matrix rows assigned per CUDA Block.
4218+
*/
4219+
unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }
4220+
42074221
/*!
42084222
* \brief Get the relaxation factor for solution updates of adjoint solvers.
42094223
*/

Common/include/linear_algebra/CGraphPartitioning.hpp

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,11 @@
2828

2929
#pragma once
3030

31-
#include "../CConfig.hpp"
3231
#include "../geometry/CGeometry.hpp"
33-
#include "../geometry/dual_grid/CPoint.hpp"
3432

3533
/*!
3634
* \class CGraphPartitioning
37-
* \brief Abstract base class for defining graph partitioning algorithms
35+
* \brief Abstract base class for defining graph partitioning algorithms.
3836
* \author A. Raj
3937
*
4038
* In order to use certain parallel algorithms in the solution process -
@@ -55,7 +53,7 @@ class CGraphPartitioning {
5553
public:
5654
virtual ~CGraphPartitioning() = 0;
5755
virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
58-
vector<ScalarType>& chainPtr) = 0;
56+
vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
5957
};
6058
template <class ScalarType>
6159
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
@@ -89,9 +87,9 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
8987
* \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
9088
* \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
9189
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
92-
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per block.
90+
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
9391
*/
94-
void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, int rowsPerBlock) {
92+
void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
9593
ScalarType levelWidth = 0;
9694
unsigned short chainLength = chainPtr.capacity();
9795

@@ -135,9 +133,10 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
135133
* \param[in] pointList - Ordered array that contains the list of all mesh points.
136134
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
137135
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
136+
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
138137
*/
139-
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets,
140-
vector<ScalarType>& chainPtr) override {
138+
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
139+
unsigned short rowsPerBlock) override {
141140
vector<ScalarType> inversePointList;
142141
inversePointList.reserve(nPointDomain);
143142
levels.reserve(nPointDomain);
@@ -179,10 +178,6 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
179178

180179
Reorder(pointList, inversePointList, levelOffsets);
181180

182-
#ifdef HAVE_CUDA
183-
CalculateChain(levelOffsets, chainPtr, 20);
184-
#elif
185-
chainPtr = NULL;
186-
#endif
181+
CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
187182
}
188183
};

Common/include/linear_algebra/GPUComms.cuh

Lines changed: 20 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -25,33 +25,21 @@
2525
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
2626
*/
2727

28+
#pragma once
29+
2830
#include<cuda_runtime.h>
2931
#include<iostream>
32+
#include "../option_structure.hpp"
3033

31-
/*!< \brief Namespace that contains variables and helper functions that are
32-
utilized to launch CUDA Kernels. */
33-
namespace kernelParameters{
34-
35-
36-
/*!
37-
* \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases).
38-
*/
39-
inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
40-
41-
/*!
42-
* \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases).
43-
*/
44-
inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }
45-
46-
static constexpr short BLOCK_SIZE = 640;
47-
static constexpr short WARP_SIZE = 32;
48-
static constexpr short ROWS_PER_BLOCK = rounded_up_division(WARP_SIZE, BLOCK_SIZE);
49-
50-
51-
};
52-
53-
/*!< \brief Structure containing information related to the Jacobian Matrix
54-
which is utilized by any launched Kernel. */
34+
/*!
35+
* \struct matrixParameters
36+
* \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel.
37+
*
38+
* This implementation alleviates the need to pass an excessive number of arguments
39+
* to a Kernel and, instead, packages it into a single structure. While this leads
40+
* to data duplication for a short period of time, this is a much cleaner and resuable approach.
41+
* \author A. Raj
42+
*/
5543
struct matrixParameters{
5644

5745
public:
@@ -63,29 +51,32 @@ struct matrixParameters{
6351
unsigned short blockSize; /*!< \brief Contains the total number of elements in each block of the Jacbian Matrix. */
6452
unsigned short activeThreads; /*!< \brief Cotains the number of active threads per iteration during MVP - depending on the
6553
dimensions of the Jacbian Matrix. */
54+
unsigned short rowsPerBlock; /*!< \brief Number of rows being processed by each thread block. This is equal to the number
55+
of warps present in the block as each row gets assigned a warp. */
6656

67-
matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions){
57+
matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
6858
totalRows = nPointDomain;
6959
blockRowSize = nEqn;
7060
blockColSize = nVar;
7161
nChainStart = 0;
7262
nChainEnd = 0;
7363
blockSize = nVar * nEqn;
74-
activeThreads = nVar * (kernelParameters::WARP_SIZE/nVar);
64+
activeThreads = nVar * (cudaKernelParameters::CUDA_WARP_SIZE/nVar);
65+
rowsPerBlock = rowsPrBlck;
7566
}
7667

7768
/*!
7869
* \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels.
7970
*/
80-
__device__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
71+
__device__ __forceinline__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
8172
return (localRow * blockSize + threadNo);
8273
}
8374

8475
/*!
8576
* \brief Returns a boolean value to check whether the row is under the total number of rows and if the
8677
* thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
8778
*/
88-
__device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
79+
__device__ __forceinline__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
8980
return (row<totalRows && threadNo<threadLimit);
9081
}
9182

@@ -94,7 +85,7 @@ struct matrixParameters{
9485
* thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
9586
* \param[in] rowInPartition - Represents a boolean that indicates the presence/absence of the row in the partition.
9687
*/
97-
__device__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
88+
__device__ __forceinline__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
9889
return (rowInPartition && threadNo<threadLimit);
9990
}
10091

Common/include/option_structure.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,26 @@ enum class SU2_COMPONENT {
7171
SU2_SOL /*!< \brief Running the SU2_SOL software. */
7272
};
7373

74+
/*!
75+
* \namespace cudaKernelParameters
76+
* \brief Namespace that contains variables and helper functions that are utilized to calculate CUDA Kernel parameters.
77+
* \author A. Raj
78+
*/
79+
namespace cudaKernelParameters{
80+
81+
/*!
82+
* \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases).
83+
*/
84+
inline unsigned int rounded_up_division(int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
85+
86+
/*!
87+
* \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases).
88+
*/
89+
inline unsigned int rounded_down_division(int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }
90+
91+
static constexpr short CUDA_WARP_SIZE = 32; /*!< \brief Outlines the numbers of threads per warp for a CUDA GPU. */
92+
};
93+
7494
const unsigned int EXIT_DIVERGENCE = 2; /*!< \brief Exit code (divergence). */
7595

7696
const unsigned int MAX_PARAMETERS = 10; /*!< \brief Maximum number of parameters for a design variable definition. */

Common/src/CConfig.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1850,7 +1850,7 @@ void CConfig::SetConfig_Options() {
18501850
/*--- Options related to the linear solvers ---*/
18511851

18521852
/*!\brief GRAPH_PARTIONING
1853-
* \n DESCRIPTION: Algorithm for partioning the matrix graph to facilitate parallel execution of inear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/
1853+
* \n DESCRIPTION: Algorithm for partioning the matrix graph to facilitate parallel execution of linear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/
18541854
addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING);
18551855
/*!\brief LINEAR_SOLVER
18561856
* \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/
@@ -1890,7 +1890,8 @@ void CConfig::SetConfig_Options() {
18901890
addEnumOption("DISCADJ_LIN_SOLVER", Kind_DiscAdj_Linear_Solver, Linear_Solver_Map, FGMRES);
18911891
/* DESCRIPTION: Preconditioner for the discrete adjoint Krylov linear solvers */
18921892
addEnumOption("DISCADJ_LIN_PREC", Kind_DiscAdj_Linear_Prec, Linear_Solver_Prec_Map, ILU);
1893-
/* DESCRIPTION: Linear solver for the discete adjoint systems */
1893+
/* DESCRIPTION: Thread block size for CUDA GPUs */
1894+
addUnsignedShortOption("CUDA_BLOCK_SIZE", Cuda_Block_Size, 1024);
18941895

18951896
/*!\par CONFIG_CATEGORY: Convergence\ingroup Config*/
18961897
/*--- Options related to convergence ---*/

Common/src/geometry/CPhysicalGeometry.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector<ScalarType>
708708
switch (KindAlgorithm) {
709709
case LEVEL_SCHEDULING:
710710
auto levelSchedule = CLevelScheduling<ScalarType>(nPointDomain, nodes);
711-
levelSchedule.Partition(pointList, partitionOffsets, chainPtr);
711+
levelSchedule.Partition(pointList, partitionOffsets, chainPtr, config->GetRows_Per_Cuda_Block());
712712
nPartition = levelSchedule.nLevels;
713713
maxPartitionSize = levelSchedule.maxLevelWidth;
714714
break;

Common/src/linear_algebra/CSysMatrixGPU.cu

Lines changed: 28 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
/*!
22
* \file CSysMatrixGPU.cu
33
* \brief Implementations of Kernels and Functions for Matrix Operations on the GPU
4+
*
5+
* The kernel implementations will feature a lot of if-statements.
6+
* The reason for such heavy usage of conditionals is to do a check
7+
* whether the memory locations being accessed by the threads are
8+
* within bounds or not. Usually the entire kernel is "wrapped" in
9+
* a single conditional for these checks. But, in our case, it is
10+
* necessary for us to use intermittent synchronization barriers like
11+
* __syncthreads() which will lead to thread divergence issues if used
12+
* inside a conditional.
413
* \author A. Raj
514
* \version 8.2.0 "Harrier"
615
*
@@ -29,8 +38,7 @@
2938
#include "../../include/geometry/CGeometry.hpp"
3039
#include "../../include/linear_algebra/GPUComms.cuh"
3140

32-
using namespace kernelParameters;
33-
41+
using namespace cudaKernelParameters;
3442

3543
template<typename matrixType, typename vectorType>
3644
__device__ void DeviceGaussElimination(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam)
@@ -96,9 +104,9 @@ __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* ve
96104
return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
97105
};
98106

99-
unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE;
100-
unsigned short localRow = origRow % ROWS_PER_BLOCK;
101-
unsigned short threadNo = threadIdx.x % WARP_SIZE;
107+
unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE;
108+
unsigned short localRow = origRow % matrixParam.rowsPerBlock;
109+
unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
102110

103111
unsigned short blockCol = threadNo % matrixParam.blockColSize;
104112

@@ -167,9 +175,9 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p
167175
return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
168176
};
169177

170-
unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE;
171-
unsigned short localRow = origRow % ROWS_PER_BLOCK;
172-
unsigned short threadNo = threadIdx.x % WARP_SIZE;
178+
unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE;
179+
unsigned short localRow = origRow % matrixParam.rowsPerBlock;
180+
unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
173181

174182
unsigned short blockCol = threadNo % matrixParam.blockColSize;
175183

@@ -238,9 +246,9 @@ __global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, v
238246
return (row * matrixParam.blockRowSize + elemNo);
239247
};
240248

241-
unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE;
242-
unsigned short threadNo = threadIdx.x % WARP_SIZE;
243-
unsigned short localRow = row % ROWS_PER_BLOCK;
249+
unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE;
250+
unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
251+
unsigned short localRow = row % matrixParam.rowsPerBlock;
244252

245253
unsigned short blockCol = threadNo % matrixParam.blockColSize;
246254

@@ -292,13 +300,13 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
292300
vec.HtDTransfer();
293301
prod.GPUSetVal(0.0);
294302

295-
matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition);
303+
matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block());
296304

297-
dim3 blockDim(BLOCK_SIZE,1,1);
298-
unsigned int gridx = rounded_up_division(BLOCK_SIZE, matrixParam.totalRows * WARP_SIZE);
305+
dim3 blockDim(config->GetCuda_Block_Size(),1,1);
306+
unsigned int gridx = rounded_up_division(config->GetCuda_Block_Size(), matrixParam.totalRows * CUDA_WARP_SIZE);
299307
dim3 gridDim(gridx, 1, 1);
300308

301-
MatrixVectorProductKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockRowSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
309+
MatrixVectorProductKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockRowSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
302310
gpuErrChk( cudaPeekAtLastError() );
303311

304312
prod.DtHTransfer();
@@ -316,18 +324,18 @@ void CSysMatrix<ScalarType>::GPUComputeLU_SGSPreconditioner(const CSysVector<Sca
316324
vec.HtDTransfer();
317325
prod.HtDTransfer();
318326

319-
matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition);
327+
matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block());
320328

321-
dim3 blockDim(ROWS_PER_BLOCK * WARP_SIZE,1,1);
322-
unsigned int gridx = rounded_up_division(ROWS_PER_BLOCK, geometry->maxPartitionSize);
329+
dim3 blockDim(matrixParam.rowsPerBlock * CUDA_WARP_SIZE,1,1);
330+
unsigned int gridx = rounded_up_division(matrixParam.rowsPerBlock, geometry->maxPartitionSize);
323331
dim3 gridDim(gridx, 1, 1);
324332

325333
for(auto elem = geometry->chainPtr.begin(); elem != geometry->chainPtr.end() - 1; elem++)
326334
{
327335
matrixParam.nChainStart = *(elem);
328336
matrixParam.nChainEnd = *(elem + 1);
329337

330-
FirstSymmetricIterationKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
338+
FirstSymmetricIterationKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
331339
gpuErrChk( cudaPeekAtLastError() );
332340
}
333341

@@ -336,7 +344,7 @@ void CSysMatrix<ScalarType>::GPUComputeLU_SGSPreconditioner(const CSysVector<Sca
336344
matrixParam.nChainStart = *(elem);
337345
matrixParam.nChainEnd = *(elem + 1);
338346

339-
SecondSymmetricIterationKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
347+
SecondSymmetricIterationKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
340348
gpuErrChk( cudaPeekAtLastError() );
341349
}
342350

meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ python = pymod.find_installation()
1818

1919
if get_option('enable-cuda')
2020
add_languages('cuda')
21-
add_global_arguments('-arch=sm_86', language : 'cuda')
21+
add_global_arguments('-arch=native', language : 'cuda')
2222
endif
2323

2424
su2_cpp_args = []

0 commit comments

Comments
 (0)