Fixed the issue with the visibility of the rowsPerBlock variable. Also edited the nvcc flags to match machine-specific architecture

areenraj · areenraj · commit 52b90b6787bd · 2025-07-17T19:26:28.000+05:30
diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp
@@ -632,6 +632,8 @@ class CConfig {
   unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
   unsigned long Linear_Solver_Prec_Threads;      /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
   unsigned short Linear_Solver_ILU_n;            /*!< \brief ILU fill=in level. */
+  unsigned short Cuda_Block_Size;                /*!< \brief  User-specified value for the X-Axis dimension of thread blocks
+                                                              that are deployed by the CUDA Kernels. */
   su2double SemiSpan;                   /*!< \brief Wing Semi span. */
   su2double Roe_Kappa;                  /*!< \brief Relaxation of the Roe scheme. */
   su2double Relaxation_Factor_Adjoint;  /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */
@@ -4204,6 +4206,18 @@ class CConfig {
    */
   su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }
 
+  /*!
+   * \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels.
+   * \return Thread block dimensions (X-axis) being used by the CUDA Kernels.
+   */
+  unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }
+
+  /*!
+   * \brief Get the number of matrix rows assigned per CUDA block (CUDA block size divided by the warp size, rounded up).
+   * \return The number of matrix rows assigned per CUDA block.
+   */
+  unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }
+
   /*!
    * \brief Get the relaxation factor for solution updates of adjoint solvers.
    */
diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp
@@ -28,13 +28,11 @@
 
 #pragma once
 
-#include "../CConfig.hpp"
 #include "../geometry/CGeometry.hpp"
-#include "../geometry/dual_grid/CPoint.hpp"
 
 /*!
  * \class CGraphPartitioning
- * \brief Abstract base class for defining graph partitioning algorithms
+ * \brief Abstract base class for defining graph partitioning algorithms.
  * \author A. Raj
  *
  * In order to use certain parallel algorithms in the solution process -
@@ -55,7 +53,7 @@ class CGraphPartitioning {
  public:
   virtual ~CGraphPartitioning() = 0;
   virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
-                         vector<ScalarType>& chainPtr) = 0;
+                         vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
 };
 template <class ScalarType>
 CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
@@ -89,9 +87,9 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
    * \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
    * \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
    * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
-   * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per block.
+   * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per CUDA block.
    */
-  void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, int rowsPerBlock) {
+  void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
     ScalarType levelWidth = 0;
     unsigned short chainLength = chainPtr.capacity();
 
@@ -135,9 +133,10 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
    * \param[in] pointList - Ordered array that contains the list  of all mesh points.
    * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
    * \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
+   * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per CUDA block.
    */
-  void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets,
-                 vector<ScalarType>& chainPtr) override {
+  void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
+                 unsigned short rowsPerBlock) override {
     vector<ScalarType> inversePointList;
     inversePointList.reserve(nPointDomain);
     levels.reserve(nPointDomain);
@@ -179,10 +178,6 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
 
     Reorder(pointList, inversePointList, levelOffsets);
 
-#ifdef HAVE_CUDA
-    CalculateChain(levelOffsets, chainPtr, 20);
-#elif
-    chainPtr = NULL;
-#endif
+    CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
   }
 };
diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
@@ -25,33 +25,21 @@
 * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
 */
 
+#pragma once
+
 #include<cuda_runtime.h>
 #include<iostream>
+#include "../option_structure.hpp"
 
-/*!< \brief Namespace that contains variables and helper functions that are
-    utilized to launch CUDA Kernels. */
-namespace kernelParameters{
-
-
-  /*!
-   * \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases).
-   */
-  inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
-
-  /*!
-   * \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases).
-   */
-  inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }
-
-  static constexpr short BLOCK_SIZE = 640;
-  static constexpr short WARP_SIZE = 32;
-  static constexpr short ROWS_PER_BLOCK = rounded_up_division(WARP_SIZE, BLOCK_SIZE);
-
-
-};
-
-/*!< \brief Structure containing information related to the Jacobian Matrix
-    which is utilized by any launched Kernel. */
+/*!
+ * \struct matrixParameters
+ * \brief Structure containing information related to the Jacobian Matrix which is utilized by any launched Kernel.
+ *
+ *  This implementation alleviates the need to pass an excessive number of arguments
+ *  to a Kernel and, instead, packages it into a single structure. While this leads
+ *  to data duplication for a short period of time, this is a much cleaner and more reusable approach.
+ * \author A. Raj
+ */
 struct matrixParameters{
 
   public:
@@ -63,29 +51,32 @@ struct matrixParameters{
     unsigned short blockSize;       /*!< \brief Contains the total number of elements in each block of the Jacbian Matrix. */
     unsigned short activeThreads;   /*!< \brief Cotains the number of active threads per iteration during MVP - depending on the
                                         dimensions of the Jacbian Matrix. */
+    unsigned short rowsPerBlock;     /*!< \brief Number of rows being processed by each thread block. This is equal to the number
+                                        of warps present in the block as each row gets assigned a warp. */
 
-    matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions){
+    matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions, unsigned short rowsPrBlck){
       totalRows = nPointDomain;
       blockRowSize = nEqn;
       blockColSize = nVar;
       nChainStart = 0;
       nChainEnd = 0;
       blockSize = nVar * nEqn;
-      activeThreads = nVar * (kernelParameters::WARP_SIZE/nVar);
+      activeThreads = nVar * (cudaKernelParameters::CUDA_WARP_SIZE/nVar);
+      rowsPerBlock = rowsPrBlck;
     }
 
     /*!
     * \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels.
     */
-    __device__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
+    __device__ __forceinline__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
       return (localRow * blockSize + threadNo);
     }
 
     /*!
     * \brief Returns a boolean value to check whether the row is under the total number of rows and if the
     *        thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
     */
-    __device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
+    __device__ __forceinline__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
       return (row<totalRows && threadNo<threadLimit);
     }
 
@@ -94,7 +85,7 @@ struct matrixParameters{
     *        thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
     * \param[in] rowInPartition - Represents a boolean that indicates the presence/absence of the row in the partition.
     */
-    __device__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
+    __device__ __forceinline__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
       return (rowInPartition && threadNo<threadLimit);
     }
 
diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp
@@ -71,6 +71,26 @@ enum class SU2_COMPONENT {
   SU2_SOL  /*!< \brief Running the SU2_SOL software. */
 };
 
+/*!
+ * \namespace cudaKernelParameters
+ * \brief Namespace that contains variables and helper functions that are utilized to calculate CUDA Kernel parameters.
+ * \author A. Raj
+ */
+namespace cudaKernelParameters{
+
+  /*!
+   * \brief Integer ceiling of dividend / divisor for a non-negative dividend and positive divisor (note: the divisor is the FIRST argument).
+   */
+  inline unsigned int rounded_up_division(int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
+
+  /*!
+   * \brief Integer floor of dividend / divisor for a non-negative dividend and positive divisor (note: the divisor is the FIRST argument).
+   */
+  inline unsigned int rounded_down_division(int divisor, int dividend) { return (dividend / divisor); }
+
+  static constexpr short CUDA_WARP_SIZE = 32;  /*!< \brief Number of threads per warp on a CUDA GPU. */
+}  // namespace cudaKernelParameters
+
 const unsigned int EXIT_DIVERGENCE = 2;   /*!< \brief Exit code (divergence). */
 
 const unsigned int MAX_PARAMETERS = 10;       /*!< \brief Maximum number of parameters for a design variable definition. */
diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp
@@ -1850,7 +1850,7 @@ void CConfig::SetConfig_Options() {
   /*--- Options related to the linear solvers ---*/
 
   /*!\brief GRAPH_PARTIONING
-   *  \n DESCRIPTION: Algorithm for partioning the matrix graph to facilitate parallel execution of inear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/
+   *  \n DESCRIPTION: Algorithm for partitioning the matrix graph to facilitate parallel execution of linear algebra subroutines\n OPTIONS: see \link Graph_Part_Map \endlink \n DEFAULT: LEVEL_SCHEDULING \ingroup Config*/
   addEnumOption("GRAPH_PART_ALGORITHM", Kind_Graph_Part_Algo, Graph_Part_Map, LEVEL_SCHEDULING);
   /*!\brief LINEAR_SOLVER
    *  \n DESCRIPTION: Linear solver for the implicit, mesh deformation, or discrete adjoint systems \n OPTIONS: see \link Linear_Solver_Map \endlink \n DEFAULT: FGMRES \ingroup Config*/
@@ -1890,7 +1890,8 @@ void CConfig::SetConfig_Options() {
   addEnumOption("DISCADJ_LIN_SOLVER", Kind_DiscAdj_Linear_Solver, Linear_Solver_Map, FGMRES);
   /* DESCRIPTION: Preconditioner for the discrete adjoint Krylov linear solvers */
   addEnumOption("DISCADJ_LIN_PREC", Kind_DiscAdj_Linear_Prec, Linear_Solver_Prec_Map, ILU);
-  /* DESCRIPTION: Linear solver for the discete adjoint systems */
+  /* DESCRIPTION: Thread block size for CUDA GPUs */
+  addUnsignedShortOption("CUDA_BLOCK_SIZE", Cuda_Block_Size, 1024);
 
   /*!\par CONFIG_CATEGORY: Convergence\ingroup Config*/
   /*--- Options related to convergence ---*/
diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp
@@ -708,7 +708,7 @@ void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector<ScalarType>
   switch (KindAlgorithm) {
     case LEVEL_SCHEDULING:
       auto levelSchedule = CLevelScheduling<ScalarType>(nPointDomain, nodes);
-      levelSchedule.Partition(pointList, partitionOffsets, chainPtr);
+      levelSchedule.Partition(pointList, partitionOffsets, chainPtr, config->GetRows_Per_Cuda_Block());
       nPartition = levelSchedule.nLevels;
       maxPartitionSize = levelSchedule.maxLevelWidth;
       break;
diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu
@@ -1,6 +1,15 @@
 /*!
  * \file CSysMatrixGPU.cu
  * \brief Implementations of Kernels and Functions for Matrix Operations on the GPU
+ *
+ * The kernel implementations feature a lot of if-statements.
+ * The reason for such heavy usage of conditionals is to check
+ * whether the memory locations being accessed by the threads are
+ * within bounds or not. Usually the entire kernel is "wrapped" in
+ * a single conditional for these checks. But, in our case, it is
+ * necessary to use intermittent synchronization barriers like
+ * __syncthreads(), which must be reached by every thread of a block
+ * and may hang or misbehave if placed inside a divergent conditional.
  * \author A. Raj
  * \version 8.2.0 "Harrier"
  *
@@ -29,8 +38,7 @@
 #include "../../include/geometry/CGeometry.hpp"
 #include "../../include/linear_algebra/GPUComms.cuh"
 
-using namespace kernelParameters;
-
+using namespace cudaKernelParameters;
 
 template<typename matrixType, typename vectorType>
 __device__ void DeviceGaussElimination(matrixType* matrixCopy, vectorType* prod, unsigned long row, unsigned int threadNo, bool rowInPartition, matrixParameters matrixParam)
@@ -96,9 +104,9 @@ __global__ void FirstSymmetricIterationKernel(matrixType* matrix, vectorType* ve
       return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
    };
 
-   unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE;
-   unsigned short localRow = origRow % ROWS_PER_BLOCK;
-   unsigned short threadNo = threadIdx.x % WARP_SIZE;
+   unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE;
+   unsigned short localRow = origRow % matrixParam.rowsPerBlock;
+   unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
 
    unsigned short blockCol = threadNo % matrixParam.blockColSize;
 
@@ -167,9 +175,9 @@ __global__ void SecondSymmetricIterationKernel(matrixType* matrix, vectorType* p
       return (d_col_ind[blockNo] * matrixParam.blockColSize + blockCol);
    };
 
-   unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE;
-   unsigned short localRow = origRow % ROWS_PER_BLOCK;
-   unsigned short threadNo = threadIdx.x % WARP_SIZE;
+   unsigned long origRow = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE;
+   unsigned short localRow = origRow % matrixParam.rowsPerBlock;
+   unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
 
    unsigned short blockCol = threadNo % matrixParam.blockColSize;
 
@@ -238,9 +246,9 @@ __global__ void MatrixVectorProductKernel(matrixType* matrix, vectorType* vec, v
       return (row * matrixParam.blockRowSize + elemNo);
    };
 
-   unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/WARP_SIZE;
-   unsigned short threadNo = threadIdx.x % WARP_SIZE;
-   unsigned short localRow = row % ROWS_PER_BLOCK;
+   unsigned long row = (blockIdx.x * blockDim.x + threadIdx.x)/CUDA_WARP_SIZE;
+   unsigned short threadNo = threadIdx.x % CUDA_WARP_SIZE;
+   unsigned short localRow = row % matrixParam.rowsPerBlock;
 
    unsigned short blockCol = threadNo % matrixParam.blockColSize;
 
@@ -292,13 +300,13 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
    vec.HtDTransfer();
    prod.GPUSetVal(0.0);
 
-   matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition);
+   matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block());
 
-  dim3 blockDim(BLOCK_SIZE,1,1);
-  unsigned int gridx = rounded_up_division(BLOCK_SIZE, matrixParam.totalRows * WARP_SIZE);
+  dim3 blockDim(config->GetCuda_Block_Size(),1,1);
+  unsigned int gridx = rounded_up_division(config->GetCuda_Block_Size(), matrixParam.totalRows * CUDA_WARP_SIZE);
   dim3 gridDim(gridx, 1, 1);
 
-  MatrixVectorProductKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockRowSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
+  MatrixVectorProductKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockRowSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, matrixParam);
   gpuErrChk( cudaPeekAtLastError() );
 
   prod.DtHTransfer();
@@ -316,18 +324,18 @@ void CSysMatrix<ScalarType>::GPUComputeLU_SGSPreconditioner(const CSysVector<Sca
       vec.HtDTransfer();
       prod.HtDTransfer();
 
-      matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition);
+      matrixParameters matrixParam(nPointDomain, nEqn, nVar, geometry->nPartition, config->GetRows_Per_Cuda_Block());
 
-      dim3 blockDim(ROWS_PER_BLOCK * WARP_SIZE,1,1);
-      unsigned int gridx = rounded_up_division(ROWS_PER_BLOCK, geometry->maxPartitionSize);
+      dim3 blockDim(matrixParam.rowsPerBlock * CUDA_WARP_SIZE,1,1);
+      unsigned int gridx = rounded_up_division(matrixParam.rowsPerBlock, geometry->maxPartitionSize);
       dim3 gridDim(gridx, 1, 1);
 
       for(auto elem = geometry->chainPtr.begin(); elem != geometry->chainPtr.end() - 1; elem++)
       {
          matrixParam.nChainStart = *(elem);
          matrixParam.nChainEnd = *(elem + 1);
 
-         FirstSymmetricIterationKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
+         FirstSymmetricIterationKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_vec, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
          gpuErrChk( cudaPeekAtLastError() );
       }
 
@@ -336,7 +344,7 @@ void CSysMatrix<ScalarType>::GPUComputeLU_SGSPreconditioner(const CSysVector<Sca
          matrixParam.nChainStart = *(elem);
          matrixParam.nChainEnd = *(elem + 1);
 
-         SecondSymmetricIterationKernel<<<gridDim, blockDim, ROWS_PER_BLOCK * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
+         SecondSymmetricIterationKernel<<<gridDim, blockDim, matrixParam.rowsPerBlock * matrixParam.blockSize * sizeof(ScalarType)>>>(d_matrix, d_prod, d_partition_offsets, d_row_ptr, d_col_ind, d_dia_ptr, matrixParam);
          gpuErrChk( cudaPeekAtLastError() );
       }
 
diff --git a/meson.build b/meson.build
@@ -18,7 +18,7 @@ python = pymod.find_installation()
 
 if get_option('enable-cuda')
   add_languages('cuda')
-  add_global_arguments('-arch=sm_86', language : 'cuda')
+  add_global_arguments('-arch=native', language : 'cuda')
 endif
 
 su2_cpp_args = []