Working GPU LU_SGS Preconditioner Port

areenraj · areenraj · commit 0372099a1ce0 · 2025-07-15T21:23:55.000+05:30
diff --git a/Common/include/geometry/CGeometry.hpp b/Common/include/geometry/CGeometry.hpp
@@ -260,11 +260,12 @@ class CGeometry {
   unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
                                                on all prior ranks in the linear partitioning. */
 
-  unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
-                               partitioning algorithms. */
+  unsigned long nPartition;       /*!< \brief Number of divisions of the matrix graph during execution of parallel
+                                     partitioning algorithms. */
   unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
   vector<unsigned long>
       partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
+  vector<unsigned long> chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */
 
   /*--- Data structures for point-to-point MPI communications. ---*/
 
diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp
@@ -152,9 +152,6 @@ class CPhysicalGeometry final : public CGeometry {
    * \brief Divide the graph produced by the matrix into parallel partitions.
    * \param[in] config - Definition of the particular problem.
    * \param[in] pointList - Ordered list of points in the mesh.
-   * \param[in] numPartitions - Returns the number of parallel partitions created by the algorithm.
-   * \param[in] indexOffsets - Vector array that represents the starting index of each partition in the reordered point
-   * list.
    */
   template <class ScalarType>
   void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);
diff --git a/Common/include/linear_algebra/CGraphPartitioning.hpp b/Common/include/linear_algebra/CGraphPartitioning.hpp
@@ -1,6 +1,6 @@
 /*!
  * \file CGraphPartitioning.hpp
- * \brief Headers for the classes realted to the algorithms that are used 
+ * \brief Headers for the classes related to the algorithms that are used
                 to divide the matrix acyclic graph into parallel partitions.
  * \author A. Raj
  * \version 8.2.0 "Harrier"
@@ -26,6 +26,8 @@
  * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
  */
 
+#pragma once
+
 #include "../CConfig.hpp"
 #include "../geometry/CGeometry.hpp"
 #include "../geometry/dual_grid/CPoint.hpp"
@@ -35,65 +37,107 @@
  * \brief Abstract base class for defining graph partitioning algorithms
  * \author A. Raj
  *
- * In order to use certain parallel algorithms in the solution process - 
- * whether with linear solvers or preconditioners - we require the matrix 
- * to be partitioned into certain parallel divisions. These maybe in the form 
- * of levels, blocks, colors and so on. Since a number of different algorithms 
- * can be used to split the graph, we've introduced a base class containing the 
- * "Partition" member function from which child classes of the specific 
- * algorithm can be derived. Currently, we are only using direct declarations 
+ * In order to use certain parallel algorithms in the solution process -
+ * whether with linear solvers or preconditioners - we require the matrix
+ * to be partitioned into certain parallel divisions. These may be in the form
+ * of levels, blocks, colors and so on. Since a number of different algorithms
+ * can be used to split the graph, we've introduced a base class containing the
+ * "Partition" member function from which child classes of the specific
+ * algorithm can be derived. Currently, we are only using direct declarations
  * of the derived classes in the code. However, this method was chosen as it
- * allows us to pass different child class algorithms to a single implementation 
+ * allows us to pass different child class algorithms to a single implementation
  * of the function that requires it - similar to the CMatrixVectorProduct class.
  */
 
 template <class ScalarType>
 
 class CGraphPartitioning {
-
  public:
   virtual ~CGraphPartitioning() = 0;
-  virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets) = 0;  
+  virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
+                         vector<ScalarType>& chainPtr) = 0;
 };
 template <class ScalarType>
-CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}  
+CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
 
 template <class ScalarType>
 
-class CLevelScheduling final : public CGraphPartitioning<ScalarType> { 
-
+class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
  private:
   ScalarType nPointDomain;
   CPoint* nodes;
-  
+
  public:
   ScalarType nLevels;
   ScalarType maxLevelWidth;
   vector<ScalarType> levels;
 
- /*!
+  /*!
    * \brief constructor of the class
    * \param[in] nPointDomain_ref - number of points associated with the problem
-   * \param[in] nodes - represents the relationships between the points
+   * \param[in] nodes_ref - represents the relationships between the points
    */
- inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref) 
-                                                        : nPointDomain(nPointDomain_ref), nodes(nodes_ref) 
-                                                        { nLevels = 0ul; maxLevelWidth = 0ul; }
+  inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
+      : nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
+    nLevels = 0ul;
+    maxLevelWidth = 0ul;
+  }
+
+  CLevelScheduling() = delete;  // Removing default constructor
 
- CLevelScheduling() = delete;   // Removing default constructor
+  /*!
+   * \brief Groups levels into chains (a level wider than rowsPerBlock is its own chain); updates maxLevelWidth.
+   * \param[in] levelOffsets - Ordered list of the starting row of each level (read up to index nLevels).
+   * \param[in,out] chainPtr - Vector array to which the ordered list of starting levels of each chain is appended.
+   * \param[in] rowsPerBlock - Represents the maximum number of rows that can be accommodated per block.
+   */
+  void CalculateChain(const vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr, int rowsPerBlock) {
+    /*--- Convert once so that the width comparison below is made between two ScalarType values. ---*/
+    const auto maxRowsPerBlock = static_cast<ScalarType>(rowsPerBlock);
 
- void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets)
- {
+    /*This is not a magic number. We are simply initializing
+    the point array with its first element that is always zero.*/
+    chainPtr.push_back(0);
+
+    for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
+      const ScalarType levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
+      maxLevelWidth = std::max(levelWidth, maxLevelWidth);
+
+      if (levelWidth > maxRowsPerBlock) {
+        if (chainPtr.back() != iLevel) {  // Close the chain of narrower levels collected before this one.
+          chainPtr.push_back(iLevel);
+        }
+
+        chainPtr.push_back(iLevel + 1);  // The wide level forms a chain of its own: [iLevel, iLevel + 1).
+      }
+    }
+
+    chainPtr.push_back(nLevels);
+  }
+
+  /*!
+   * \brief Reorders the points according to the levels
+   * \param[in] pointList - Ordered array that contains the list of all mesh points.
+   * \param[in] inversePointList - Array utilized to access the index of each point in pointList.
+   * \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
+   */
+  void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {
     for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
       const auto globalPoint = pointList[localPoint];
       inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
     }
-      
+
     pointList = std::move(inversePointList);
   }
 
- void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets) override
-  {
+  /*!
+   * \brief Splits the points into levels, reorders them by level and (in CUDA builds) groups the levels into chains.
+   * \param[in,out] pointList - Ordered array that contains the list of all mesh points.
+   * \param[out] levelOffsets - Vector array containing the ordered list of starting rows of each level.
+   * \param[out] chainPtr - Vector array containing the ordered list of starting levels of each chain.
+   */
+  void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets,
+                 vector<ScalarType>& chainPtr) override {
     vector<ScalarType> inversePointList;
     inversePointList.reserve(nPointDomain);
     levels.reserve(nPointDomain);
@@ -111,29 +155,34 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
 
       for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
         const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);
-        
+
         if (adjGlobalPoint < nPointDomain) {
           const auto adjLocalPoint = inversePointList[adjGlobalPoint];
-          
+
           if (adjLocalPoint < localPoint) {
-          levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
+            levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
+          }
         }
-       }
       }
 
       nLevels = std::max(nLevels, levels[localPoint] + 1);
-    }  
+    }
 
     levelOffsets.resize(nLevels + 1);
-    for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++)  ++levelOffsets[levels[iPoint] + 1];
+    for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
+      ++levelOffsets[levels[iPoint] + 1];
+    }
 
     for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
       levelOffsets[iLevel] += levelOffsets[iLevel - 1];
     }
 
-    for(auto elem = levelOffsets.begin(); elem != (levelOffsets.end() - 1); elem++) maxLevelWidth = std::max(*(elem+1) - *elem, maxLevelWidth);
-    
     Reorder(pointList, inversePointList, levelOffsets);
+
+#ifdef HAVE_CUDA
+    CalculateChain(levelOffsets, chainPtr, 20);  // 20 rows per block; must match kernelParameters::ROWS_PER_BLOCK.
+#else
+    chainPtr.clear();  // Chains are only built in CUDA builds; leave the list empty otherwise.
+#endif
   }
 };
-
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -150,8 +150,8 @@ class CSysMatrix {
   const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
   const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
   unsigned long* d_partition_offsets;
-  bool useCuda;                   /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
-                                     Mainly used to conditionally free GPU memory in the class destructor. */
+  bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
+                   Mainly used to conditionally free GPU memory in the class destructor. */
 
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
@@ -28,48 +28,72 @@
 #include<cuda_runtime.h>
 #include<iostream>
 
+/*! \brief Namespace that contains variables and helper functions that are
+    utilized to launch CUDA Kernels. */
 namespace kernelParameters{
 
-  /*Returns the rounded up value of the decimal quotient to the next integer (in all cases)*/
+
+  /*!
+   * \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases).
+   */
   inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
 
-  /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/
+  /*!
+   * \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases).
+   */
   inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }
 
-  static constexpr short MVP_BLOCK_SIZE = 256;
-  static constexpr short MVP_WARP_SIZE = 32;  
+  static constexpr short BLOCK_SIZE = 640;
+  static constexpr short WARP_SIZE = 32;
+  static constexpr short ROWS_PER_BLOCK = rounded_up_division(WARP_SIZE, BLOCK_SIZE);
+
 
 };
 
+/*! \brief Structure containing information related to the Jacobian Matrix
+    which is utilized by any launched Kernel. */
 struct matrixParameters{
 
   public:
-    unsigned long totalRows;
-    unsigned short blockRowSize;
-    unsigned short blockColSize;
-    unsigned long nPartition; 
-    unsigned short blockSize;
-    unsigned short rowsPerBlock;
-    unsigned short activeThreads;
+    unsigned long totalRows;        /*!< \brief Contains the total number of rows of the Jacobian Matrix. */
+    unsigned short blockRowSize;    /*!< \brief Contains the row dimensions of the blocks of the Jacobian Matrix. */
+    unsigned short blockColSize;    /*!< \brief Contains the column dimensions of the blocks of the Jacobian Matrix. */
+    unsigned int nChainStart;       /*!< \brief Starting partition of the current chain. */
+    unsigned int nChainEnd;         /*!< \brief Ending partition of the current chain. */
+    unsigned short blockSize;       /*!< \brief Contains the total number of elements in each block of the Jacobian Matrix. */
+    unsigned short activeThreads;   /*!< \brief Contains the number of active threads per iteration during MVP - depending on the
+                                        dimensions of the Jacobian Matrix. */
 
     matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions){
       totalRows = nPointDomain;
       blockRowSize = nEqn;
       blockColSize = nVar;
-      nPartition = nPartitions;
+      nChainStart = 0;
+      nChainEnd = 0;
       blockSize = nVar * nEqn;
-      rowsPerBlock = kernelParameters::rounded_up_division(kernelParameters::MVP_WARP_SIZE, kernelParameters::MVP_BLOCK_SIZE);
-      activeThreads = nVar * (kernelParameters::MVP_WARP_SIZE/nVar);
+      activeThreads = nVar * (kernelParameters::WARP_SIZE/nVar);
     }
 
+    /*!
+    * \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels.
+    */
     __device__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
       return (localRow * blockSize + threadNo);
     }
 
+    /*!
+    * \brief Returns a boolean value to check whether the row is under the total number of rows and if the
+    *        thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
+    */
     __device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
       return (row<totalRows && threadNo<threadLimit);
     }
 
+    /*!
+    * \brief Returns a boolean value to check whether the row is part of the parallel partition being executed and if the
+    *        thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
+    * \param[in] rowInPartition - Represents a boolean that indicates the presence/absence of the row in the partition.
+    */
     __device__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
       return (rowInPartition && threadNo<threadLimit);
     }
diff --git a/Common/src/geometry/CPhysicalGeometry.cpp b/Common/src/geometry/CPhysicalGeometry.cpp
@@ -708,7 +708,7 @@ void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector<ScalarType>
   switch (KindAlgorithm) {
     case LEVEL_SCHEDULING:
       auto levelSchedule = CLevelScheduling<ScalarType>(nPointDomain, nodes);
-      levelSchedule.Partition(pointList, partitionOffsets);
+      levelSchedule.Partition(pointList, partitionOffsets, chainPtr);
       nPartition = levelSchedule.nLevels;
       maxPartitionSize = levelSchedule.maxLevelWidth;
       break;
@@ -4558,6 +4558,7 @@ void CPhysicalGeometry::SetRCM_Ordering(CConfig* config) {
     if (!status) SU2_MPI::Error("RCM ordering failed", CURRENT_FUNCTION);
   }
 
+  /*Partition graph into parallel constituents for GPU Operations*/
   if (config->GetCUDA()) PartitionGraph(config, Result);
 
   /*--- Add the MPI points ---*/
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -153,7 +153,7 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
       ptr = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(num * sizeof(ScalarType));
     };
 
-      auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
+    auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
       ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
     };
 
@@ -165,8 +165,7 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
     GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
     GPUAllocAndCopy(d_col_ind, col_ind, nnz);
     GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain);
-    GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition);
-
+    GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition + 1);
   }
 
   if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu