Introduced Memory Wrapper Functions, Error Handling and Constant Block Sizes

areenraj · areenraj · commit b7ed530933aa · 2025-05-08T18:00:10.000+05:30
diff --git a/Common/include/linear_algebra/CMatrixVectorProduct.hpp b/Common/include/linear_algebra/CMatrixVectorProduct.hpp
@@ -103,9 +103,10 @@ class CGpuExecution : public CExecutionPath<ScalarType> {
 #ifdef HAVE_CUDA
     matrix.GPUMatrixVectorProduct(u, v, geometry, config);
 #else
-    std::cerr << "\nError in launching Matrix-Vector Product Function\n";
-    std::cerr << "ENABLE_CUDA is set to YES\n";
-    std::cerr << "Please compile with CUDA options enabled in Meson to access GPU Functions" << std::endl;
+    SU2_MPI::Error(
+        "\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA "
+        "options enabled in Meson to access GPU Functions",
+        CURRENT_FUNCTION);
 #endif
   }
 };
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -145,9 +145,9 @@ class CSysMatrix {
   const unsigned long* col_ind; /*!< \brief Column index for each of the elements in val(). */
   const unsigned long* col_ptr; /*!< \brief The transpose of col_ind, pointer to blocks with the same column index. */
 
-  ScalarType* d_matrix;     /*!< \brief Device Pointer to store the matrix values on the GPU. */
-  unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
-  unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
+  ScalarType* d_matrix;           /*!< \brief Device Pointer to store the matrix values on the GPU. */
+  const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
+  const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
 
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -859,9 +859,41 @@ class CSysMatrix {
   void GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
                               const CConfig* config) const;
 
-  void FGMRESMainLoop(std::vector<ScalarType> W, std::vector<ScalarType> Z, su2vector<ScalarType>& g,
-                      su2vector<ScalarType>& sn, CSysVector<ScalarType>& cs, su2vector<ScalarType>& y,
-                      su2vector<ScalarType>& H, int m, CGeometry* geometry, const CConfig* config) const;
+  /*!
+   * \brief Performs first step of the LU_SGS Preconditioner building
+   * \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+
+  void GPUFirstSymmetricIteration(ScalarType& vec, ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+   * \brief Performs second step of the LU_SGS Preconditioner building
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+
+  void GPUSecondSymmetricIteration(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+   * \brief Performs Gaussian Elimination between diagonal blocks of the matrix and the prod vector
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+
+  void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+   * \brief Multiply CSysVector by the preconditioner all of which are stored on the device
+   * \param[in] vec - CSysVector to be multiplied by the preconditioner.
+   * \param[out] prod - Result of the product A*vec.
+   */
+  void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry,
+                                      const CConfig* config) const;
 
   /*!
    * \brief Build the Jacobi preconditioner.
diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
@@ -28,6 +28,13 @@
 #include<cuda_runtime.h>
 #include"iostream"
 
+namespace KernelParameters{
+  /*! \brief Integer division of x by multiple, rounded up (valid for x >= 0 and multiple > 0). */
+  inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); }
+  /*--- NOTE(review): the MVP kernel uses a literal 32 per row; re-check grid coverage before changing these values. ---*/
+  const int MVP_BLOCK_SIZE = 1024; /*!< \brief blockDim.x used to launch the matrix-vector product kernel. */
+  const int MVP_WARP_SIZE = 32; /*!< \brief Divisor applied to nPointDomain when sizing the matrix-vector product grid. */
+}
 /*!
   * \brief assert style function that reads return codes after intercepting CUDA API calls.
   *        It returns the result code and its location if the call is unsuccessful.
@@ -45,4 +52,4 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
   }
 }
 
-#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
+#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
diff --git a/Common/include/toolboxes/allocation_toolbox.hpp b/Common/include/toolboxes/allocation_toolbox.hpp
@@ -36,6 +36,10 @@
 #include <stdlib.h>
 #endif
 
+#ifdef HAVE_CUDA
+#include "../linear_algebra/GPUComms.cuh"
+#endif
+
 #include <cstring>
 
 #include <cassert>
@@ -90,3 +94,55 @@ inline void aligned_free(T* ptr) noexcept {
 }
 
 }  // namespace MemoryAllocation
+
+namespace GPUMemoryAllocation {
+/*!
+ * \brief Memory allocation for variables on the GPU.
+ * \param[in] size in bytes.
+ * \tparam T, type the returned pointer is cast to.
+ * \tparam ZeroInit, initialize memory to 0.
+ * \return Pointer to memory (nullptr without HAVE_CUDA), always use gpu_free to deallocate.
+ */
+template <class T, bool ZeroInit = false>
+inline T* gpu_alloc(size_t size) noexcept {
+  void* ptr = nullptr;
+
+#ifdef HAVE_CUDA
+  gpuErrChk(cudaMalloc(&ptr, size));
+  /*--- cudaMemset takes an int that is written to every byte. ---*/
+  if (ZeroInit) gpuErrChk(cudaMemset(ptr, 0, size));
+#endif
+
+  return static_cast<T*>(ptr);
+}
+
+/*!
+ * \brief Free memory allocated on the GPU with gpu_alloc or gpu_alloc_cpy.
+ * \param[in] ptr, pointer to memory we want to release.
+ * \tparam T, pointee type (pointers to const are accepted).
+ */
+template <class T>
+inline void gpu_free(T* ptr) noexcept {
+#ifdef HAVE_CUDA
+  gpuErrChk(cudaFree((void*)ptr));
+#endif
+}
+
+/*!
+ * \brief Memory allocation for variables on the GPU along with initialization from a source host array.
+ * \param[in] src_ptr, host array copied to the new device allocation.
+ * \param[in] size in bytes, of both the allocation and the copy.
+ * \return Pointer to memory (nullptr without HAVE_CUDA), always use gpu_free to deallocate.
+ */
+template <class T>
+inline T* gpu_alloc_cpy(T* src_ptr, size_t size) noexcept {
+  void* ptr = nullptr;
+
+#ifdef HAVE_CUDA
+  gpuErrChk(cudaMalloc(&ptr, size));
+  gpuErrChk(cudaMemcpy(ptr, (const void*)src_ptr, size, cudaMemcpyHostToDevice));
+#endif
+
+  return static_cast<T*>(ptr);
+}
+}  // namespace GPUMemoryAllocation
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -32,10 +32,6 @@
 
 #include <cmath>
 
-#ifdef HAVE_CUDA
-#include "../../include/linear_algebra/GPUComms.cuh"
-#endif
-
 template <class ScalarType>
 CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::GetSize()) {
   nPoint = nPointDomain = nVar = nEqn = 0;
@@ -71,11 +67,10 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
   MemoryAllocation::aligned_free(ILU_matrix);
   MemoryAllocation::aligned_free(matrix);
   MemoryAllocation::aligned_free(invM);
-#ifdef HAVE_CUDA
-  cudaFree(d_matrix);
-  cudaFree(d_row_ptr);
-  cudaFree(d_col_ind);
-#endif
+
+  GPUMemoryAllocation::gpu_free(d_matrix);
+  GPUMemoryAllocation::gpu_free(d_row_ptr);
+  GPUMemoryAllocation::gpu_free(d_col_ind);
 
 #ifdef USE_MKL
   mkl_jit_destroy(MatrixMatrixProductJitter);
@@ -147,15 +142,17 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
 
   allocAndInit(matrix, nnz * nVar * nEqn);
 
-#if defined(HAVE_CUDA)
-  gpuErrChk(cudaMalloc((void**)(&d_row_ptr), (sizeof(row_ptr) * (nPointDomain + 1.0))));
-  gpuErrChk(cudaMalloc((void**)(&d_col_ind), (sizeof(col_ind) * nnz)));
-  gpuErrChk(cudaMalloc((void**)(&d_matrix), (sizeof(ScalarType) * nnz * nVar * nEqn)));
+  auto GPUAllocAndInit = [](ScalarType*& ptr, unsigned long num) {
+    ptr = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(num * sizeof(ScalarType));
+  };
 
-  gpuErrChk(
-      cudaMemcpy((void*)(d_row_ptr), (void*)row_ptr, (sizeof(row_ptr) * (nPointDomain + 1.0)), cudaMemcpyHostToDevice));
-  gpuErrChk(cudaMemcpy((void*)(d_col_ind), (void*)col_ind, (sizeof(col_ind)) * nnz, cudaMemcpyHostToDevice));
-#endif
+  /*--- Allocate a device index array and fill it with num entries copied from the host array src_ptr. ---*/
+  auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
+    ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
+  };
+  GPUAllocAndInit(d_matrix, nnz * nVar * nEqn);
+  GPUAllocAndCopy(d_row_ptr, row_ptr, nPointDomain + 1);  // integer count; no detour through double
+  GPUAllocAndCopy(d_col_ind, col_ind, nnz);
 
   if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
 
diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu
@@ -1,5 +1,5 @@
 /*!
- * \file GPUMatrix.cu
+ * \file CSysMatrixGPU.cu
  * \brief Implementations of Kernels and Functions for Matrix Operations on the GPU
  * \author A. Raj
  * \version 8.1.0 "Harrier"
@@ -29,7 +29,7 @@
 #include "../../include/linear_algebra/GPUComms.cuh"
 
 template<typename matrixType, typename vectorType>
-__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, unsigned long* d_row_ptr, unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn)
+__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn)
 {
    int row = (blockIdx.x * blockDim.x + threadIdx.x)/32;
    int threadNo = threadIdx.x%32;
@@ -74,10 +74,9 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
    vec.HtDTransfer();
    prod.GPUSetVal(0.0);
 
-  dim3 blockDim(1024,1,1);
-  double gridx = (double) nPointDomain/32.0;
-  gridx = double(ceil(gridx));
-  dim3 gridDim(gridx, 1.0, 1.0);
+  dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1);
+  int gridx = KernelParameters::round_up_division(KernelParameters::MVP_WARP_SIZE, nPointDomain);
+  dim3 gridDim(gridx, 1, 1);
 
   GPUMatrixVectorProductAdd<<<gridDim, blockDim>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn);
   gpuErrChk( cudaPeekAtLastError() );
diff --git a/Common/src/linear_algebra/CSysVector.cpp b/Common/src/linear_algebra/CSysVector.cpp
@@ -28,10 +28,6 @@
 #include "../../include/linear_algebra/CSysVector.hpp"
 #include "../../include/toolboxes/allocation_toolbox.hpp"
 
-#ifdef HAVE_CUDA
-#include "../../include/linear_algebra/GPUComms.cuh"
-#endif
-
 template <class ScalarType>
 void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar,
                                         const ScalarType* val, bool valIsArray, bool errorIfParallel) {
@@ -56,10 +52,7 @@ void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numB
 
   if (vec_val == nullptr) vec_val = MemoryAllocation::aligned_alloc<ScalarType, true>(64, nElm * sizeof(ScalarType));
 
-#if defined(HAVE_CUDA)
-  gpuErrChk(cudaMalloc((void**)(&d_vec_val), (sizeof(ScalarType) * nElm)));
-  gpuErrChk(cudaMemset((void*)d_vec_val, 0.0, (sizeof(ScalarType) * nElm)));
-#endif
+  d_vec_val = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(nElm * sizeof(ScalarType));
 
   if (val != nullptr) {
     if (!valIsArray) {
@@ -75,9 +68,8 @@ CSysVector<ScalarType>::~CSysVector() {
   if (!std::is_trivial<ScalarType>::value)
     for (auto i = 0ul; i < nElm; i++) vec_val[i].~ScalarType();
   MemoryAllocation::aligned_free(vec_val);
-#ifdef HAVE_CUDA
-  cudaFree(d_vec_val);
-#endif
+
+  GPUMemoryAllocation::gpu_free(d_vec_val);
 }
 
 /*--- Explicit instantiations ---*/
diff --git a/Common/src/linear_algebra/CSysVectorGPU.cu b/Common/src/linear_algebra/CSysVectorGPU.cu
@@ -1,5 +1,5 @@
 /*!
- * \file GPUVector.cu
+ * \file CSysVectorGPU.cu
  * \brief Implementations of Kernels and Functions for Vector Operations on the GPU
  * \author A. Raj
  * \version 8.1.0 "Harrier"
@@ -25,8 +25,8 @@
  * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
  */
 
- #include "../../include/linear_algebra/CSysVector.hpp"
-  #include "../../include/linear_algebra/GPUComms.cuh"
+#include "../../include/linear_algebra/CSysVector.hpp"
+#include "../../include/linear_algebra/GPUComms.cuh"
 
 template<class ScalarType>
 void CSysVector<ScalarType>::HtDTransfer(bool trigger) const
@@ -46,4 +46,4 @@ void CSysVector<ScalarType>::GPUSetVal(ScalarType val, bool trigger) const
    if(trigger) gpuErrChk(cudaMemset((void*)(d_vec_val), val, (sizeof(ScalarType)*nElm)));
 }
 
-template class CSysVector<su2double>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
+template class CSysVector<su2double>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
diff --git a/Common/src/linear_algebra/meson.build b/Common/src/linear_algebra/meson.build
@@ -2,7 +2,7 @@ common_src += files(['CSysSolve_b.cpp',
                      'CSysSolve.cpp',
                      'CSysVector.cpp',
                      'CSysMatrix.cpp',
-                     'GPUMatrix.cu',
-                     'GPUVector.cu',
+                     'CSysMatrixGPU.cu',
+                     'CSysVectorGPU.cu',
                      'CPastixWrapper.cpp',
                      'blas_structure.cpp'])