resolving conflicts

areenraj · areenraj · commit 9dd302893d60 · 2025-06-25T19:27:59.000+05:30
diff --git a/Common/include/linear_algebra/CMatrixVectorProduct.hpp b/Common/include/linear_algebra/CMatrixVectorProduct.hpp
@@ -50,12 +50,6 @@
  * handle the different types of matrix-vector products and still be
  * passed to a single implementation of the Krylov solvers.
  * This abstraction may also be used to define matrix-free products.
- *
- * There is also the use of a dummy class being made to select the
- * correct function as defined by the user while deciding between
- * CPU or GPU execution. This dummy class calls the correct member
- * functions from its derived classes to map the suitable path of
- * execution - CPU or GPU.
  */
 
 template <class ScalarType>
@@ -101,14 +95,17 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
    * \param[out] v - CSysVector that is the result of the product
    */
   inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
-#ifdef HAVE_CUDA
     if (config->GetCUDA()) {
+#ifdef HAVE_CUDA
       matrix.GPUMatrixVectorProduct(u, v, geometry, config);
+#else
+      SU2_MPI::Error(
+          "\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA "
+          "options enabled in Meson to access GPU Functions",
+          CURRENT_FUNCTION);
+#endif
     } else {
       matrix.MatrixVectorProduct(u, v, geometry, config);
     }
-#else
-    matrix.MatrixVectorProduct(u, v, geometry, config);
-#endif
   }
 };
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -148,7 +148,8 @@ class CSysMatrix {
   ScalarType* d_matrix;           /*!< \brief Device Pointer to store the matrix values on the GPU. */
   const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
   const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
-  const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
+  bool useCuda;                   /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
+                                     Mainly used to conditionally free GPU memory in the class destructor. */
 
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -856,11 +857,38 @@ class CSysMatrix {
    * \param[in] config - Definition of the particular problem.
    * \param[out] prod - Result of the product.
    */
-
   void GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
                               const CConfig* config) const;
 
   /*!
+<<<<<<< HEAD
+=======
+   * \brief Performs first step of the LU_SGS Preconditioner building
+   * \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+  void GPUFirstSymmetricIteration(ScalarType& vec, ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+   * \brief Performs second step of the LU_SGS Preconditioner building
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+  void GPUSecondSymmetricIteration(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+   * \brief Performs Gaussian Elimination between diagonal blocks of the matrix and the prod vector
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+  void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+>>>>>>> upstream/develop
    * \brief Multiply CSysVector by the preconditioner all of which are stored on the device
    * \param[in] vec - CSysVector to be multiplied by the preconditioner.
    * \param[out] prod - Result of the product A*vec.
diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
@@ -72,7 +72,7 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
   unsigned long nElmDomain = 0; /*!< \brief Total number of elements without Ghost cells. */
   unsigned long nVar = 1;       /*!< \brief Number of elements in a block. */
 
-  ScalarType* d_vec_val; /*!< \brief Device Pointer to store the vector values on the GPU. */
+  ScalarType* d_vec_val = nullptr; /*!< \brief Device Pointer to store the vector values on the GPU. */
 
   /*!
    * \brief Generic initialization from a scalar or array.
@@ -217,7 +217,7 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
   void GPUSetVal(ScalarType val, bool trigger = true) const;
 
   /*!
-   * \brief return the number of local elements in the CSysVector
+   * \brief return device pointer that points to the CSysVector values in GPU memory
    */
   inline ScalarType* GetDevicePointer() const { return d_vec_val; }
 
diff --git a/Common/include/linear_algebra/GPUComms.cuh b/Common/include/linear_algebra/GPUComms.cuh
@@ -2,7 +2,7 @@
 \file GPUComms.cuh
 * \brief Header file containing universal functions that provide basic and essential utilities for other GPU processes
 * \author A. Raj
-* \version 8.1.0 "Harrier"
+* \version 8.2.0 "Harrier"
 *
 * SU2 Project Website: https://su2code.github.io
 *
@@ -26,7 +26,7 @@
 */
 
 #include<cuda_runtime.h>
-#include"iostream"
+#include<iostream>
 
 namespace KernelParameters{
 
@@ -36,21 +36,19 @@ namespace KernelParameters{
   /*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/
   inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }   
 
-  const int MVP_BLOCK_SIZE = 1024;
-  const int MVP_WARP_SIZE = 32;
+static constexpr int MVP_BLOCK_SIZE = 1024;
+static constexpr int MVP_WARP_SIZE = 32;
 }
 /*!
-  * \brief assert style function that reads return codes after intercepting CUDA API calls.
-  *        It returns the result code and its location if the call is unsuccessful.
-  * \param[in] code - result code of CUDA function
-  * \param[in] file - name of file holding the function
-  * \param[in] line - line containing the function
-  */
+* \brief assert style function that reads return codes after intercepting CUDA API calls.
+*        It prints the error string and its location to stderr if the call is unsuccessful.
+* \param[in] code - result code of CUDA function
+* \param[in] file - name of file holding the function
+* \param[in] line - line containing the function
+*/
 
-inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
-{
-  if (code != cudaSuccess)
-  {
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
+  if (code != cudaSuccess){
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
diff --git a/Common/include/toolboxes/allocation_toolbox.hpp b/Common/include/toolboxes/allocation_toolbox.hpp
@@ -132,15 +132,12 @@ inline void gpu_free(T* ptr) noexcept {
  * \return Pointer to memory, always use gpu_free to deallocate.
  */
 template <class T>
-inline T* gpu_alloc_cpy(T* src_ptr, size_t size) noexcept {
+inline T* gpu_alloc_cpy(const T* src_ptr, size_t size) noexcept {
   void* ptr = nullptr;
 
 #ifdef HAVE_CUDA
   gpuErrChk(cudaMalloc((void**)(&ptr), size));
   gpuErrChk(cudaMemcpy((void*)(ptr), (void*)src_ptr, size, cudaMemcpyHostToDevice));
-  ;
-#else
-  return 0;
 #endif
 
   return static_cast<T*>(ptr);
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -68,9 +68,11 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
   MemoryAllocation::aligned_free(matrix);
   MemoryAllocation::aligned_free(invM);
 
-  GPUMemoryAllocation::gpu_free(d_matrix);
-  GPUMemoryAllocation::gpu_free(d_row_ptr);
-  GPUMemoryAllocation::gpu_free(d_col_ind);
+  if (useCuda) {
+    GPUMemoryAllocation::gpu_free(d_matrix);
+    GPUMemoryAllocation::gpu_free(d_row_ptr);
+    GPUMemoryAllocation::gpu_free(d_col_ind);
+  }
 
 #ifdef USE_MKL
   mkl_jit_destroy(MatrixMatrixProductJitter);
@@ -142,19 +144,23 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
 
   allocAndInit(matrix, nnz * nVar * nEqn);
 
-  auto GPUAllocAndInit = [](ScalarType*& ptr, unsigned long num) {
-    ptr = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(num * sizeof(ScalarType));
-  };
+  useCuda = config->GetCUDA();
 
-  auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
-    ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
-  };
+  if (useCuda) {
+    /*--- Allocate GPU data. ---*/
+    auto GPUAllocAndInit = [](ScalarType*& ptr, unsigned long num) {
+      ptr = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(num * sizeof(ScalarType));
+    };
+
+    auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
+      ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
+    };
+
+    GPUAllocAndInit(d_matrix, nnz * nVar * nEqn);
+    GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
+    GPUAllocAndCopy(d_col_ind, col_ind, nnz);
+  }
 
-  GPUAllocAndInit(d_matrix, nnz * nVar * nEqn);
-  GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
-  GPUAllocAndCopy(d_col_ind, col_ind, nnz);
-  GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain);
-  
   if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
 
   if (type == ConnectivityType::FiniteVolume) {
diff --git a/Common/src/linear_algebra/CSysMatrixGPU.cu b/Common/src/linear_algebra/CSysMatrixGPU.cu
@@ -2,7 +2,7 @@
  * \file CSysMatrixGPU.cu
  * \brief Implementations of Kernels and Functions for Matrix Operations on the GPU
  * \author A. Raj
- * \version 8.1.0 "Harrier"
+ * \version 8.2.0 "Harrier"
  *
  * SU2 Project Website: https://su2code.github.io
  *
diff --git a/Common/src/linear_algebra/CSysVectorGPU.cu b/Common/src/linear_algebra/CSysVectorGPU.cu
@@ -2,7 +2,7 @@
  * \file CSysVectorGPU.cu
  * \brief Implementations of Kernels and Functions for Vector Operations on the GPU
  * \author A. Raj
- * \version 8.1.0 "Harrier"
+ * \version 8.2.0 "Harrier"
  *
  * SU2 Project Website: https://su2code.github.io
  *
diff --git a/TestCases/gpu/flatplate/lam_flatplate.cfg b/TestCases/gpu/flatplate/lam_flatplate.cfg

Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`* \file CSysMatrixGPU.cu`
`3`	`3`	`* \brief Implementations of Kernels and Functions for Matrix Operations on the GPU`
`4`	`4`	`* \author A. Raj`
`5`		`- * \version 8.1.0 "Harrier"`
	`5`	`+ * \version 8.2.0 "Harrier"`
`6`	`6`	`*`
`7`	`7`	`* SU2 Project Website: https://su2code.github.io`
`8`	`8`	`*`
Original file line number	Diff line number	Diff line change
`@@ -2,7 +2,7 @@`
`2`	`2`	`* \file CSysVectorGPU.cu`
`3`	`3`	`* \brief Implementations of Kernels and Functions for Vector Operations on the GPU`
`4`	`4`	`* \author A. Raj`
`5`		`- * \version 8.1.0 "Harrier"`
	`5`	`+ * \version 8.2.0 "Harrier"`
`6`	`6`	`*`
`7`	`7`	`* SU2 Project Website: https://su2code.github.io`
`8`	`8`	`*`