su2code
diff --git a/‎Common/include/CConfig.hpp‎
Lines changed: 7 additions & 0 deletions b/‎Common/include/CConfig.hpp‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎Common/include/linear_algebra/CMatrixVectorProduct.hpp‎
Lines changed: 13 additions & 1 deletion b/‎Common/include/linear_algebra/CMatrixVectorProduct.hpp‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎Common/include/linear_algebra/CSysMatrix.hpp‎
Lines changed: 55 additions & 0 deletions b/‎Common/include/linear_algebra/CSysMatrix.hpp‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎Common/include/linear_algebra/CSysVector.hpp‎
Lines changed: 26 additions & 0 deletions b/‎Common/include/linear_algebra/CSysVector.hpp‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎Common/include/linear_algebra/GPUComms.cuh‎
Lines changed: 53 additions & 0 deletions b/‎Common/include/linear_algebra/GPUComms.cuh‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎Common/include/toolboxes/allocation_toolbox.hpp‎
Lines changed: 53 additions & 0 deletions b/‎Common/include/toolboxes/allocation_toolbox.hpp‎
Lines changed: 53 additions & 0 deletions
diff --git a/‎Common/src/CConfig.cpp‎
Lines changed: 2 additions & 0 deletions b/‎Common/src/CConfig.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Common/src/linear_algebra/CSysMatrix.cpp‎
Lines changed: 30 additions & 7 deletions b/‎Common/src/linear_algebra/CSysMatrix.cpp‎
Lines changed: 30 additions & 7 deletions
@@ -132,6 +132,7 @@ class CConfig {
   Sens_Remove_Sharp,        /*!< \brief Flag for removing or not the sharp edges from the sensitivity computation. */
   Hold_GridFixed,           /*!< \brief Flag hold fixed some part of the mesh during the deformation. */
   Axisymmetric,             /*!< \brief Flag for axisymmetric calculations */
+  Enable_Cuda,              /*!< \brief Flag for switching GPU computing*/
   Integrated_HeatFlux;      /*!< \brief Flag for heat flux BC whether it deals with integrated values.*/
   su2double Buffet_k;       /*!< \brief Sharpness coefficient for buffet sensor.*/
   su2double Buffet_lambda;  /*!< \brief Offset parameter for buffet sensor.*/
@@ -6192,6 +6193,12 @@ class CConfig {
    */
   bool GetAxisymmetric(void) const { return Axisymmetric; }
 
+  /*!
+   * \brief Get the value of the ENABLE_CUDA option (GPU acceleration switch).
+   * \return <code>TRUE</code> if cuda is enabled; otherwise <code>FALSE</code>.
+   */
+  bool GetCUDA(void) const { return Enable_Cuda; }
+
   /*!
    * \brief Subtract one to the index of the finest grid (full multigrid strategy).
    * \return Change the index of the finest grid.
 
@@ -51,6 +51,7 @@
  * passed to a single implementation of the Krylov solvers.
  * This abstraction may also be used to define matrix-free products.
  */
+
 template <class ScalarType>
 class CMatrixVectorProduct {
  public:
@@ -94,6 +95,17 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
    * \param[out] v - CSysVector that is the result of the product
    */
   inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
-    matrix.MatrixVectorProduct(u, v, geometry, config);
+    if (config->GetCUDA()) { /*--- GPU product requested through the configuration. ---*/
+#ifdef HAVE_CUDA
+      matrix.GPUMatrixVectorProduct(u, v, geometry, config);
+#else /*--- Built without HAVE_CUDA: report the inconsistent setup through SU2_MPI::Error. ---*/
+      SU2_MPI::Error(
+          "\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA "
+          "options enabled in Meson to access GPU Functions",
+          CURRENT_FUNCTION);
+#endif
+    } else { /*--- Default (non-CUDA) product. ---*/
+      matrix.MatrixVectorProduct(u, v, geometry, config);
+    }
   }
 };
@@ -145,6 +145,12 @@ class CSysMatrix {
   const unsigned long* col_ind; /*!< \brief Column index for each of the elements in val(). */
   const unsigned long* col_ptr; /*!< \brief The transpose of col_ind, pointer to blocks with the same column index. */
 
+  ScalarType* d_matrix = nullptr;           /*!< \brief Device Pointer to store the matrix values on the GPU. */
+  const unsigned long* d_row_ptr = nullptr; /*!< \brief Device Pointers to the first element in each row. */
+  const unsigned long* d_col_ind = nullptr; /*!< \brief Device Column index for each of the elements in val(). */
+  bool useCuda = false;                     /*!< \brief Whether the user enabled CUDA; gates freeing GPU memory in the
+                                               destructor. False until Initialize() sets it, so nothing is freed early. */
+
   ScalarType* ILU_matrix;           /*!< \brief Entries of the ILU sparse matrix. */
   unsigned long nnz_ilu;            /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
   const unsigned long* row_ptr_ilu; /*!< \brief Pointers to the first element in each row (ILU). */
@@ -391,6 +397,12 @@ class CSysMatrix {
    */
   void SetValDiagonalZero(void);
 
+  /*!
+   * \brief Performs the memory copy from host to device.
+   * \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
+   */
+  void HtDTransfer(bool trigger = true) const;
+
   /*!
    * \brief Get a pointer to the start of block "ij"
    * \param[in] block_i - Row index.
@@ -838,6 +850,49 @@ class CSysMatrix {
   void MatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
                            const CConfig* config) const;
 
+  /*!
+   * \brief GPU variant of MatrixVectorProduct, selected by CSysMatrixVectorProduct when CUDA is enabled in the config.
+   * \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
+   * \param[out] prod - Result of the product.
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   */
+  void GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
+                              const CConfig* config) const;
+
+  /*!
+   * \brief Performs first step of the LU_SGS Preconditioner building
+   * \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+  void GPUFirstSymmetricIteration(ScalarType& vec, ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+   * \brief Performs second step of the LU_SGS Preconditioner building
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+  void GPUSecondSymmetricIteration(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+   * \brief Performs Gaussian Elimination between diagonal blocks of the matrix and the prod vector
+   * \param[in] geometry - Geometrical definition of the problem.
+   * \param[in] config - Definition of the particular problem.
+   * \param[out] prod - Result of the product.
+   */
+  void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
+
+  /*!
+   * \brief Multiply CSysVector by the preconditioner all of which are stored on the device
+   * \param[in] vec - CSysVector to be multiplied by the preconditioner.
+   * \param[out] prod - Result of the product A*vec.
+   */
+  void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry,
+                                      const CConfig* config) const;
+
   /*!
    * \brief Build the Jacobi preconditioner.
    */
 
@@ -32,6 +32,7 @@
 #include "../parallelization/omp_structure.hpp"
 #include "../parallelization/vectorization.hpp"
 #include "vector_expressions.hpp"
+#include "../../include/CConfig.hpp"
 
 /*!
  * \brief OpenMP worksharing construct used in CSysVector for loops.
@@ -71,6 +72,8 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
   unsigned long nElmDomain = 0; /*!< \brief Total number of elements without Ghost cells. */
   unsigned long nVar = 1;       /*!< \brief Number of elements in a block. */
 
+  ScalarType* d_vec_val = nullptr; /*!< \brief Device Pointer to store the vector values on the GPU. */
+
   /*!
    * \brief Generic initialization from a scalar or array.
    * \note If val==nullptr vec_val is not initialized, only allocated.
@@ -195,6 +198,29 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
     END_CSYSVEC_PARFOR
   }
 
+  /*!
+   * \brief Performs the memory copy from host to device.
+   * \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
+   */
+  void HtDTransfer(bool trigger = true) const;
+
+  /*!
+   * \brief Performs the memory copy from device to host.
+   * \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
+   */
+  void DtHTransfer(bool trigger = true) const;
+
+  /*!
+   * \brief Sets all the elements of the GPU vector to a certain value
+   * \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
+   */
+  void GPUSetVal(ScalarType val, bool trigger = true) const;
+
+  /*!
+   * \brief return device pointer that points to the CSysVector values in GPU memory
+   */
+  inline ScalarType* GetDevicePointer() const { return d_vec_val; }
+
   /*!
    * \brief return the number of local elements in the CSysVector
    */
 
@@ -0,0 +1,53 @@
+/*!
+\file GPUComms.cuh
+* \brief Header file containing universal functions that provide basic and essential utilities for other GPU processes
+* \author A. Raj
+* \version 8.2.0 "Harrier"
+*
+* SU2 Project Website: https://su2code.github.io
+*
+* The SU2 Project is maintained by the SU2 Foundation
+* (http://su2foundation.org)
+*
+* Copyright 2012-2024, SU2 Contributors (cf. AUTHORS.md)
+*
+* SU2 is free software; you can redistribute it and/or
+* modify it under the terms of the GNU Lesser General Public
+* License as published by the Free Software Foundation; either
+* version 2.1 of the License, or (at your option) any later version.
+*
+* SU2 is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public
+* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+namespace KernelParameters {
+
+/*! \brief Integer division of x by multiple, rounded up (intended for non-negative x and positive multiple). */
+inline constexpr int round_up_division(const int multiple, int x) { return (x + multiple - 1) / multiple; }
+
+/*--- Kernel constants; "MVP" presumably stands for matrix-vector product (usage not visible here - TODO confirm). ---*/
+static constexpr int MVP_BLOCK_SIZE = 1024;
+static constexpr int MVP_WARP_SIZE = 32;
+
+}  // namespace KernelParameters
+/*!
+* \brief Assert-style check of the result code returned by a CUDA API call.
+*        On failure, prints the CUDA error string, the file and the line to stderr.
+* \param[in] code - result code of the CUDA function
+* \param[in] file - name of the file containing the call
+* \param[in] line - line containing the call
+* \param[in] abort - if true (default), exit(code) is called after reporting the failure
+*/
+inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
+  if (code == cudaSuccess) return;
+
+  fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+  if (abort) exit(code);
+}
+
+/*! \brief Checks a CUDA call through gpuAssert, recording the call site. The do/while(0) wrapper makes the macro
+*          expand to a single statement, so it is safe inside unbraced if/else. Use it with a trailing semicolon. */
+#define gpuErrChk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
@@ -36,6 +36,10 @@
 #include <stdlib.h>
 #endif
 
+#ifdef HAVE_CUDA
+#include "../linear_algebra/GPUComms.cuh"
+#endif
+
 #include <cstring>
 
 #include <cassert>
@@ -90,3 +94,52 @@ inline void aligned_free(T* ptr) noexcept {
 }
 
 }  // namespace MemoryAllocation
+
+namespace GPUMemoryAllocation {
+/*!
+ * \brief Memory allocation for variables on the GPU.
+ * \note When compiled without HAVE_CUDA nothing is allocated and nullptr is returned.
+ * \param[in] size in bytes.
+ * \tparam ZeroInit, initialize memory to 0.
+ * \return Pointer to memory, always use gpu_free to deallocate.
+ */
+template <class T, bool ZeroInit = false>
+inline T* gpu_alloc(size_t size) noexcept {
+  void* ptr = nullptr;
+
+#ifdef HAVE_CUDA
+  gpuErrChk(cudaMalloc(&ptr, size));
+  /*--- cudaMemset takes an int byte value, hence the integer literal. ---*/
+  if (ZeroInit) {
+    gpuErrChk(cudaMemset(ptr, 0, size));
+  }
+#else
+  (void)size;
+#endif
+
+  return static_cast<T*>(ptr);
+}
+
+/*!
+ * \brief Free memory allocated on the GPU with gpu_alloc or gpu_alloc_cpy.
+ * \note No-op when compiled without HAVE_CUDA.
+ * \param[in] ptr, pointer to memory we want to release.
+ */
+template <class T>
+inline void gpu_free(T* ptr) noexcept {
+#ifdef HAVE_CUDA
+  /*--- T may be const-qualified (e.g. device index arrays), whereas cudaFree expects a plain void*. ---*/
+  gpuErrChk(cudaFree(const_cast<void*>(static_cast<const void*>(ptr))));
+#else
+  (void)ptr;
+#endif
+}
+
+/*!
+ * \brief Memory allocation for variables on the GPU along with initialization from a source host array.
+ * \note When compiled without HAVE_CUDA nothing is allocated or copied and nullptr is returned.
+ * \param[in] src_ptr, host array copied to the new device allocation (size bytes are read).
+ * \param[in] size in bytes.
+ * \return Pointer to memory, always use gpu_free to deallocate.
+ */
+template <class T>
+inline T* gpu_alloc_cpy(const T* src_ptr, size_t size) noexcept {
+  void* ptr = nullptr;
+
+#ifdef HAVE_CUDA
+  gpuErrChk(cudaMalloc(&ptr, size));
+  gpuErrChk(cudaMemcpy(ptr, static_cast<const void*>(src_ptr), size, cudaMemcpyHostToDevice));
+#else
+  (void)src_ptr;
+  (void)size;
+#endif
+
+  return static_cast<T*>(ptr);
+}
+}  // namespace GPUMemoryAllocation
@@ -1149,6 +1149,8 @@ void CConfig::SetConfig_Options() {
 
   /*\brief AXISYMMETRIC \n DESCRIPTION: Axisymmetric simulation \n DEFAULT: false \ingroup Config */
   addBoolOption("AXISYMMETRIC", Axisymmetric, false);
+  /*\brief ENABLE_CUDA \n DESCRIPTION: GPU Acceleration \n DEFAULT: false \ingroup Config */
+  addBoolOption("ENABLE_CUDA", Enable_Cuda, false);
   /* DESCRIPTION: Add the gravity force */
   addBoolOption("GRAVITY_FORCE", GravityForce, false);
   /* DESCRIPTION: Add the Vorticity Confinement term*/
 
@@ -68,6 +68,12 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
   MemoryAllocation::aligned_free(matrix);
   MemoryAllocation::aligned_free(invM);
 
+  if (useCuda) {
+    GPUMemoryAllocation::gpu_free(d_matrix);
+    GPUMemoryAllocation::gpu_free(d_row_ptr);
+    GPUMemoryAllocation::gpu_free(d_col_ind);
+  }
+
 #ifdef USE_MKL
   mkl_jit_destroy(MatrixMatrixProductJitter);
   mkl_jit_destroy(MatrixVectorProductJitterBetaZero);
@@ -131,6 +137,30 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
   col_ind = csr.innerIdx();
   dia_ptr = csr.diagPtr();
 
+  /*--- Allocate data. ---*/
+  auto allocAndInit = [](ScalarType*& ptr, unsigned long num) {
+    ptr = MemoryAllocation::aligned_alloc<ScalarType, true>(64, num * sizeof(ScalarType));
+  };
+
+  allocAndInit(matrix, nnz * nVar * nEqn);
+
+  useCuda = config->GetCUDA();
+
+  if (useCuda) {
+    /*--- Allocate GPU data: zero-initialized block storage and device copies of the CSR index arrays. ---*/
+    auto GPUAllocAndInit = [](ScalarType*& ptr, unsigned long num) {
+      ptr = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(num * sizeof(ScalarType));
+    };
+
+    auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long* src_ptr, unsigned long num) {
+      ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(unsigned long));
+    };
+
+    GPUAllocAndInit(d_matrix, nnz * nVar * nEqn);
+    GPUAllocAndCopy(d_row_ptr, row_ptr, nPointDomain + 1);
+    GPUAllocAndCopy(d_col_ind, col_ind, nnz);
+  }
+
   if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
 
   if (type == ConnectivityType::FiniteVolume) {
@@ -151,13 +181,6 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
     nnz_ilu = csr_ilu.getNumNonZeros();
   }
 
-  /*--- Allocate data. ---*/
-  auto allocAndInit = [](ScalarType*& ptr, unsigned long num) {
-    ptr = MemoryAllocation::aligned_alloc<ScalarType, true>(64, num * sizeof(ScalarType));
-  };
-
-  allocAndInit(matrix, nnz * nVar * nEqn);
-
   /*--- Preconditioners. ---*/
 
   if (ilu_needed) allocAndInit(ILU_matrix, nnz_ilu * nVar * nEqn);