Skip to content

Commit a0ab3da

Browse files
authored
Merge pull request #2346 from areenraj/master
added GPU acceleration of matrix-vector products
2 parents b49318e + 1a2b50b commit a0ab3da

File tree

16 files changed

+406
-10
lines changed

16 files changed

+406
-10
lines changed

Common/include/CConfig.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ class CConfig {
132132
Sens_Remove_Sharp, /*!< \brief Flag for removing or not the sharp edges from the sensitivity computation. */
133133
Hold_GridFixed, /*!< \brief Flag hold fixed some part of the mesh during the deformation. */
134134
Axisymmetric, /*!< \brief Flag for axisymmetric calculations */
135+
Enable_Cuda, /*!< \brief Flag for switching GPU computing*/
135136
Integrated_HeatFlux; /*!< \brief Flag for heat flux BC whether it deals with integrated values.*/
136137
su2double Buffet_k; /*!< \brief Sharpness coefficient for buffet sensor.*/
137138
su2double Buffet_lambda; /*!< \brief Offset parameter for buffet sensor.*/
@@ -6192,6 +6193,12 @@ class CConfig {
61926193
*/
61936194
bool GetAxisymmetric(void) const { return Axisymmetric; }
61946195

6196+
/*!
6197+
* \brief Get information about GPU support.
6198+
* \return <code>TRUE</code> if cuda is enabled; otherwise <code>FALSE</code>.
6199+
*/
6200+
bool GetCUDA(void) const { return Enable_Cuda; }
6201+
61956202
/*!
61966203
* \brief Subtract one to the index of the finest grid (full multigrid strategy).
61976204
* \return Change the index of the finest grid.

Common/include/linear_algebra/CMatrixVectorProduct.hpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
* passed to a single implementation of the Krylov solvers.
5252
* This abstraction may also be used to define matrix-free products.
5353
*/
54+
5455
template <class ScalarType>
5556
class CMatrixVectorProduct {
5657
public:
@@ -94,6 +95,17 @@ class CSysMatrixVectorProduct final : public CMatrixVectorProduct<ScalarType> {
9495
* \param[out] v - CSysVector that is the result of the product
9596
*/
9697
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
97-
matrix.MatrixVectorProduct(u, v, geometry, config);
98+
if (config->GetCUDA()) {
99+
#ifdef HAVE_CUDA
100+
matrix.GPUMatrixVectorProduct(u, v, geometry, config);
101+
#else
102+
SU2_MPI::Error(
103+
"\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA "
104+
"options enabled in Meson to access GPU Functions",
105+
CURRENT_FUNCTION);
106+
#endif
107+
} else {
108+
matrix.MatrixVectorProduct(u, v, geometry, config);
109+
}
98110
}
99111
};

Common/include/linear_algebra/CSysMatrix.hpp

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,12 @@ class CSysMatrix {
145145
const unsigned long* col_ind; /*!< \brief Column index for each of the elements in val(). */
146146
const unsigned long* col_ptr; /*!< \brief The transpose of col_ind, pointer to blocks with the same column index. */
147147

148+
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
149+
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
150+
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
151+
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
152+
Mainly used to conditionally free GPU memory in the class destructor. */
153+
148154
ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
149155
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
150156
const unsigned long* row_ptr_ilu; /*!< \brief Pointers to the first element in each row (ILU). */
@@ -391,6 +397,12 @@ class CSysMatrix {
391397
*/
392398
void SetValDiagonalZero(void);
393399

400+
/*!
401+
* \brief Performs the memory copy from host to device.
402+
* \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
403+
*/
404+
void HtDTransfer(bool trigger = true) const;
405+
394406
/*!
395407
* \brief Get a pointer to the start of block "ij"
396408
* \param[in] block_i - Row index.
@@ -838,6 +850,49 @@ class CSysMatrix {
838850
void MatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
839851
const CConfig* config) const;
840852

853+
/*!
854+
* \brief Performs the product of a sparse matrix by a CSysVector.
855+
* \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
856+
* \param[in] geometry - Geometrical definition of the problem.
857+
* \param[in] config - Definition of the particular problem.
858+
* \param[out] prod - Result of the product.
859+
*/
860+
void GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
861+
const CConfig* config) const;
862+
863+
/*!
864+
* \brief Performs first step of the LU_SGS Preconditioner building
865+
* \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
866+
* \param[in] geometry - Geometrical definition of the problem.
867+
* \param[in] config - Definition of the particular problem.
868+
* \param[out] prod - Result of the product.
869+
*/
870+
void GPUFirstSymmetricIteration(ScalarType& vec, ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
871+
872+
/*!
873+
* \brief Performs second step of the LU_SGS Preconditioner building
874+
* \param[in] geometry - Geometrical definition of the problem.
875+
* \param[in] config - Definition of the particular problem.
876+
* \param[out] prod - Result of the product.
877+
*/
878+
void GPUSecondSymmetricIteration(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
879+
880+
/*!
881+
* \brief Performs Gaussian Elimination between diagional blocks of the matrix and the prod vector
882+
* \param[in] geometry - Geometrical definition of the problem.
883+
* \param[in] config - Definition of the particular problem.
884+
* \param[out] prod - Result of the product.
885+
*/
886+
void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
887+
888+
/*!
889+
* \brief Multiply CSysVector by the preconditioner all of which are stored on the device
890+
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
891+
* \param[out] prod - Result of the product A*vec.
892+
*/
893+
void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry,
894+
const CConfig* config) const;
895+
841896
/*!
842897
* \brief Build the Jacobi preconditioner.
843898
*/

Common/include/linear_algebra/CSysVector.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "../parallelization/omp_structure.hpp"
3333
#include "../parallelization/vectorization.hpp"
3434
#include "vector_expressions.hpp"
35+
#include "../../include/CConfig.hpp"
3536

3637
/*!
3738
* \brief OpenMP worksharing construct used in CSysVector for loops.
@@ -71,6 +72,8 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
7172
unsigned long nElmDomain = 0; /*!< \brief Total number of elements without Ghost cells. */
7273
unsigned long nVar = 1; /*!< \brief Number of elements in a block. */
7374

75+
ScalarType* d_vec_val = nullptr; /*!< \brief Device Pointer to store the vector values on the GPU. */
76+
7477
/*!
7578
* \brief Generic initialization from a scalar or array.
7679
* \note If val==nullptr vec_val is not initialized, only allocated.
@@ -195,6 +198,29 @@ class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType>
195198
END_CSYSVEC_PARFOR
196199
}
197200

201+
/*!
202+
* \brief Performs the memory copy from host to device.
203+
* \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
204+
*/
205+
void HtDTransfer(bool trigger = true) const;
206+
207+
/*!
208+
* \brief Performs the memory copy from device to host.
209+
* \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
210+
*/
211+
void DtHTransfer(bool trigger = true) const;
212+
213+
/*!
214+
* \brief Sets all the elements of the GPU vector to a certain value
215+
* \param[in] trigger - boolean value that decides whether to conduct the transfer or not. True by default.
216+
*/
217+
void GPUSetVal(ScalarType val, bool trigger = true) const;
218+
219+
/*!
220+
* \brief return device pointer that points to the CSysVector values in GPU memory
221+
*/
222+
inline ScalarType* GetDevicePointer() const { return d_vec_val; }
223+
198224
/*!
199225
* \brief return the number of local elements in the CSysVector
200226
*/
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*!
2+
\file GPUComms.cuh
3+
* \brief Header file containing universal functions that provide basic and essential utilities for other GPU processes
4+
* \author A. Raj
5+
* \version 8.2.0 "Harrier"
6+
*
7+
* SU2 Project Website: https://su2code.github.io
8+
*
9+
* The SU2 Project is maintained by the SU2 Foundation
10+
* (http://su2foundation.org)
11+
*
12+
* Copyright 2012-2024, SU2 Contributors (cf. AUTHORS.md)
13+
*
14+
* SU2 is free software; you can redistribute it and/or
15+
* modify it under the terms of the GNU Lesser General Public
16+
* License as published by the Free Software Foundation; either
17+
* version 2.1 of the License, or (at your option) any later version.
18+
*
19+
* SU2 is distributed in the hope that it will be useful,
20+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
21+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22+
* Lesser General Public License for more details.
23+
*
24+
* You should have received a copy of the GNU Lesser General Public
25+
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
26+
*/
27+
28+
#include<cuda_runtime.h>
29+
#include<iostream>
30+
31+
namespace KernelParameters{
32+
33+
inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); }
34+
35+
static constexpr int MVP_BLOCK_SIZE = 1024;
36+
static constexpr int MVP_WARP_SIZE = 32;
37+
}
38+
/*!
39+
* \brief assert style function that reads return codes after intercepting CUDA API calls.
40+
* It returns the result code and its location if the call is unsuccessful.
41+
* \param[in] code - result code of CUDA function
42+
* \param[in] file - name of file holding the function
43+
* \param[in] line - line containing the function
44+
*/
45+
46+
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){
47+
if (code != cudaSuccess){
48+
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
49+
if (abort) exit(code);
50+
}
51+
}
52+
53+
#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

Common/include/toolboxes/allocation_toolbox.hpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
#include <stdlib.h>
3737
#endif
3838

39+
#ifdef HAVE_CUDA
40+
#include "../linear_algebra/GPUComms.cuh"
41+
#endif
42+
3943
#include <cstring>
4044

4145
#include <cassert>
@@ -90,3 +94,52 @@ inline void aligned_free(T* ptr) noexcept {
9094
}
9195

9296
} // namespace MemoryAllocation
97+
98+
namespace GPUMemoryAllocation {
99+
/*!
100+
* \brief Memory allocation for variables on the GPU.
101+
* \param[in] size in bytes.
102+
* \tparam ZeroInit, initialize memory to 0.
103+
* \return Pointer to memory, always use gpu_free to deallocate.
104+
*/
105+
template <class T, bool ZeroInit = false>
106+
inline T* gpu_alloc(size_t size) noexcept {
107+
void* ptr = nullptr;
108+
109+
#if defined(HAVE_CUDA)
110+
gpuErrChk(cudaMalloc((void**)(&ptr), size));
111+
if (ZeroInit) gpuErrChk(cudaMemset((void*)(ptr), 0.0, size));
112+
#else
113+
return 0;
114+
#endif
115+
116+
return static_cast<T*>(ptr);
117+
}
118+
119+
/*!
120+
* \brief Free memory allocated on the GPU with gpu_alloc.
121+
* \param[in] ptr, pointer to memory we want to release.
122+
*/
123+
template <class T>
124+
inline void gpu_free(T* ptr) noexcept {
125+
#ifdef HAVE_CUDA
126+
gpuErrChk(cudaFree((void*)ptr));
127+
#endif
128+
}
129+
/*!
130+
* \brief Memory allocation for variables on the GPU along with initialization from a source host array.
131+
* \param[in] size in bytes.
132+
* \return Pointer to memory, always use gpu_free to deallocate.
133+
*/
134+
template <class T>
135+
inline T* gpu_alloc_cpy(const T* src_ptr, size_t size) noexcept {
136+
void* ptr = nullptr;
137+
138+
#ifdef HAVE_CUDA
139+
gpuErrChk(cudaMalloc((void**)(&ptr), size));
140+
gpuErrChk(cudaMemcpy((void*)(ptr), (void*)src_ptr, size, cudaMemcpyHostToDevice));
141+
#endif
142+
143+
return static_cast<T*>(ptr);
144+
}
145+
} // namespace GPUMemoryAllocation

Common/src/CConfig.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1149,6 +1149,8 @@ void CConfig::SetConfig_Options() {
11491149

11501150
/*\brief AXISYMMETRIC \n DESCRIPTION: Axisymmetric simulation \n DEFAULT: false \ingroup Config */
11511151
addBoolOption("AXISYMMETRIC", Axisymmetric, false);
1152+
/*\brief ENABLE_CUDA \n DESCRIPTION: GPU Acceleration \n DEFAULT: false \ingroup Config */
1153+
addBoolOption("ENABLE_CUDA", Enable_Cuda, false);
11521154
/* DESCRIPTION: Add the gravity force */
11531155
addBoolOption("GRAVITY_FORCE", GravityForce, false);
11541156
/* DESCRIPTION: Add the Vorticity Confinement term*/

Common/src/linear_algebra/CSysMatrix.cpp

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
6868
MemoryAllocation::aligned_free(matrix);
6969
MemoryAllocation::aligned_free(invM);
7070

71+
if (useCuda) {
72+
GPUMemoryAllocation::gpu_free(d_matrix);
73+
GPUMemoryAllocation::gpu_free(d_row_ptr);
74+
GPUMemoryAllocation::gpu_free(d_col_ind);
75+
}
76+
7177
#ifdef USE_MKL
7278
mkl_jit_destroy(MatrixMatrixProductJitter);
7379
mkl_jit_destroy(MatrixVectorProductJitterBetaZero);
@@ -131,6 +137,30 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
131137
col_ind = csr.innerIdx();
132138
dia_ptr = csr.diagPtr();
133139

140+
/*--- Allocate data. ---*/
141+
auto allocAndInit = [](ScalarType*& ptr, unsigned long num) {
142+
ptr = MemoryAllocation::aligned_alloc<ScalarType, true>(64, num * sizeof(ScalarType));
143+
};
144+
145+
allocAndInit(matrix, nnz * nVar * nEqn);
146+
147+
useCuda = config->GetCUDA();
148+
149+
if (useCuda) {
150+
/*--- Allocate GPU data. ---*/
151+
auto GPUAllocAndInit = [](ScalarType*& ptr, unsigned long num) {
152+
ptr = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(num * sizeof(ScalarType));
153+
};
154+
155+
auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
156+
ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
157+
};
158+
159+
GPUAllocAndInit(d_matrix, nnz * nVar * nEqn);
160+
GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
161+
GPUAllocAndCopy(d_col_ind, col_ind, nnz);
162+
}
163+
134164
if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
135165

136166
if (type == ConnectivityType::FiniteVolume) {
@@ -151,13 +181,6 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
151181
nnz_ilu = csr_ilu.getNumNonZeros();
152182
}
153183

154-
/*--- Allocate data. ---*/
155-
auto allocAndInit = [](ScalarType*& ptr, unsigned long num) {
156-
ptr = MemoryAllocation::aligned_alloc<ScalarType, true>(64, num * sizeof(ScalarType));
157-
};
158-
159-
allocAndInit(matrix, nnz * nVar * nEqn);
160-
161184
/*--- Preconditioners. ---*/
162185

163186
if (ilu_needed) allocAndInit(ILU_matrix, nnz_ilu * nVar * nEqn);

0 commit comments

Comments
 (0)