Skip to content

Commit b7ed530

Browse files
committed
Introduced Memory Wrapper Functions, Error Handling and Constant Block Sizes
1 parent c7777c8 commit b7ed530

File tree

9 files changed

+134
-50
lines changed

9 files changed

+134
-50
lines changed

Common/include/linear_algebra/CMatrixVectorProduct.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,10 @@ class CGpuExecution : public CExecutionPath<ScalarType> {
103103
#ifdef HAVE_CUDA
104104
matrix.GPUMatrixVectorProduct(u, v, geometry, config);
105105
#else
106-
std::cerr << "\nError in launching Matrix-Vector Product Function\n";
107-
std::cerr << "ENABLE_CUDA is set to YES\n";
108-
std::cerr << "Please compile with CUDA options enabled in Meson to access GPU Functions" << std::endl;
106+
SU2_MPI::Error(
107+
"\nError in launching Matrix-Vector Product Function\nENABLE_CUDA is set to YES\nPlease compile with CUDA "
108+
"options enabled in Meson to access GPU Functions",
109+
CURRENT_FUNCTION);
109110
#endif
110111
}
111112
};

Common/include/linear_algebra/CSysMatrix.hpp

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,9 @@ class CSysMatrix {
145145
const unsigned long* col_ind; /*!< \brief Column index for each of the elements in val(). */
146146
const unsigned long* col_ptr; /*!< \brief The transpose of col_ind, pointer to blocks with the same column index. */
147147

148-
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
149-
unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
150-
unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
148+
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
149+
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
150+
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
151151

152152
ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
153153
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -859,9 +859,41 @@ class CSysMatrix {
859859
void GPUMatrixVectorProduct(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
860860
const CConfig* config) const;
861861

862-
void FGMRESMainLoop(std::vector<ScalarType> W, std::vector<ScalarType> Z, su2vector<ScalarType>& g,
863-
su2vector<ScalarType>& sn, CSysVector<ScalarType>& cs, su2vector<ScalarType>& y,
864-
su2vector<ScalarType>& H, int m, CGeometry* geometry, const CConfig* config) const;
862+
/*!
863+
* \brief Performs first step of the LU_SGS Preconditioner building
864+
* \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
865+
* \param[in] geometry - Geometrical definition of the problem.
866+
* \param[in] config - Definition of the particular problem.
867+
* \param[out] prod - Result of the product.
868+
*/
869+
870+
void GPUFirstSymmetricIteration(ScalarType& vec, ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
871+
872+
/*!
873+
* \brief Performs second step of the LU_SGS Preconditioner building
874+
* \param[in] geometry - Geometrical definition of the problem.
875+
* \param[in] config - Definition of the particular problem.
876+
* \param[out] prod - Result of the product.
877+
*/
878+
879+
void GPUSecondSymmetricIteration(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
880+
881+
/*!
882+
* \brief Performs Gaussian Elimination between diagional blocks of the matrix and the prod vector
883+
* \param[in] geometry - Geometrical definition of the problem.
884+
* \param[in] config - Definition of the particular problem.
885+
* \param[out] prod - Result of the product.
886+
*/
887+
888+
void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
889+
890+
/*!
891+
* \brief Multiply CSysVector by the preconditioner all of which are stored on the device
892+
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
893+
* \param[out] prod - Result of the product A*vec.
894+
*/
895+
void GPUComputeLU_SGSPreconditioner(ScalarType& vec, ScalarType& prod, CGeometry* geometry,
896+
const CConfig* config) const;
865897

866898
/*!
867899
* \brief Build the Jacobi preconditioner.

Common/include/linear_algebra/GPUComms.cuh

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@
2828
#include<cuda_runtime.h>
2929
#include"iostream"
3030

31+
namespace KernelParameters{
32+
33+
inline constexpr int round_up_division(const int multiple, int x) { return ((x + multiple - 1) / multiple); }
34+
35+
const int MVP_BLOCK_SIZE = 1024;
36+
const int MVP_WARP_SIZE = 32;
37+
}
3138
/*!
3239
* \brief assert style function that reads return codes after intercepting CUDA API calls.
3340
* It returns the result code and its location if the call is unsuccessful.
@@ -45,4 +52,4 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=t
4552
}
4653
}
4754

48-
#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
55+
#define gpuErrChk(ans) { gpuAssert((ans), __FILE__, __LINE__); }

Common/include/toolboxes/allocation_toolbox.hpp

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@
3636
#include <stdlib.h>
3737
#endif
3838

39+
#ifdef HAVE_CUDA
40+
#include "../linear_algebra/GPUComms.cuh"
41+
#endif
42+
3943
#include <cstring>
4044

4145
#include <cassert>
@@ -90,3 +94,55 @@ inline void aligned_free(T* ptr) noexcept {
9094
}
9195

9296
} // namespace MemoryAllocation
97+
98+
namespace GPUMemoryAllocation {
99+
/*!
100+
* \brief Memory allocation for variables on the GPU.
101+
* \param[in] size in bytes.
102+
* \tparam ZeroInit, initialize memory to 0.
103+
* \return Pointer to memory, always use gpu_free to deallocate.
104+
*/
105+
template <class T, bool ZeroInit = false>
106+
inline T* gpu_alloc(size_t size) noexcept {
107+
void* ptr = nullptr;
108+
109+
#if defined(HAVE_CUDA)
110+
gpuErrChk(cudaMalloc((void**)(&ptr), size));
111+
if (ZeroInit) gpuErrChk(cudaMemset((void*)(ptr), 0.0, size));
112+
#else
113+
return 0;
114+
#endif
115+
116+
return static_cast<T*>(ptr);
117+
}
118+
119+
/*!
120+
* \brief Free memory allocated on the GPU with gpu_alloc.
121+
* \param[in] ptr, pointer to memory we want to release.
122+
*/
123+
template <class T>
124+
inline void gpu_free(T* ptr) noexcept {
125+
#ifdef HAVE_CUDA
126+
gpuErrChk(cudaFree((void*)ptr));
127+
#endif
128+
}
129+
/*!
130+
* \brief Memory allocation for variables on the GPU along with initialization from a source host array.
131+
* \param[in] size in bytes.
132+
* \return Pointer to memory, always use gpu_free to deallocate.
133+
*/
134+
template <class T>
135+
inline T* gpu_alloc_cpy(T* src_ptr, size_t size) noexcept {
136+
void* ptr = nullptr;
137+
138+
#ifdef HAVE_CUDA
139+
gpuErrChk(cudaMalloc((void**)(&ptr), size));
140+
gpuErrChk(cudaMemcpy((void*)(ptr), (void*)src_ptr, size, cudaMemcpyHostToDevice));
141+
;
142+
#else
143+
return 0;
144+
#endif
145+
146+
return static_cast<T*>(ptr);
147+
}
148+
} // namespace GPUMemoryAllocation

Common/src/linear_algebra/CSysMatrix.cpp

Lines changed: 14 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,6 @@
3232

3333
#include <cmath>
3434

35-
#ifdef HAVE_CUDA
36-
#include "../../include/linear_algebra/GPUComms.cuh"
37-
#endif
38-
3935
template <class ScalarType>
4036
CSysMatrix<ScalarType>::CSysMatrix() : rank(SU2_MPI::GetRank()), size(SU2_MPI::GetSize()) {
4137
nPoint = nPointDomain = nVar = nEqn = 0;
@@ -71,11 +67,10 @@ CSysMatrix<ScalarType>::~CSysMatrix() {
7167
MemoryAllocation::aligned_free(ILU_matrix);
7268
MemoryAllocation::aligned_free(matrix);
7369
MemoryAllocation::aligned_free(invM);
74-
#ifdef HAVE_CUDA
75-
cudaFree(d_matrix);
76-
cudaFree(d_row_ptr);
77-
cudaFree(d_col_ind);
78-
#endif
70+
71+
GPUMemoryAllocation::gpu_free(d_matrix);
72+
GPUMemoryAllocation::gpu_free(d_row_ptr);
73+
GPUMemoryAllocation::gpu_free(d_col_ind);
7974

8075
#ifdef USE_MKL
8176
mkl_jit_destroy(MatrixMatrixProductJitter);
@@ -147,15 +142,17 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
147142

148143
allocAndInit(matrix, nnz * nVar * nEqn);
149144

150-
#if defined(HAVE_CUDA)
151-
gpuErrChk(cudaMalloc((void**)(&d_row_ptr), (sizeof(row_ptr) * (nPointDomain + 1.0))));
152-
gpuErrChk(cudaMalloc((void**)(&d_col_ind), (sizeof(col_ind) * nnz)));
153-
gpuErrChk(cudaMalloc((void**)(&d_matrix), (sizeof(ScalarType) * nnz * nVar * nEqn)));
145+
auto GPUAllocAndInit = [](ScalarType*& ptr, unsigned long num) {
146+
ptr = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(num * sizeof(ScalarType));
147+
};
154148

155-
gpuErrChk(
156-
cudaMemcpy((void*)(d_row_ptr), (void*)row_ptr, (sizeof(row_ptr) * (nPointDomain + 1.0)), cudaMemcpyHostToDevice));
157-
gpuErrChk(cudaMemcpy((void*)(d_col_ind), (void*)col_ind, (sizeof(col_ind)) * nnz, cudaMemcpyHostToDevice));
158-
#endif
149+
auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
150+
ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
151+
};
152+
153+
GPUAllocAndInit(d_matrix, nnz * nVar * nEqn);
154+
GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
155+
GPUAllocAndCopy(d_col_ind, col_ind, nnz);
159156

160157
if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();
161158

Common/src/linear_algebra/GPUMatrix.cu renamed to Common/src/linear_algebra/CSysMatrixGPU.cu

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*!
2-
* \file GPUMatrix.cu
2+
* \file CSysMatrixGPU.cu
33
* \brief Implementations of Kernels and Functions for Matrix Operations on the GPU
44
* \author A. Raj
55
* \version 8.1.0 "Harrier"
@@ -29,7 +29,7 @@
2929
#include "../../include/linear_algebra/GPUComms.cuh"
3030

3131
template<typename matrixType, typename vectorType>
32-
__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, unsigned long* d_row_ptr, unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn)
32+
__global__ void GPUMatrixVectorProductAdd(matrixType* matrix, vectorType* vec, vectorType* prod, const unsigned long* d_row_ptr, const unsigned long* d_col_ind, unsigned long nPointDomain, unsigned long nVar, unsigned long nEqn)
3333
{
3434
int row = (blockIdx.x * blockDim.x + threadIdx.x)/32;
3535
int threadNo = threadIdx.x%32;
@@ -74,10 +74,9 @@ void CSysMatrix<ScalarType>::GPUMatrixVectorProduct(const CSysVector<ScalarType>
7474
vec.HtDTransfer();
7575
prod.GPUSetVal(0.0);
7676

77-
dim3 blockDim(1024,1,1);
78-
double gridx = (double) nPointDomain/32.0;
79-
gridx = double(ceil(gridx));
80-
dim3 gridDim(gridx, 1.0, 1.0);
77+
dim3 blockDim(KernelParameters::MVP_BLOCK_SIZE,1,1);
78+
int gridx = KernelParameters::round_up_division(KernelParameters::MVP_WARP_SIZE, nPointDomain);
79+
dim3 gridDim(gridx, 1, 1);
8180

8281
GPUMatrixVectorProductAdd<<<gridDim, blockDim>>>(d_matrix, d_vec, d_prod, d_row_ptr, d_col_ind, nPointDomain, nVar, nEqn);
8382
gpuErrChk( cudaPeekAtLastError() );

Common/src/linear_algebra/CSysVector.cpp

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,6 @@
2828
#include "../../include/linear_algebra/CSysVector.hpp"
2929
#include "../../include/toolboxes/allocation_toolbox.hpp"
3030

31-
#ifdef HAVE_CUDA
32-
#include "../../include/linear_algebra/GPUComms.cuh"
33-
#endif
34-
3531
template <class ScalarType>
3632
void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar,
3733
const ScalarType* val, bool valIsArray, bool errorIfParallel) {
@@ -56,10 +52,7 @@ void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numB
5652

5753
if (vec_val == nullptr) vec_val = MemoryAllocation::aligned_alloc<ScalarType, true>(64, nElm * sizeof(ScalarType));
5854

59-
#if defined(HAVE_CUDA)
60-
gpuErrChk(cudaMalloc((void**)(&d_vec_val), (sizeof(ScalarType) * nElm)));
61-
gpuErrChk(cudaMemset((void*)d_vec_val, 0.0, (sizeof(ScalarType) * nElm)));
62-
#endif
55+
d_vec_val = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(nElm * sizeof(ScalarType));
6356

6457
if (val != nullptr) {
6558
if (!valIsArray) {
@@ -75,9 +68,8 @@ CSysVector<ScalarType>::~CSysVector() {
7568
if (!std::is_trivial<ScalarType>::value)
7669
for (auto i = 0ul; i < nElm; i++) vec_val[i].~ScalarType();
7770
MemoryAllocation::aligned_free(vec_val);
78-
#ifdef HAVE_CUDA
79-
cudaFree(d_vec_val);
80-
#endif
71+
72+
GPUMemoryAllocation::gpu_free(d_vec_val);
8173
}
8274

8375
/*--- Explicit instantiations ---*/

Common/src/linear_algebra/GPUVector.cu renamed to Common/src/linear_algebra/CSysVectorGPU.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*!
2-
* \file GPUVector.cu
2+
* \file CSysVectorGPU.cu
33
* \brief Implementations of Kernels and Functions for Vector Operations on the GPU
44
* \author A. Raj
55
* \version 8.1.0 "Harrier"
@@ -25,8 +25,8 @@
2525
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
2626
*/
2727

28-
#include "../../include/linear_algebra/CSysVector.hpp"
29-
#include "../../include/linear_algebra/GPUComms.cuh"
28+
#include "../../include/linear_algebra/CSysVector.hpp"
29+
#include "../../include/linear_algebra/GPUComms.cuh"
3030

3131
template<class ScalarType>
3232
void CSysVector<ScalarType>::HtDTransfer(bool trigger) const
@@ -46,4 +46,4 @@ void CSysVector<ScalarType>::GPUSetVal(ScalarType val, bool trigger) const
4646
if(trigger) gpuErrChk(cudaMemset((void*)(d_vec_val), val, (sizeof(ScalarType)*nElm)));
4747
}
4848

49-
template class CSysVector<su2double>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.
49+
template class CSysVector<su2double>; //This is a temporary fix for invalid instantiations due to separating the member function from the header file the class is defined in. Will try to rectify it in coming commits.

Common/src/linear_algebra/meson.build

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ common_src += files(['CSysSolve_b.cpp',
22
'CSysSolve.cpp',
33
'CSysVector.cpp',
44
'CSysMatrix.cpp',
5-
'GPUMatrix.cu',
6-
'GPUVector.cu',
5+
'CSysMatrixGPU.cu',
6+
'CSysVectorGPU.cu',
77
'CPastixWrapper.cpp',
88
'blas_structure.cpp'])

0 commit comments

Comments
 (0)