Skip to content

Commit 0372099

Browse files
committed
Working GPU LU_SGS Preconditioner Port
1 parent 9bfeff9 commit 0372099

File tree

8 files changed

+217
-137
lines changed

8 files changed

+217
-137
lines changed

Common/include/geometry/CGeometry.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,11 +260,12 @@ class CGeometry {
260260
unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
261261
on all prior ranks in the linear partitioning. */
262262

263-
unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
264-
partitioning algorithms. */
263+
unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
264+
partitioning algorithms. */
265265
unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
266266
vector<unsigned long>
267267
partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
268+
vector<unsigned long> chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */
268269

269270
/*--- Data structures for point-to-point MPI communications. ---*/
270271

Common/include/geometry/CPhysicalGeometry.hpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,9 +152,6 @@ class CPhysicalGeometry final : public CGeometry {
152152
* \brief Divide the graph produced by the matrix into parallel partitions.
153153
* \param[in] config - Definition of the particular problem.
154154
* \param[in] pointList - Ordered list of points in the mesh.
155-
* \param[in] numPartitions - Returns the number of parallel partitions created by the algorithm.
156-
* \param[in] indexOffsets - Vector array that represents the starting index of each partition in the reordered point
157-
* list.
158155
*/
159156
template <class ScalarType>
160157
void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);

Common/include/linear_algebra/CGraphPartitioning.hpp

Lines changed: 84 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/*!
22
* \file CGraphPartitioning.hpp
3-
* \brief Headers for the classes realted to the algorithms that are used
3+
* \brief Headers for the classes realted to the algorithms that are used
44
to divide the matrix acyclic graph into parallel partitions.
55
* \author A. Raj
66
* \version 8.2.0 "Harrier"
@@ -26,6 +26,8 @@
2626
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
2727
*/
2828

29+
#pragma once
30+
2931
#include "../CConfig.hpp"
3032
#include "../geometry/CGeometry.hpp"
3133
#include "../geometry/dual_grid/CPoint.hpp"
@@ -35,65 +37,107 @@
3537
* \brief Abstract base class for defining graph partitioning algorithms
3638
* \author A. Raj
3739
*
38-
* In order to use certain parallel algorithms in the solution process -
39-
* whether with linear solvers or preconditioners - we require the matrix
40-
* to be partitioned into certain parallel divisions. These maybe in the form
41-
* of levels, blocks, colors and so on. Since a number of different algorithms
42-
* can be used to split the graph, we've introduced a base class containing the
43-
* "Partition" member function from which child classes of the specific
44-
* algorithm can be derived. Currently, we are only using direct declarations
40+
* In order to use certain parallel algorithms in the solution process -
41+
* whether with linear solvers or preconditioners - we require the matrix
42+
* to be partitioned into certain parallel divisions. These maybe in the form
43+
* of levels, blocks, colors and so on. Since a number of different algorithms
44+
* can be used to split the graph, we've introduced a base class containing the
45+
* "Partition" member function from which child classes of the specific
46+
* algorithm can be derived. Currently, we are only using direct declarations
4547
* of the derived classes in the code. However, this method was chosen as it
46-
* allows us to pass different child class algorithms to a single implementation
48+
* allows us to pass different child class algorithms to a single implementation
4749
* of the function that requires it - similar to the CMatrixVectorProduct class.
4850
*/
4951

5052
template <class ScalarType>
5153

5254
class CGraphPartitioning {
53-
5455
public:
5556
virtual ~CGraphPartitioning() = 0;
56-
virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets) = 0;
57+
virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
58+
vector<ScalarType>& chainPtr) = 0;
5759
};
5860
template <class ScalarType>
59-
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
61+
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
6062

6163
template <class ScalarType>
6264

63-
class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
64-
65+
class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
6566
private:
6667
ScalarType nPointDomain;
6768
CPoint* nodes;
68-
69+
6970
public:
7071
ScalarType nLevels;
7172
ScalarType maxLevelWidth;
7273
vector<ScalarType> levels;
7374

74-
/*!
75+
/*!
7576
* \brief constructor of the class
7677
* \param[in] nPointDomain_ref - number of points associated with the problem
77-
* \param[in] nodes - represents the relationships between the points
78+
* \param[in] nodes_ref - represents the relationships between the points
7879
*/
79-
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
80-
: nPointDomain(nPointDomain_ref), nodes(nodes_ref)
81-
{ nLevels = 0ul; maxLevelWidth = 0ul; }
80+
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
81+
: nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
82+
nLevels = 0ul;
83+
maxLevelWidth = 0ul;
84+
}
85+
86+
CLevelScheduling() = delete; // Removing default constructor
8287

83-
CLevelScheduling() = delete; // Removing default constructor
88+
/*!
89+
* \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
90+
* \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
91+
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
92+
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per block.
93+
*/
94+
void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, int rowsPerBlock) {
95+
ScalarType levelWidth = 0;
96+
unsigned short chainLength = chainPtr.capacity();
8497

85-
void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets)
86-
{
98+
/*This is not a magic number. We are simply initializing
99+
the point array with its first element that is always zero.*/
100+
chainPtr.push_back(0);
101+
102+
for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
103+
levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
104+
maxLevelWidth = std::max(levelWidth, maxLevelWidth);
105+
106+
if (levelWidth > rowsPerBlock) {
107+
if (chainPtr.back() != iLevel) {
108+
chainPtr.push_back(iLevel);
109+
}
110+
111+
chainPtr.push_back(iLevel + 1);
112+
}
113+
}
114+
115+
chainPtr.push_back(nLevels);
116+
}
117+
118+
/*!
119+
* \brief Reorders the points according to the levels
120+
* \param[in] pointList - Ordered array that contains the list of all mesh points.
121+
* \param[in] inversePointList - Array utilized to access the index of each point in pointList.
122+
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
123+
*/
124+
void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {
87125
for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
88126
const auto globalPoint = pointList[localPoint];
89127
inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
90128
}
91-
129+
92130
pointList = std::move(inversePointList);
93131
}
94132

95-
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets) override
96-
{
133+
/*!
134+
* \brief Reorders the points according to the levels
135+
* \param[in] pointList - Ordered array that contains the list of all mesh points.
136+
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
137+
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
138+
*/
139+
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets,
140+
vector<ScalarType>& chainPtr) override {
97141
vector<ScalarType> inversePointList;
98142
inversePointList.reserve(nPointDomain);
99143
levels.reserve(nPointDomain);
@@ -111,29 +155,34 @@ class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
111155

112156
for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
113157
const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);
114-
158+
115159
if (adjGlobalPoint < nPointDomain) {
116160
const auto adjLocalPoint = inversePointList[adjGlobalPoint];
117-
161+
118162
if (adjLocalPoint < localPoint) {
119-
levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
163+
levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
164+
}
120165
}
121-
}
122166
}
123167

124168
nLevels = std::max(nLevels, levels[localPoint] + 1);
125-
}
169+
}
126170

127171
levelOffsets.resize(nLevels + 1);
128-
for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) ++levelOffsets[levels[iPoint] + 1];
172+
for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
173+
++levelOffsets[levels[iPoint] + 1];
174+
}
129175

130176
for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
131177
levelOffsets[iLevel] += levelOffsets[iLevel - 1];
132178
}
133179

134-
for(auto elem = levelOffsets.begin(); elem != (levelOffsets.end() - 1); elem++) maxLevelWidth = std::max(*(elem+1) - *elem, maxLevelWidth);
135-
136180
Reorder(pointList, inversePointList, levelOffsets);
181+
182+
#ifdef HAVE_CUDA
183+
CalculateChain(levelOffsets, chainPtr, 20);
184+
#elif
185+
chainPtr = NULL;
186+
#endif
137187
}
138188
};
139-

Common/include/linear_algebra/CSysMatrix.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,8 @@ class CSysMatrix {
150150
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
151151
const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
152152
unsigned long* d_partition_offsets;
153-
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
154-
Mainly used to conditionally free GPU memory in the class destructor. */
153+
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
154+
Mainly used to conditionally free GPU memory in the class destructor. */
155155

156156
ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
157157
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */

Common/include/linear_algebra/GPUComms.cuh

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,48 +28,72 @@
2828
#include<cuda_runtime.h>
2929
#include<iostream>
3030

31+
/*!< \brief Namespace that contains variables and helper functions that are
32+
utilized to launch CUDA Kernels. */
3133
namespace kernelParameters{
3234

33-
/*Returns the rounded up value of the decimal quotient to the next integer (in all cases)*/
35+
36+
/*!
37+
* \brief Returns the rounded up value of the decimal quotient to the next integer (in all cases).
38+
*/
3439
inline constexpr int rounded_up_division(const int divisor, int dividend) { return ((dividend + divisor - 1) / divisor); }
3540

36-
/*Returns the rounded down value of the decimal quotient to the previous integer (in all cases)*/
41+
/*!
42+
* \brief Returns the rounded down value of the decimal quotient to the previous integer (in all cases).
43+
*/
3744
inline constexpr int rounded_down_division(const int divisor, int dividend) { return ((dividend - divisor + 1) / divisor); }
3845

39-
static constexpr short MVP_BLOCK_SIZE = 256;
40-
static constexpr short MVP_WARP_SIZE = 32;
46+
static constexpr short BLOCK_SIZE = 640;
47+
static constexpr short WARP_SIZE = 32;
48+
static constexpr short ROWS_PER_BLOCK = rounded_up_division(WARP_SIZE, BLOCK_SIZE);
49+
4150

4251
};
4352

53+
/*!< \brief Structure containing information related to the Jacobian Matrix
54+
which is utilized by any launched Kernel. */
4455
struct matrixParameters{
4556

4657
public:
47-
unsigned long totalRows;
48-
unsigned short blockRowSize;
49-
unsigned short blockColSize;
50-
unsigned long nPartition;
51-
unsigned short blockSize;
52-
unsigned short rowsPerBlock;
53-
unsigned short activeThreads;
58+
unsigned long totalRows; /*!< \brief Contains the total number of rows of the Jacbian Matrix. */
59+
unsigned short blockRowSize; /*!< \brief Contains the row dimensions of the blocks of the Jacobian Matrix. */
60+
unsigned short blockColSize; /*!< \brief Contains the column dimensions of the blocks of the Jacobian Matrix. */
61+
unsigned int nChainStart; /*!< \brief Starting partition of the current chain. */
62+
unsigned int nChainEnd; /*!< \brief Ending partition of the current chain. */
63+
unsigned short blockSize; /*!< \brief Contains the total number of elements in each block of the Jacbian Matrix. */
64+
unsigned short activeThreads; /*!< \brief Cotains the number of active threads per iteration during MVP - depending on the
65+
dimensions of the Jacbian Matrix. */
5466

5567
matrixParameters(unsigned long nPointDomain, unsigned long nEqn, unsigned long nVar, unsigned long nPartitions){
5668
totalRows = nPointDomain;
5769
blockRowSize = nEqn;
5870
blockColSize = nVar;
59-
nPartition = nPartitions;
71+
nChainStart = 0;
72+
nChainEnd = 0;
6073
blockSize = nVar * nEqn;
61-
rowsPerBlock = kernelParameters::rounded_up_division(kernelParameters::MVP_WARP_SIZE, kernelParameters::MVP_BLOCK_SIZE);
62-
activeThreads = nVar * (kernelParameters::MVP_WARP_SIZE/nVar);
74+
activeThreads = nVar * (kernelParameters::WARP_SIZE/nVar);
6375
}
6476

77+
/*!
78+
* \brief Returns the memory index in the shared memory array used by the Symmetric Iteration Kernels.
79+
*/
6580
__device__ unsigned short shrdMemIndex(unsigned short localRow, unsigned short threadNo){
6681
return (localRow * blockSize + threadNo);
6782
}
6883

84+
/*!
85+
* \brief Returns a boolean value to check whether the row is under the total number of rows and if the
86+
* thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
87+
*/
6988
__device__ bool validAccess(unsigned long row, unsigned short threadNo, unsigned short threadLimit){
7089
return (row<totalRows && threadNo<threadLimit);
7190
}
7291

92+
/*!
93+
* \brief Returns a boolean value to check whether the row is part of the parallel partition being executed and if the
94+
* thread number is within a user-specified thread limit. This is to avoid illegal memory accesses.
95+
* \param[in] rowInPartition - Represents a boolean that indicates the presence/absence of the row in the partition.
96+
*/
7397
__device__ bool validParallelAccess(bool rowInPartition, unsigned short threadNo, unsigned short threadLimit){
7498
return (rowInPartition && threadNo<threadLimit);
7599
}

Common/src/geometry/CPhysicalGeometry.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ void CPhysicalGeometry::PartitionGraph(const CConfig* config, vector<ScalarType>
708708
switch (KindAlgorithm) {
709709
case LEVEL_SCHEDULING:
710710
auto levelSchedule = CLevelScheduling<ScalarType>(nPointDomain, nodes);
711-
levelSchedule.Partition(pointList, partitionOffsets);
711+
levelSchedule.Partition(pointList, partitionOffsets, chainPtr);
712712
nPartition = levelSchedule.nLevels;
713713
maxPartitionSize = levelSchedule.maxLevelWidth;
714714
break;
@@ -4558,6 +4558,7 @@ void CPhysicalGeometry::SetRCM_Ordering(CConfig* config) {
45584558
if (!status) SU2_MPI::Error("RCM ordering failed", CURRENT_FUNCTION);
45594559
}
45604560

4561+
/*Partition graph into parallel constituents for GPU Operations*/
45614562
if (config->GetCUDA()) PartitionGraph(config, Result);
45624563

45634564
/*--- Add the MPI points ---*/

Common/src/linear_algebra/CSysMatrix.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
153153
ptr = GPUMemoryAllocation::gpu_alloc<ScalarType, true>(num * sizeof(ScalarType));
154154
};
155155

156-
auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
156+
auto GPUAllocAndCopy = [](const unsigned long*& ptr, const unsigned long*& src_ptr, unsigned long num) {
157157
ptr = GPUMemoryAllocation::gpu_alloc_cpy<const unsigned long>(src_ptr, num * sizeof(const unsigned long));
158158
};
159159

@@ -165,8 +165,7 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
165165
GPUAllocAndCopy(d_row_ptr, row_ptr, (nPointDomain + 1.0));
166166
GPUAllocAndCopy(d_col_ind, col_ind, nnz);
167167
GPUAllocAndCopy(d_dia_ptr, dia_ptr, nPointDomain);
168-
GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition);
169-
168+
GPUVectorAllocAndCopy(d_partition_offsets, geometry->partitionOffsets, geometry->nPartition + 1);
170169
}
171170

172171
if (needTranspPtr) col_ptr = geometry->GetTransposeSparsePatternMap(type).data();

0 commit comments

Comments
 (0)