Skip to content

Commit 661c9b8

Browse files
committed
LU_SGS Preconditioner Port
2 parents 9dd3028 + 52b90b6 commit 661c9b8

File tree

13 files changed

+578
-129
lines changed

13 files changed

+578
-129
lines changed

Common/include/CConfig.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,7 @@ class CConfig {
521521
Kind_Gradient_Method_Recon, /*!< \brief Numerical method for computation of spatial gradients used for upwind reconstruction. */
522522
Kind_Deform_Linear_Solver, /*!< Numerical method to deform the grid */
523523
Kind_Deform_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
524+
Kind_Graph_Part_Algo, /*!< \brief Algorithm for parallel partitioning of the matrix graph. */
524525
Kind_Linear_Solver, /*!< \brief Numerical solver for the implicit scheme. */
525526
Kind_Linear_Solver_Prec, /*!< \brief Preconditioner of the linear solver. */
526527
Kind_DiscAdj_Linear_Solver, /*!< \brief Linear solver for the discrete adjoint system. */
@@ -631,6 +632,8 @@ class CConfig {
631632
unsigned long Linear_Solver_Restart_Frequency; /*!< \brief Restart frequency of the linear solver for the implicit formulation. */
632633
unsigned long Linear_Solver_Prec_Threads; /*!< \brief Number of threads per rank for ILU and LU_SGS preconditioners. */
633634
unsigned short Linear_Solver_ILU_n; /*!< \brief ILU fill=in level. */
635+
unsigned short Cuda_Block_Size; /*!< \brief User-specified value for the X-Axis dimension of thread blocks
636+
that are deployed by the CUDA Kernels. */
634637
su2double SemiSpan; /*!< \brief Wing Semi span. */
635638
su2double Roe_Kappa; /*!< \brief Relaxation of the Roe scheme. */
636639
su2double Relaxation_Factor_Adjoint; /*!< \brief Relaxation coefficient for variable updates of adjoint solvers. */
@@ -4136,6 +4139,12 @@ class CConfig {
41364139
*/
41374140
bool GetLeastSquaresRequired(void) const { return LeastSquaresRequired; }
41384141

4142+
/*!
4143+
* \brief Get the type of algorithm used for partitioning the matrix graph.
4144+
* \return Algorithm that divides the matrix into partitions that are executed parallely.
4145+
*/
4146+
unsigned short GetKind_Graph_Part_Algo(void) const { return Kind_Graph_Part_Algo; }
4147+
41394148
/*!
41404149
* \brief Get the kind of solver for the implicit solver.
41414150
* \return Numerical solver for implicit formulation (solving the linear system).
@@ -4197,6 +4206,18 @@ class CConfig {
41974206
*/
41984207
su2double GetLinear_Solver_Smoother_Relaxation(void) const { return Linear_Solver_Smoother_Relaxation; }
41994208

4209+
/*!
4210+
* \brief Get thread block dimensions (X-axis) being used by the CUDA Kernels.
4211+
* \return Thread block dimensions (X-axis) being used by the CUDA Kernels.
4212+
*/
4213+
unsigned short GetCuda_Block_Size(void) const { return Cuda_Block_Size; }
4214+
4215+
/*!
4216+
* \brief Get the number of matrix rows assigned per CUDA Block.
4217+
* \return The number of matrix rows assigned per CUDA Block.
4218+
*/
4219+
unsigned short GetRows_Per_Cuda_Block(void) const { return cudaKernelParameters::rounded_up_division(cudaKernelParameters::CUDA_WARP_SIZE, Cuda_Block_Size); }
4220+
42004221
/*!
42014222
* \brief Get the relaxation factor for solution updates of adjoint solvers.
42024223
*/

Common/include/geometry/CGeometry.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,13 @@ class CGeometry {
260260
unsigned long* nPointCumulative{nullptr}; /*!< \brief Cumulative storage array containing the total number of points
261261
on all prior ranks in the linear partitioning. */
262262

263+
unsigned long nPartition; /*!< \brief Number of divisions of the matrix graph during execution of parallel
264+
partitioning algorithms. */
265+
unsigned long maxPartitionSize; /*!< \brief Size of the level with the maximum number of elements. */
266+
vector<unsigned long>
267+
partitionOffsets; /*!< \brief Vector array containing the indices at which different parallel partitions begin. */
268+
vector<unsigned long> chainPtr; /*!< \brief Vector array containing distribution of levels into chains. */
269+
263270
/*--- Data structures for point-to-point MPI communications. ---*/
264271

265272
int maxCountPerPoint{0}; /*!< \brief Maximum number of pieces of data sent per vertex in point-to-point comms. */

Common/include/geometry/CPhysicalGeometry.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,14 @@ class CPhysicalGeometry final : public CGeometry {
148148
*/
149149
void DistributeColoring(const CConfig* config, CGeometry* geometry);
150150

151+
/*!
152+
* \brief Divide the graph produced by the matrix into parallel partitions.
153+
* \param[in] config - Definition of the particular problem.
154+
* \param[in] pointList - Ordered list of points in the mesh.
155+
*/
156+
template <class ScalarType>
157+
void PartitionGraph(const CConfig* config, vector<ScalarType>& pointList);
158+
151159
/*!
152160
* \brief Distribute the grid points, including ghost points, across all ranks based on a ParMETIS coloring.
153161
* \param[in] config - Definition of the particular problem.
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
/*!
2+
* \file CGraphPartitioning.hpp
3+
* \brief Headers for the classes realted to the algorithms that are used
4+
to divide the matrix acyclic graph into parallel partitions.
5+
* \author A. Raj
6+
* \version 8.2.0 "Harrier"
7+
*
8+
* SU2 Project Website: https://su2code.github.io
9+
*
10+
* The SU2 Project is maintained by the SU2 Foundation
11+
* (http://su2foundation.org)
12+
*
13+
* Copyright 2012-2025, SU2 Contributors (cf. AUTHORS.md)
14+
*
15+
* SU2 is free software; you can redistribute it and/or
16+
* modify it under the terms of the GNU Lesser General Public
17+
* License as published by the Free Software Foundation; either
18+
* version 2.1 of the License, or (at your option) any later version.
19+
*
20+
* SU2 is distributed in the hope that it will be useful,
21+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
22+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23+
* Lesser General Public License for more details.
24+
*
25+
* You should have received a copy of the GNU Lesser General Public
26+
* License along with SU2. If not, see <http://www.gnu.org/licenses/>.
27+
*/
28+
29+
#pragma once
30+
31+
#include "../geometry/CGeometry.hpp"
32+
33+
/*!
34+
* \class CGraphPartitioning
35+
* \brief Abstract base class for defining graph partitioning algorithms.
36+
* \author A. Raj
37+
*
38+
* In order to use certain parallel algorithms in the solution process -
39+
* whether with linear solvers or preconditioners - we require the matrix
40+
* to be partitioned into certain parallel divisions. These maybe in the form
41+
* of levels, blocks, colors and so on. Since a number of different algorithms
42+
* can be used to split the graph, we've introduced a base class containing the
43+
* "Partition" member function from which child classes of the specific
44+
* algorithm can be derived. Currently, we are only using direct declarations
45+
* of the derived classes in the code. However, this method was chosen as it
46+
* allows us to pass different child class algorithms to a single implementation
47+
* of the function that requires it - similar to the CMatrixVectorProduct class.
48+
*/
49+
50+
template <class ScalarType>
51+
52+
class CGraphPartitioning {
53+
public:
54+
virtual ~CGraphPartitioning() = 0;
55+
virtual void Partition(vector<ScalarType>& pointList, vector<ScalarType>& partitionOffsets,
56+
vector<ScalarType>& chainPtr, unsigned short chainLimit) = 0;
57+
};
58+
template <class ScalarType>
59+
CGraphPartitioning<ScalarType>::~CGraphPartitioning() {}
60+
61+
template <class ScalarType>
62+
63+
class CLevelScheduling final : public CGraphPartitioning<ScalarType> {
64+
private:
65+
ScalarType nPointDomain;
66+
CPoint* nodes;
67+
68+
public:
69+
ScalarType nLevels;
70+
ScalarType maxLevelWidth;
71+
vector<ScalarType> levels;
72+
73+
/*!
74+
* \brief constructor of the class
75+
* \param[in] nPointDomain_ref - number of points associated with the problem
76+
* \param[in] nodes_ref - represents the relationships between the points
77+
*/
78+
inline CLevelScheduling<ScalarType>(ScalarType nPointDomain_ref, CPoint* nodes_ref)
79+
: nPointDomain(nPointDomain_ref), nodes(nodes_ref) {
80+
nLevels = 0ul;
81+
maxLevelWidth = 0ul;
82+
}
83+
84+
CLevelScheduling() = delete; // Removing default constructor
85+
86+
/*!
87+
* \brief Divides the levels into groups of chains depending on the preset GPU block and warp size.
88+
* \param[in] levelOffsets - Represents the vector array containing the ordered list of starting rows of each level.
89+
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
90+
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
91+
*/
92+
void CalculateChain(vector<ScalarType> levelOffsets, vector<ScalarType>& chainPtr, unsigned short rowsPerBlock) {
93+
ScalarType levelWidth = 0;
94+
unsigned short chainLength = chainPtr.capacity();
95+
96+
/*This is not a magic number. We are simply initializing
97+
the point array with its first element that is always zero.*/
98+
chainPtr.push_back(0);
99+
100+
for (ScalarType iLevel = 0ul; iLevel < nLevels; iLevel++) {
101+
levelWidth = levelOffsets[iLevel + 1] - levelOffsets[iLevel];
102+
maxLevelWidth = std::max(levelWidth, maxLevelWidth);
103+
104+
if (levelWidth > rowsPerBlock) {
105+
if (chainPtr.back() != iLevel) {
106+
chainPtr.push_back(iLevel);
107+
}
108+
109+
chainPtr.push_back(iLevel + 1);
110+
}
111+
}
112+
113+
chainPtr.push_back(nLevels);
114+
}
115+
116+
/*!
117+
* \brief Reorders the points according to the levels
118+
* \param[in] pointList - Ordered array that contains the list of all mesh points.
119+
* \param[in] inversePointList - Array utilized to access the index of each point in pointList.
120+
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
121+
*/
122+
void Reorder(vector<ScalarType>& pointList, vector<ScalarType>& inversePointList, vector<ScalarType> levelOffsets) {
123+
for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
124+
const auto globalPoint = pointList[localPoint];
125+
inversePointList[levelOffsets[levels[localPoint]]++] = globalPoint;
126+
}
127+
128+
pointList = std::move(inversePointList);
129+
}
130+
131+
/*!
132+
* \brief Reorders the points according to the levels
133+
* \param[in] pointList - Ordered array that contains the list of all mesh points.
134+
* \param[in] levelOffsets - Vector array containing the ordered list of starting rows of each level.
135+
* \param[in] chainPtr - Represents the vector array containing the ordered list of starting levels of each chain.
136+
* \param[in] rowsPerBlock - Represents the maximum number of rows that can be accomodated per CUDA block.
137+
*/
138+
void Partition(vector<ScalarType>& pointList, vector<ScalarType>& levelOffsets, vector<ScalarType>& chainPtr,
139+
unsigned short rowsPerBlock) override {
140+
vector<ScalarType> inversePointList;
141+
inversePointList.reserve(nPointDomain);
142+
levels.reserve(nPointDomain);
143+
144+
for (auto point = 0ul; point < nPointDomain; point++) {
145+
inversePointList[pointList[point]] = point;
146+
levels[point] = 0;
147+
}
148+
149+
// Local Point - Ordering of the points post the RCM ordering
150+
// Global Point - Original order of the points before the RCM ordering
151+
152+
for (auto localPoint = 0ul; localPoint < nPointDomain; ++localPoint) {
153+
const auto globalPoint = pointList[localPoint];
154+
155+
for (auto adjPoints = 0u; adjPoints < nodes->GetnPoint(globalPoint); adjPoints++) {
156+
const auto adjGlobalPoint = nodes->GetPoint(globalPoint, adjPoints);
157+
158+
if (adjGlobalPoint < nPointDomain) {
159+
const auto adjLocalPoint = inversePointList[adjGlobalPoint];
160+
161+
if (adjLocalPoint < localPoint) {
162+
levels[localPoint] = std::max(levels[localPoint], levels[adjLocalPoint] + 1);
163+
}
164+
}
165+
}
166+
167+
nLevels = std::max(nLevels, levels[localPoint] + 1);
168+
}
169+
170+
levelOffsets.resize(nLevels + 1);
171+
for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
172+
++levelOffsets[levels[iPoint] + 1];
173+
}
174+
175+
for (auto iLevel = 2ul; iLevel <= nLevels; ++iLevel) {
176+
levelOffsets[iLevel] += levelOffsets[iLevel - 1];
177+
}
178+
179+
Reorder(pointList, inversePointList, levelOffsets);
180+
181+
CalculateChain(levelOffsets, chainPtr, rowsPerBlock);
182+
}
183+
};

Common/include/linear_algebra/CPreconditioner.hpp

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -205,17 +205,15 @@ class CLU_SGSPreconditioner final : public CPreconditioner<ScalarType> {
205205
* \param[out] v - CSysVector that is the result of the preconditioning.
206206
*/
207207
inline void operator()(const CSysVector<ScalarType>& u, CSysVector<ScalarType>& v) const override {
208-
#ifdef HAVE_CUDA
209-
if(config->GetCUDA())
210-
{
208+
#ifdef HAVE_CUDA
209+
if (config->GetCUDA()) {
211210
sparse_matrix.GPUComputeLU_SGSPreconditioner(u, v, geometry, config);
212-
}
213-
else {
211+
} else {
214212
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
215213
}
216-
#else
214+
#else
217215
sparse_matrix.ComputeLU_SGSPreconditioner(u, v, geometry, config);
218-
#endif
216+
#endif
219217
}
220218
};
221219

Common/include/linear_algebra/CSysMatrix.hpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,15 @@ class CSysMatrix {
148148
ScalarType* d_matrix; /*!< \brief Device Pointer to store the matrix values on the GPU. */
149149
const unsigned long* d_row_ptr; /*!< \brief Device Pointers to the first element in each row. */
150150
const unsigned long* d_col_ind; /*!< \brief Device Column index for each of the elements in val(). */
151+
<<<<<<< HEAD
151152
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
152153
Mainly used to conditionally free GPU memory in the class destructor. */
154+
=======
155+
const unsigned long* d_dia_ptr; /*!< \brief Device Column index for each of the elements in val(). */
156+
unsigned long* d_partition_offsets;
157+
bool useCuda; /*!< \brief Boolean that indicates whether user has enabled CUDA or not.
158+
Mainly used to conditionally free GPU memory in the class destructor. */
159+
>>>>>>> precond_port
153160

154161
ScalarType* ILU_matrix; /*!< \brief Entries of the ILU sparse matrix. */
155162
unsigned long nnz_ilu; /*!< \brief Number of possible nonzero entries in the matrix (ILU). */
@@ -862,7 +869,10 @@ class CSysMatrix {
862869

863870
/*!
864871
<<<<<<< HEAD
872+
<<<<<<< HEAD
865873
=======
874+
=======
875+
>>>>>>> precond_port
866876
* \brief Performs first step of the LU_SGS Preconditioner building
867877
* \param[in] vec - CSysVector to be multiplied by the sparse matrix A.
868878
* \param[in] geometry - Geometrical definition of the problem.
@@ -888,13 +898,16 @@ class CSysMatrix {
888898
void GPUGaussElimination(ScalarType& prod, CGeometry* geometry, const CConfig* config) const;
889899

890900
/*!
901+
<<<<<<< HEAD
891902
>>>>>>> upstream/develop
903+
=======
904+
>>>>>>> precond_port
892905
* \brief Multiply CSysVector by the preconditioner all of which are stored on the device
893906
* \param[in] vec - CSysVector to be multiplied by the preconditioner.
894907
* \param[out] prod - Result of the product A*vec.
895908
*/
896-
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod, CGeometry* geometry,
897-
const CConfig* config) const;
909+
void GPUComputeLU_SGSPreconditioner(const CSysVector<ScalarType>& vec, CSysVector<ScalarType>& prod,
910+
CGeometry* geometry, const CConfig* config) const;
898911

899912
/*!
900913
* \brief Build the Jacobi preconditioner.

0 commit comments

Comments
 (0)