Skip to content

Commit b84f237

Browse files
authored
Merge pull request #2679 from guptapratykshh/fix/probe-performance-clean
Fix #2546: Implemented ADT-based probe search and batched AllReduce
2 parents 02b07fb + 43ab0e6 commit b84f237

File tree

4 files changed

+174
-29
lines changed

4 files changed

+174
-29
lines changed

AUTHORS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ Paul Zhang
123123
Pedro Gomes
124124
Peng Yan
125125
Pete Bachant
126+
Pratyksh Gupta
126127
RaulFeijo55
127128
Ruben Sanchez
128129
Ryan Barrett

SU2_CFD/src/output/CFlowOutput.cpp

Lines changed: 102 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#include "../../include/output/CFlowOutput.hpp"
3434

3535
#include "../../../Common/include/geometry/CGeometry.hpp"
36+
#include "../../../Common/include/adt/CADTPointsOnlyClass.hpp"
3637
#include "../../../Common/include/toolboxes/geometry_toolbox.hpp"
3738
#include "../../include/solvers/CSolver.hpp"
3839
#include "../../include/variables/CPrimitiveIndices.hpp"
@@ -818,6 +819,54 @@ void CFlowOutput::SetCustomOutputs(const CSolver* const* solver, const CGeometry
818819
const bool adjoint = config->GetDiscrete_Adjoint();
819820
const bool axisymmetric = config->GetAxisymmetric();
820821
const auto* flowNodes = su2staticcast_p<const CFlowVariable*>(solver[FLOW_SOL]->GetNodes());
822+
auto GetPointValue = [&](const auto& output, unsigned long iPoint) {
823+
return [&, iPoint](unsigned long i) {
824+
if (i < CustomOutput::NOT_A_VARIABLE) {
825+
const auto solIdx = i / CustomOutput::MAX_VARS_PER_SOLVER;
826+
const auto varIdx = i % CustomOutput::MAX_VARS_PER_SOLVER;
827+
if (solIdx == FLOW_SOL) {
828+
return flowNodes->GetPrimitive(iPoint, varIdx);
829+
}
830+
return solver[solIdx]->GetNodes()->GetSolution(iPoint, varIdx);
831+
} else {
832+
return *output.otherOutputs[i - CustomOutput::NOT_A_VARIABLE];
833+
}
834+
};
835+
};
836+
837+
/*--- Count probes that need processing and use heuristic to decide ADT vs linear search.
838+
ADT overhead is only worth it for larger numbers of probes. ---*/
839+
unsigned long nProbes = 0;
840+
for (const auto& output : customOutputs) {
841+
if (!output.skip && output.type == OperationType::PROBE) {
842+
++nProbes;
843+
}
844+
}
845+
846+
/*--- Heuristic: Build ADT if we have more than 10 probes. For small numbers of probes,
847+
the overhead of building the ADT may not be worth it compared to linear search.
848+
Note: If this threshold is increased, the regression test (probe_performance_11)
849+
must be updated to ensure the ADT path is still tested. ---*/
850+
const unsigned long ADT_THRESHOLD = 10;
851+
const bool useADT = (nProbes > ADT_THRESHOLD);
852+
853+
/*--- Build ADT for probe nearest neighbor search if heuristic suggests it. ---*/
854+
std::unique_ptr<CADTPointsOnlyClass> probeADT;
855+
if (useADT) {
856+
const unsigned long nPointDomain = geometry->GetnPointDomain();
857+
vector<su2double> coords(nDim * nPointDomain);
858+
vector<unsigned long> pointIDs(nPointDomain);
859+
860+
for (unsigned long iPoint = 0; iPoint < nPointDomain; ++iPoint) {
861+
pointIDs[iPoint] = iPoint;
862+
for (unsigned short iDim = 0; iDim < nDim; ++iDim) {
863+
coords[iPoint * nDim + iDim] = geometry->nodes->GetCoord(iPoint, iDim);
864+
}
865+
}
866+
867+
/*--- Build global ADT to find nearest nodes across all ranks. ---*/
868+
probeADT = std::make_unique<CADTPointsOnlyClass>(nDim, nPointDomain, coords.data(), pointIDs.data(), true);
869+
}
821870

822871
for (auto& output : customOutputs) {
823872
if (output.skip) continue;
@@ -849,19 +898,34 @@ void CFlowOutput::SetCustomOutputs(const CSolver* const* solver, const CGeometry
849898
}
850899
su2double coord[3] = {};
851900
for (auto iDim = 0u; iDim < nDim; ++iDim) coord[iDim] = std::stod(output.markers[iDim]);
901+
/*--- Find the nearest node: ADT search when enabled by the heuristic, brute-force linear search otherwise. ---*/
852902
su2double minDist = std::numeric_limits<su2double>::max();
853903
unsigned long minPoint = 0;
854-
for (auto iPoint = 0ul; iPoint < geometry->GetnPointDomain(); ++iPoint) {
855-
const su2double dist = GeometryToolbox::SquaredDistance(nDim, coord, geometry->nodes->GetCoord(iPoint));
856-
if (dist < minDist) {
857-
minDist = dist;
858-
minPoint = iPoint;
904+
int rankID = -1;
905+
int rank;
906+
SU2_MPI::Comm_rank(SU2_MPI::GetComm(), &rank);
907+
908+
if (useADT && probeADT && !probeADT->IsEmpty()) {
909+
/*--- Use ADT to find the nearest node efficiently (O(log n) instead of O(n)). ---*/
910+
probeADT->DetermineNearestNode(coord, minDist, minPoint, rankID);
911+
minDist = pow(minDist, 2);
912+
913+
/*--- Check if this rank owns the nearest point. ---*/
914+
output.iPoint = (rankID == rank) ? minPoint : CustomOutput::PROBE_NOT_OWNED;
915+
} else {
916+
/*--- Use linear search for small numbers of probes or when ADT is not available. ---*/
917+
for (auto iPoint = 0ul; iPoint < geometry->GetnPointDomain(); ++iPoint) {
918+
const su2double dist = GeometryToolbox::SquaredDistance(nDim, coord, geometry->nodes->GetCoord(iPoint));
919+
if (dist < minDist) {
920+
minDist = dist;
921+
minPoint = iPoint;
922+
}
859923
}
924+
/*--- Decide which rank owns the probe using Allreduce. ---*/
925+
su2double globMinDist;
926+
SU2_MPI::Allreduce(&minDist, &globMinDist, 1, MPI_DOUBLE, MPI_MIN, SU2_MPI::GetComm());
927+
output.iPoint = fabs(minDist - globMinDist) < EPS ? minPoint : CustomOutput::PROBE_NOT_OWNED;
860928
}
861-
/*--- Decide which rank owns the probe. ---*/
862-
su2double globMinDist;
863-
SU2_MPI::Allreduce(&minDist, &globMinDist, 1, MPI_DOUBLE, MPI_MIN, SU2_MPI::GetComm());
864-
output.iPoint = fabs(minDist - globMinDist) < EPS ? minPoint : CustomOutput::PROBE_NOT_OWNED;
865929
if (output.iPoint != CustomOutput::PROBE_NOT_OWNED) {
866930
std::cout << "Probe " << output.name << " is using global point "
867931
<< geometry->nodes->GetGlobalIndex(output.iPoint)
@@ -883,29 +947,11 @@ void CFlowOutput::SetCustomOutputs(const CSolver* const* solver, const CGeometry
883947
* (see ConvertVariableSymbolsToIndices). ---*/
884948

885949
auto MakeFunctor = [&](unsigned long iPoint) {
886-
/*--- This returns another lambda that captures iPoint by value. ---*/
887-
return [&, iPoint](unsigned long i) {
888-
if (i < CustomOutput::NOT_A_VARIABLE) {
889-
const auto solIdx = i / CustomOutput::MAX_VARS_PER_SOLVER;
890-
const auto varIdx = i % CustomOutput::MAX_VARS_PER_SOLVER;
891-
if (solIdx == FLOW_SOL) {
892-
return flowNodes->GetPrimitive(iPoint, varIdx);
893-
}
894-
return solver[solIdx]->GetNodes()->GetSolution(iPoint, varIdx);
895-
} else {
896-
return *output.otherOutputs[i - CustomOutput::NOT_A_VARIABLE];
897-
}
898-
};
950+
return GetPointValue(output, iPoint);
899951
};
900952

901953
if (output.type == OperationType::PROBE) {
902-
su2double value = std::numeric_limits<su2double>::max();
903-
if (output.iPoint != CustomOutput::PROBE_NOT_OWNED) {
904-
value = output.Eval(MakeFunctor(output.iPoint));
905-
}
906-
su2double tmp = value;
907-
SU2_MPI::Allreduce(&tmp, &value, 1, MPI_DOUBLE, MPI_MIN, SU2_MPI::GetComm());
908-
SetHistoryOutputValue(output.name, value);
954+
/*--- Probe evaluation will be done after all outputs are processed, with batched AllReduce. ---*/
909955
continue;
910956
}
911957

@@ -954,6 +1000,33 @@ void CFlowOutput::SetCustomOutputs(const CSolver* const* solver, const CGeometry
9541000
}
9551001
SetHistoryOutputValue(output.name, integral[0]);
9561002
}
1003+
1004+
/*--- Batch AllReduce for all probe values to reduce MPI communication overhead. ---*/
1005+
if (nProbes > 0) {
1006+
/*--- Evaluate all probe values locally first. ---*/
1007+
vector<su2double> probeValues;
1008+
probeValues.reserve(nProbes);
1009+
for (auto& output : customOutputs) {
1010+
if (output.skip || output.type != OperationType::PROBE) continue;
1011+
su2double value = std::numeric_limits<su2double>::max();
1012+
if (output.iPoint != CustomOutput::PROBE_NOT_OWNED) {
1013+
value = output.Eval(GetPointValue(output, output.iPoint));
1014+
}
1015+
probeValues.push_back(value);
1016+
}
1017+
1018+
/*--- Single AllReduce for all probe values. ---*/
1019+
unsigned long nProbesActual = probeValues.size();
1020+
vector<su2double> probeValuesGlobal(nProbesActual);
1021+
SU2_MPI::Allreduce(probeValues.data(), probeValuesGlobal.data(), nProbesActual, MPI_DOUBLE, MPI_MIN, SU2_MPI::GetComm());
1022+
1023+
/*--- Set history output values for all probes. ---*/
1024+
unsigned long iProbe = 0;
1025+
for (auto& output : customOutputs) {
1026+
if (output.skip || output.type != OperationType::PROBE) continue;
1027+
SetHistoryOutputValue(output.name, probeValuesGlobal[iProbe++]);
1028+
}
1029+
}
9571030
}
9581031

9591032
// The "AddHistoryOutput(" must not be split over multiple lines to ensure proper python parsing

TestCases/parallel_regression.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,15 @@ def main():
314314
flatplate_udobj.test_vals = [-6.760101, -1.283906, -0.745653, 0.000587, -0.000038, 0.000977, -0.001015, 596.450000, 299.550000, 296.900000, 21.318000, 0.586640, 36.553000, 2.188800]
315315
test_list.append(flatplate_udobj)
316316

317+
# Probe performance test (11 probes, ADT path)
318+
probe_performance_11 = TestCase('probe_performance_11')
319+
probe_performance_11.cfg_dir = "user_defined_functions"
320+
probe_performance_11.cfg_file = "test_11_probes.cfg"
321+
probe_performance_11.test_iter = 4
322+
probe_performance_11.test_vals = [-6.290748, 101020, 101050, 99123] # RMS_DENSITY, probe1, probe6, probe11
323+
# Tolerances are typically 0.001 in TestCase.py
324+
test_list.append(probe_performance_11)
325+
317326
# Laminar cylinder (steady)
318327
cylinder = TestCase('cylinder')
319328
cylinder.cfg_dir = "navierstokes/cylinder"
TestCases/user_defined_functions/test_11_probes.cfg

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2+
% Test case: 11 probes (ADT path, >10)
3+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
4+
5+
SOLVER= NAVIER_STOKES
6+
KIND_TURB_MODEL= NONE
7+
RESTART_SOL= NO
8+
9+
CUSTOM_OUTPUTS= 'probe1 : Probe{PRESSURE}[0.001000, 0.001000, 0.010000]; probe2 : Probe{PRESSURE}[0.001700, 0.001700, 0.018000]; probe3 : Probe{PRESSURE}[0.002400, 0.002400, 0.026000]; probe4 : Probe{PRESSURE}[0.003100, 0.003100, 0.034000]; probe5 : Probe{PRESSURE}[0.003800, 0.003800, 0.042000]; probe6 : Probe{PRESSURE}[0.004500, 0.004500, 0.050000]; probe7 : Probe{PRESSURE}[0.005200, 0.005200, 0.058000]; probe8 : Probe{PRESSURE}[0.005900, 0.005900, 0.066000]; probe9 : Probe{PRESSURE}[0.006600, 0.006600, 0.074000]; probe10 : Probe{PRESSURE}[0.007300, 0.007300, 0.082000]; probe11 : Probe{PRESSURE}[0.008000, 0.008000, 0.090000]'
10+
11+
SCREEN_OUTPUT= INNER_ITER, RMS_DENSITY, probe1, probe6, probe11
12+
HISTORY_OUTPUT = ITER, CUSTOM
13+
14+
MACH_NUMBER= 0.1
15+
INIT_OPTION= TD_CONDITIONS
16+
FREESTREAM_OPTION= TEMPERATURE_FS
17+
FREESTREAM_TEMPERATURE= 297.62
18+
REYNOLDS_NUMBER= 600
19+
REYNOLDS_LENGTH= 0.02
20+
21+
REF_ORIGIN_MOMENT_X = 0.00
22+
REF_ORIGIN_MOMENT_Y = 0.00
23+
REF_ORIGIN_MOMENT_Z = 0.00
24+
REF_LENGTH= 0.02
25+
REF_AREA= 0.02
26+
27+
FLUID_MODEL= IDEAL_GAS
28+
GAMMA_VALUE= 1.4
29+
GAS_CONSTANT= 287.87
30+
VISCOSITY_MODEL= CONSTANT_VISCOSITY
31+
MU_CONSTANT= 0.001
32+
33+
MARKER_HEATFLUX= ( y_minus, 0.0 )
34+
MARKER_SYM= ( y_plus )
35+
MARKER_PERIODIC= ( x_minus, x_plus, 0,0,0, 0,0,0, 0.01,0,0 )
36+
MARKER_INLET= ( z_minus, 300.0, 100000.0, 0.0, 0.0, 1.0 )
37+
MARKER_OUTLET= ( z_plus, 99000.0 )
38+
MARKER_PLOTTING= ( y_minus )
39+
MARKER_MONITORING= ( y_minus )
40+
MARKER_ANALYZE= ( z_minus, z_plus )
41+
42+
NUM_METHOD_GRAD= GREEN_GAUSS
43+
CFL_NUMBER= 1e4
44+
CFL_ADAPT= NO
45+
TIME_DISCRE_FLOW= EULER_IMPLICIT
46+
47+
LINEAR_SOLVER= FGMRES
48+
LINEAR_SOLVER_PREC= ILU
49+
LINEAR_SOLVER_ERROR= 0.2
50+
LINEAR_SOLVER_ITER= 5
51+
52+
CONV_NUM_METHOD_FLOW= ROE
53+
MUSCL_FLOW= YES
54+
SLOPE_LIMITER_FLOW= NONE
55+
56+
CONV_RESIDUAL_MINVAL= -11
57+
CONV_STARTITER= 0
58+
INNER_ITER= 5
59+
60+
MESH_FORMAT= BOX
61+
MESH_BOX_LENGTH= (0.01, 0.01, 0.1)
62+
MESH_BOX_SIZE= (9, 17, 65)

0 commit comments

Comments
 (0)