Skip to content

Commit c47ed0c

Browse files
author
Anand Radhakrishnan
committed
Keep Pencil Decomposition throughout (better results)
1 parent c3425ee commit c47ed0c

File tree

10 files changed

+105
-159
lines changed

10 files changed

+105
-159
lines changed

src/common/m_mpi_common.fpp

Lines changed: 90 additions & 153 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,9 +1163,7 @@ contains
11631163
if (n > 0) then
11641164
11651165
if (p > 0) then
1166-
1167-
#ifdef MFC_POST_PROCESS
1168-
if (fft_wrt .and. (.not. file_per_process)) then
1166+
if (fft_wrt) then
11691167
11701168
! Initial estimate of optimal processor topology
11711169
num_procs_x = 1
@@ -1207,182 +1205,121 @@ contains
12071205
end if
12081206
12091207
end do
1210-
12111208
else
1212-
! Initial estimate of optimal processor topology
1213-
num_procs_x = 1
1214-
num_procs_y = 1
1215-
num_procs_z = num_procs
1216-
ierr = -1
12171209
1218-
! Benchmarking the quality of this initial guess
1219-
tmp_num_procs_x = num_procs_x
1220-
tmp_num_procs_y = num_procs_y
1221-
tmp_num_procs_z = num_procs_z
1222-
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1223-
- (n + 1)/tmp_num_procs_y) &
1224-
+ 10._wp*abs((n + 1)/tmp_num_procs_y &
1225-
- (p + 1)/tmp_num_procs_z)
1210+
if (cyl_coord .and. p > 0) then
1211+
! Implement pencil processor blocking if using cylindrical coordinates so
1212+
! that all cells in azimuthal direction are stored on a single processor.
1213+
! This is necessary for efficient application of Fourier filter near axis.
12261214
1227-
! Optimization of the initial processor topology
1228-
do i = 1, num_procs
1215+
! Initial values of the processor factorization optimization
1216+
num_procs_x = 1
1217+
num_procs_y = num_procs
1218+
num_procs_z = 1
1219+
ierr = -1
12291220
1230-
if (mod(num_procs, i) == 0 &
1231-
.and. &
1232-
(m + 1)/i >= num_stcls_min*recon_order) then
1221+
! Computing minimization variable for these initial values
1222+
tmp_num_procs_x = num_procs_x
1223+
tmp_num_procs_y = num_procs_y
1224+
tmp_num_procs_z = num_procs_z
1225+
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1226+
- (n + 1)/tmp_num_procs_y)
12331227
1234-
do j = 1, num_procs/i
1228+
! Searching for optimal computational domain distribution
1229+
do i = 1, num_procs
12351230
1236-
if (mod(num_procs/i, j) == 0 &
1237-
.and. &
1238-
(n + 1)/j >= num_stcls_min*recon_order) then
1231+
if (mod(num_procs, i) == 0 &
1232+
.and. &
1233+
(m + 1)/i >= num_stcls_min*recon_order) then
12391234
1240-
tmp_num_procs_x = i
1241-
tmp_num_procs_y = j
1242-
tmp_num_procs_z = num_procs/(i*j)
1235+
tmp_num_procs_x = i
1236+
tmp_num_procs_y = num_procs/i
12431237
1244-
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1245-
- (n + 1)/tmp_num_procs_y) &
1246-
+ abs((n + 1)/tmp_num_procs_y &
1247-
- (p + 1)/tmp_num_procs_z) &
1248-
.and. &
1249-
(p + 1)/tmp_num_procs_z &
1250-
>= &
1251-
num_stcls_min*recon_order) &
1252-
then
1253-
1254-
num_procs_x = i
1255-
num_procs_y = j
1256-
num_procs_z = num_procs/(i*j)
1257-
fct_min = abs((m + 1)/tmp_num_procs_x &
1258-
- (n + 1)/tmp_num_procs_y) &
1259-
+ abs((n + 1)/tmp_num_procs_y &
1260-
- (p + 1)/tmp_num_procs_z)
1261-
ierr = 0
1238+
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1239+
- (n + 1)/tmp_num_procs_y) &
1240+
.and. &
1241+
(n + 1)/tmp_num_procs_y &
1242+
>= &
1243+
num_stcls_min*recon_order) then
12621244
1263-
end if
1245+
num_procs_x = i
1246+
num_procs_y = num_procs/i
1247+
fct_min = abs((m + 1)/tmp_num_procs_x &
1248+
- (n + 1)/tmp_num_procs_y)
1249+
ierr = 0
12641250
12651251
end if
12661252
1267-
end do
1268-
1269-
end if
1270-
1271-
end do
1272-
end if
1273-
#else
1274-
if (cyl_coord .and. p > 0) then
1275-
! Implement pencil processor blocking if using cylindrical coordinates so
1276-
! that all cells in azimuthal direction are stored on a single processor.
1277-
! This is necessary for efficient application of Fourier filter near axis.
1278-
1279-
! Initial values of the processor factorization optimization
1280-
num_procs_x = 1
1281-
num_procs_y = num_procs
1282-
num_procs_z = 1
1283-
ierr = -1
1284-
1285-
! Computing minimization variable for these initial values
1286-
tmp_num_procs_x = num_procs_x
1287-
tmp_num_procs_y = num_procs_y
1288-
tmp_num_procs_z = num_procs_z
1289-
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1290-
- (n + 1)/tmp_num_procs_y)
1291-
1292-
! Searching for optimal computational domain distribution
1293-
do i = 1, num_procs
1294-
1295-
if (mod(num_procs, i) == 0 &
1296-
.and. &
1297-
(m + 1)/i >= num_stcls_min*recon_order) then
1298-
1299-
tmp_num_procs_x = i
1300-
tmp_num_procs_y = num_procs/i
1301-
1302-
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1303-
- (n + 1)/tmp_num_procs_y) &
1304-
.and. &
1305-
(n + 1)/tmp_num_procs_y &
1306-
>= &
1307-
num_stcls_min*recon_order) then
1308-
1309-
num_procs_x = i
1310-
num_procs_y = num_procs/i
1311-
fct_min = abs((m + 1)/tmp_num_procs_x &
1312-
- (n + 1)/tmp_num_procs_y)
1313-
ierr = 0
1314-
13151253
end if
13161254
1317-
end if
1318-
1319-
end do
1320-
1321-
else
1322-
1323-
! Initial estimate of optimal processor topology
1324-
num_procs_x = 1
1325-
num_procs_y = 1
1326-
num_procs_z = num_procs
1327-
ierr = -1
1328-
1329-
! Benchmarking the quality of this initial guess
1330-
tmp_num_procs_x = num_procs_x
1331-
tmp_num_procs_y = num_procs_y
1332-
tmp_num_procs_z = num_procs_z
1333-
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1334-
- (n + 1)/tmp_num_procs_y) &
1335-
+ 10._wp*abs((n + 1)/tmp_num_procs_y &
1336-
- (p + 1)/tmp_num_procs_z)
1337-
1338-
! Optimization of the initial processor topology
1339-
do i = 1, num_procs
1340-
1341-
if (mod(num_procs, i) == 0 &
1342-
.and. &
1343-
(m + 1)/i >= num_stcls_min*recon_order) then
1255+
end do
13441256
1345-
do j = 1, num_procs/i
1257+
else
13461258
1347-
if (mod(num_procs/i, j) == 0 &
1348-
.and. &
1349-
(n + 1)/j >= num_stcls_min*recon_order) then
1259+
! Initial estimate of optimal processor topology
1260+
num_procs_x = 1
1261+
num_procs_y = 1
1262+
num_procs_z = num_procs
1263+
ierr = -1
1264+
1265+
! Benchmarking the quality of this initial guess
1266+
tmp_num_procs_x = num_procs_x
1267+
tmp_num_procs_y = num_procs_y
1268+
tmp_num_procs_z = num_procs_z
1269+
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1270+
- (n + 1)/tmp_num_procs_y) &
1271+
+ 10._wp*abs((n + 1)/tmp_num_procs_y &
1272+
- (p + 1)/tmp_num_procs_z)
1273+
1274+
! Optimization of the initial processor topology
1275+
do i = 1, num_procs
1276+
1277+
if (mod(num_procs, i) == 0 &
1278+
.and. &
1279+
(m + 1)/i >= num_stcls_min*recon_order) then
13501280
1351-
tmp_num_procs_x = i
1352-
tmp_num_procs_y = j
1353-
tmp_num_procs_z = num_procs/(i*j)
1281+
do j = 1, num_procs/i
13541282
1355-
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1356-
- (n + 1)/tmp_num_procs_y) &
1357-
+ abs((n + 1)/tmp_num_procs_y &
1358-
- (p + 1)/tmp_num_procs_z) &
1283+
if (mod(num_procs/i, j) == 0 &
13591284
.and. &
1360-
(p + 1)/tmp_num_procs_z &
1361-
>= &
1362-
num_stcls_min*recon_order) &
1363-
then
1364-
1365-
num_procs_x = i
1366-
num_procs_y = j
1367-
num_procs_z = num_procs/(i*j)
1368-
fct_min = abs((m + 1)/tmp_num_procs_x &
1369-
- (n + 1)/tmp_num_procs_y) &
1370-
+ abs((n + 1)/tmp_num_procs_y &
1371-
- (p + 1)/tmp_num_procs_z)
1372-
ierr = 0
1285+
(n + 1)/j >= num_stcls_min*recon_order) then
1286+
1287+
tmp_num_procs_x = i
1288+
tmp_num_procs_y = j
1289+
tmp_num_procs_z = num_procs/(i*j)
1290+
1291+
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1292+
- (n + 1)/tmp_num_procs_y) &
1293+
+ abs((n + 1)/tmp_num_procs_y &
1294+
- (p + 1)/tmp_num_procs_z) &
1295+
.and. &
1296+
(p + 1)/tmp_num_procs_z &
1297+
>= &
1298+
num_stcls_min*recon_order) &
1299+
then
1300+
1301+
num_procs_x = i
1302+
num_procs_y = j
1303+
num_procs_z = num_procs/(i*j)
1304+
fct_min = abs((m + 1)/tmp_num_procs_x &
1305+
- (n + 1)/tmp_num_procs_y) &
1306+
+ abs((n + 1)/tmp_num_procs_y &
1307+
- (p + 1)/tmp_num_procs_z)
1308+
ierr = 0
1309+
1310+
end if
13731311
13741312
end if
13751313
1376-
end if
1377-
1378-
end do
1314+
end do
13791315
1380-
end if
1316+
end if
13811317
1382-
end do
1318+
end do
13831319
1320+
end if
13841321
end if
1385-
#endif
1322+
13861323
! Verifying that a valid decomposition of the computational
13871324
! domain has been established. If not, the simulation exits.
13881325
if (proc_rank == 0 .and. ierr == -1) then

src/post_process/m_checker.fpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ contains
115115
impure subroutine s_check_inputs_fft
116116
integer :: num_procs_x, num_procs_y, num_procs_z
117117

118-
@:PROHIBIT(fft_wrt .and. file_per_process, "Turn off file_per_process with fft_wrt")
119118
@:PROHIBIT(fft_wrt .and. (n == 0 .or. p == 0), "FFT WRT only in 3D")
120119
@:PROHIBIT(fft_wrt .and. (MOD(m_glb+1,2) == 1 .or. MOD(n_glb+1,2) == 1 .or. MOD(p_glb+1,2) == 1), "FFT WRT requires global dimensions divisible by 2")
121120
num_procs_x = (m_glb + 1)/(m + 1)

src/pre_process/m_global_parameters.fpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,8 @@ module m_global_parameters
288288
!! conditions data to march the solution in the physical computational domain
289289
!! to the next time-step.
290290

291+
logical :: fft_wrt
292+
291293
contains
292294

293295
!> Assigns default values to user inputs prior to reading
@@ -395,6 +397,8 @@ contains
395397
elliptic_smoothing_iters = dflt_int
396398
elliptic_smoothing = .false.
397399

400+
fft_wrt = .false.
401+
398402
! Initial condition parameters
399403
num_patches = dflt_int
400404

src/pre_process/m_mpi_proxy.fpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ contains
5959
& 'cfl_const_dt', 'cfl_dt', 'surface_tension', &
6060
& 'hyperelasticity', 'pre_stress', 'elliptic_smoothing', 'viscous',&
6161
& 'bubbles_lagrange', 'bc_io', 'mhd', 'relativity', 'cont_damage', &
62-
& 'igr', 'down_sample' ]
62+
& 'igr', 'down_sample','fft_wrt' ]
6363
call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
6464
#:endfor
6565
call MPI_BCAST(fluid_rho(1), num_fluids_max, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)

src/pre_process/m_start_up.fpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,7 @@ contains
150150
elliptic_smoothing, elliptic_smoothing_iters, &
151151
viscous, bubbles_lagrange, bc_x, bc_y, bc_z, num_bc_patches, &
152152
patch_bc, Bx0, relativity, cont_damage, igr, igr_order, &
153-
down_sample, recon_type, muscl_order
153+
down_sample, recon_type, muscl_order, fft_wrt
154154

155155
! Inquiring the status of the pre_process.inp file
156156
file_loc = 'pre_process.inp'

src/simulation/m_global_parameters.fpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,8 @@ module m_global_parameters
539539
logical :: powell !< Powell‐correction for div B = 0
540540
$:GPU_DECLARE(create='[Bx0,powell]')
541541
542+
logical :: fft_wrt
543+
542544
!> @name Continuum damage model parameters
543545
!> @{!
544546
real(wp) :: tau_star !< Stress threshold for continuum damage modeling
@@ -739,6 +741,8 @@ contains
739741
#:endfor
740742
#:endfor
741743
744+
fft_wrt = .false.
745+
742746
do j = 1, num_probes_max
743747
acoustic(j)%pulse = dflt_int
744748
acoustic(j)%support = dflt_int

src/simulation/m_mpi_proxy.fpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ contains
116116
& 'bc_z%grcbc_in', 'bc_z%grcbc_out', 'bc_z%grcbc_vel_out', &
117117
& 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', 'surface_tension', &
118118
& 'shear_stress', 'bulk_stress', 'bubbles_lagrange', &
119-
& 'hyperelasticity', 'down_sample', 'int_comp' ]
119+
& 'hyperelasticity', 'down_sample', 'int_comp','fft_wrt' ]
120120
call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
121121
#:endfor
122122

src/simulation/m_start_up.fpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ contains
189189
cont_damage, tau_star, cont_damage_s, alpha_bar, &
190190
alf_factor, num_igr_iters, num_igr_warm_start_iters, &
191191
int_comp, ic_eps, ic_beta, nv_uvm_out_of_core, &
192-
nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample
192+
nv_uvm_igr_temps_on_gpu, nv_uvm_pref_gpu, down_sample, fft_wrt
193193

194194
! Checking that an input file has been provided by the user. If it
195195
! has, then the input file is read in, otherwise, simulation exits.

toolchain/mfc/run/case_dicts.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ def analytic(self):
106106
'elliptic_smoothing_iters': ParamType.INT,
107107
'viscous': ParamType.LOG,
108108
'bubbles_lagrange': ParamType.LOG,
109+
'fft_wrt': ParamType.LOG,
109110
})
110111

111112
for ib_id in range(1, 10+1):
@@ -316,6 +317,7 @@ def analytic(self):
316317
'nv_uvm_out_of_core': ParamType.LOG,
317318
'nv_uvm_igr_temps_on_gpu': ParamType.INT,
318319
'nv_uvm_pref_gpu': ParamType.LOG,
320+
'fft_wrt': ParamType.LOG,
319321
})
320322

321323
for var in [ 'heatTransfer_model', 'massTransfer_model', 'pressure_corrector',

toolchain/modules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ p-gpu MFC_CUDA_CC=70,75,80,89,90 NVHPC_CUDA_HOME=$CUDA_HOME CC=nvc CXX=nvc++ FC=
4949

5050
f OLCF Frontier
5151
f-all cpe/25.03 rocm/6.3.1
52-
f-all cray-fftw cray-hdf5 cray-python
52+
f-all cray-fftw cray-hdf5 cray-python cmake
5353
f-gpu craype-accel-amd-gfx90a rocprofiler-compute/3.0.0
5454

5555
d NCSA Delta

0 commit comments

Comments
 (0)