Skip to content

Commit 4f8fb91

Browse files
anandrdbzAnandAnandAnandAnand Radhakrishnan
authored
MPI FFTW (#997)
Co-authored-by: Anand <[email protected]> Co-authored-by: Anand <[email protected]> Co-authored-by: Anand <[email protected]> Co-authored-by: Anand Radhakrishnan <[email protected]> Co-authored-by: Anand Radhakrishnan <[email protected]> Co-authored-by: Anand <[email protected]> Co-authored-by: Anand <[email protected]> Co-authored-by: Spencer Bryngelson <[email protected]> Co-authored-by: Anand <[email protected]> Co-authored-by: Ben Wilfong <[email protected]>
1 parent d91cb68 commit 4f8fb91

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+3358
-123
lines changed

src/common/include/3dHardcodedIC.fpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#:def Hardcoded3DVariables()
22
! Place any declaration of intermediate variables here
3-
real(wp) :: rhoH, rhoL, pRef, pInt, h, lam, wl, amp, intH, alph
3+
real(wp) :: rhoH, rhoL, pRef, pInt, h, lam, wl, amp, intH, alph, Mach
44

55
real(wp) :: eps
66

@@ -94,10 +94,11 @@
9494
! This patch is hard-coded for test suite optimization used in the
9595
! 3D_TaylorGreenVortex case:
9696
! This analytic patch used geometry 9
97+
Mach = 0.1
9798
if (patch_id == 1) then
98-
q_prim_vf(E_idx)%sf(i, j, k) = 101325 + (1*37.6636429464809**2/16)*(cos(2*x_cc(i)/1) + cos(2*y_cc(j)/1))*(cos(2*z_cc(k)/1) + 2)
99-
q_prim_vf(momxb + 0)%sf(i, j, k) = 37.6636429464809*sin(x_cc(i)/1)*cos(y_cc(j)/1)*sin(z_cc(k)/1)
100-
q_prim_vf(momxb + 1)%sf(i, j, k) = -37.6636429464809*cos(x_cc(i)/1)*sin(y_cc(j)/1)*sin(z_cc(k)/1)
99+
q_prim_vf(E_idx)%sf(i, j, k) = 101325 + (Mach**2*376.636429464809**2/16)*(cos(2*x_cc(i)/1) + cos(2*y_cc(j)/1))*(cos(2*z_cc(k)/1) + 2)
100+
q_prim_vf(momxb + 0)%sf(i, j, k) = Mach*376.636429464809*sin(x_cc(i)/1)*cos(y_cc(j)/1)*sin(z_cc(k)/1)
101+
q_prim_vf(momxb + 1)%sf(i, j, k) = -Mach*376.636429464809*cos(x_cc(i)/1)*sin(y_cc(j)/1)*sin(z_cc(k)/1)
101102
end if
102103

103104
case default

src/common/m_mpi_common.fpp

Lines changed: 114 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,117 +1174,161 @@ contains
11741174
if (n > 0) then
11751175
11761176
if (p > 0) then
1177+
if (fft_wrt) then
11771178
1178-
if (cyl_coord .and. p > 0) then
1179-
! Implement pencil processor blocking if using cylindrical coordinates so
1180-
! that all cells in azimuthal direction are stored on a single processor.
1181-
! This is necessary for efficient application of Fourier filter near axis.
1182-
1183-
! Initial values of the processor factorization optimization
1179+
! Initial estimate of optimal processor topology
11841180
num_procs_x = 1
1185-
num_procs_y = num_procs
1186-
num_procs_z = 1
1181+
num_procs_y = 1
1182+
num_procs_z = num_procs
11871183
ierr = -1
11881184
1189-
! Computing minimization variable for these initial values
1190-
tmp_num_procs_x = num_procs_x
1185+
! Benchmarking the quality of this initial guess
11911186
tmp_num_procs_y = num_procs_y
11921187
tmp_num_procs_z = num_procs_z
1193-
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1194-
- (n + 1)/tmp_num_procs_y)
1188+
fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y &
1189+
- (p + 1)/tmp_num_procs_z)
11951190
1196-
! Searching for optimal computational domain distribution
1191+
! Optimization of the initial processor topology
11971192
do i = 1, num_procs
11981193
11991194
if (mod(num_procs, i) == 0 &
12001195
.and. &
1201-
(m + 1)/i >= num_stcls_min*recon_order) then
1196+
(n + 1)/i >= num_stcls_min*recon_order) then
12021197
1203-
tmp_num_procs_x = i
1204-
tmp_num_procs_y = num_procs/i
1198+
tmp_num_procs_y = i
1199+
tmp_num_procs_z = num_procs/i
12051200
1206-
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1207-
- (n + 1)/tmp_num_procs_y) &
1201+
if (fct_min >= abs((n + 1)/tmp_num_procs_y &
1202+
- (p + 1)/tmp_num_procs_z) &
12081203
.and. &
1209-
(n + 1)/tmp_num_procs_y &
1204+
(p + 1)/tmp_num_procs_z &
12101205
>= &
12111206
num_stcls_min*recon_order) then
12121207
1213-
num_procs_x = i
1214-
num_procs_y = num_procs/i
1215-
fct_min = abs((m + 1)/tmp_num_procs_x &
1216-
- (n + 1)/tmp_num_procs_y)
1208+
num_procs_y = i
1209+
num_procs_z = num_procs/i
1210+
fct_min = abs((n + 1)/tmp_num_procs_y &
1211+
- (p + 1)/tmp_num_procs_z)
12171212
ierr = 0
12181213
12191214
end if
12201215
12211216
end if
12221217
12231218
end do
1224-
12251219
else
12261220
1227-
! Initial estimate of optimal processor topology
1228-
num_procs_x = 1
1229-
num_procs_y = 1
1230-
num_procs_z = num_procs
1231-
ierr = -1
1221+
if (cyl_coord .and. p > 0) then
1222+
! Implement pencil processor blocking if using cylindrical coordinates so
1223+
! that all cells in azimuthal direction are stored on a single processor.
1224+
! This is necessary for efficient application of Fourier filter near axis.
12321225
1233-
! Benchmarking the quality of this initial guess
1234-
tmp_num_procs_x = num_procs_x
1235-
tmp_num_procs_y = num_procs_y
1236-
tmp_num_procs_z = num_procs_z
1237-
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1238-
- (n + 1)/tmp_num_procs_y) &
1239-
+ 10._wp*abs((n + 1)/tmp_num_procs_y &
1240-
- (p + 1)/tmp_num_procs_z)
1226+
! Initial values of the processor factorization optimization
1227+
num_procs_x = 1
1228+
num_procs_y = num_procs
1229+
num_procs_z = 1
1230+
ierr = -1
12411231
1242-
! Optimization of the initial processor topology
1243-
do i = 1, num_procs
1232+
! Computing minimization variable for these initial values
1233+
tmp_num_procs_x = num_procs_x
1234+
tmp_num_procs_y = num_procs_y
1235+
tmp_num_procs_z = num_procs_z
1236+
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1237+
- (n + 1)/tmp_num_procs_y)
12441238
1245-
if (mod(num_procs, i) == 0 &
1246-
.and. &
1247-
(m + 1)/i >= num_stcls_min*recon_order) then
1239+
! Searching for optimal computational domain distribution
1240+
do i = 1, num_procs
1241+
1242+
if (mod(num_procs, i) == 0 &
1243+
.and. &
1244+
(m + 1)/i >= num_stcls_min*recon_order) then
12481245
1249-
do j = 1, num_procs/i
1246+
tmp_num_procs_x = i
1247+
tmp_num_procs_y = num_procs/i
12501248
1251-
if (mod(num_procs/i, j) == 0 &
1249+
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1250+
- (n + 1)/tmp_num_procs_y) &
12521251
.and. &
1253-
(n + 1)/j >= num_stcls_min*recon_order) then
1252+
(n + 1)/tmp_num_procs_y &
1253+
>= &
1254+
num_stcls_min*recon_order) then
12541255
1255-
tmp_num_procs_x = i
1256-
tmp_num_procs_y = j
1257-
tmp_num_procs_z = num_procs/(i*j)
1256+
num_procs_x = i
1257+
num_procs_y = num_procs/i
1258+
fct_min = abs((m + 1)/tmp_num_procs_x &
1259+
- (n + 1)/tmp_num_procs_y)
1260+
ierr = 0
12581261
1259-
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1260-
- (n + 1)/tmp_num_procs_y) &
1261-
+ abs((n + 1)/tmp_num_procs_y &
1262-
- (p + 1)/tmp_num_procs_z) &
1262+
end if
1263+
1264+
end if
1265+
1266+
end do
1267+
1268+
else
1269+
1270+
! Initial estimate of optimal processor topology
1271+
num_procs_x = 1
1272+
num_procs_y = 1
1273+
num_procs_z = num_procs
1274+
ierr = -1
1275+
1276+
! Benchmarking the quality of this initial guess
1277+
tmp_num_procs_x = num_procs_x
1278+
tmp_num_procs_y = num_procs_y
1279+
tmp_num_procs_z = num_procs_z
1280+
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1281+
- (n + 1)/tmp_num_procs_y) &
1282+
+ 10._wp*abs((n + 1)/tmp_num_procs_y &
1283+
- (p + 1)/tmp_num_procs_z)
1284+
1285+
! Optimization of the initial processor topology
1286+
do i = 1, num_procs
1287+
1288+
if (mod(num_procs, i) == 0 &
1289+
.and. &
1290+
(m + 1)/i >= num_stcls_min*recon_order) then
1291+
1292+
do j = 1, num_procs/i
1293+
1294+
if (mod(num_procs/i, j) == 0 &
12631295
.and. &
1264-
(p + 1)/tmp_num_procs_z &
1265-
>= &
1266-
num_stcls_min*recon_order) &
1267-
then
1268-
1269-
num_procs_x = i
1270-
num_procs_y = j
1271-
num_procs_z = num_procs/(i*j)
1272-
fct_min = abs((m + 1)/tmp_num_procs_x &
1273-
- (n + 1)/tmp_num_procs_y) &
1274-
+ abs((n + 1)/tmp_num_procs_y &
1275-
- (p + 1)/tmp_num_procs_z)
1276-
ierr = 0
1296+
(n + 1)/j >= num_stcls_min*recon_order) then
1297+
1298+
tmp_num_procs_x = i
1299+
tmp_num_procs_y = j
1300+
tmp_num_procs_z = num_procs/(i*j)
1301+
1302+
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1303+
- (n + 1)/tmp_num_procs_y) &
1304+
+ abs((n + 1)/tmp_num_procs_y &
1305+
- (p + 1)/tmp_num_procs_z) &
1306+
.and. &
1307+
(p + 1)/tmp_num_procs_z &
1308+
>= &
1309+
num_stcls_min*recon_order) &
1310+
then
1311+
1312+
num_procs_x = i
1313+
num_procs_y = j
1314+
num_procs_z = num_procs/(i*j)
1315+
fct_min = abs((m + 1)/tmp_num_procs_x &
1316+
- (n + 1)/tmp_num_procs_y) &
1317+
+ abs((n + 1)/tmp_num_procs_y &
1318+
- (p + 1)/tmp_num_procs_z)
1319+
ierr = 0
1320+
1321+
end if
12771322
12781323
end if
12791324
1280-
end if
1281-
1282-
end do
1325+
end do
12831326
1284-
end if
1327+
end if
12851328
1286-
end do
1329+
end do
12871330
1331+
end if
12881332
end if
12891333
12901334
! Verifying that a valid decomposition of the computational

src/post_process/m_checker.fpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ module m_checker
1717

1818
implicit none
1919

20-
private; public :: s_check_inputs
20+
private; public :: s_check_inputs, s_check_inputs_fft
2121

2222
contains
2323

@@ -111,6 +111,22 @@ contains
111111
@:PROHIBIT(any(omega_wrt) .and. fd_order == dflt_int, "fd_order must be set for omega_wrt")
112112
end subroutine s_check_inputs_vorticity
113113

114+
!> Checks constraints on fft_wrt.
!! Every condition below is gated on fft_wrt, so nothing is prohibited when
!! fft_wrt is .false. With fft_wrt set, a PROHIBIT is triggered if:
!!   - n == 0 or p == 0 (message: FFT output only in 3D);
!!   - cyl_coord is set;
!!   - any of m_glb+1, n_glb+1, p_glb+1 is odd;
!!   - n_glb+1 is not a multiple of n+1, or p_glb+1 is not a multiple of p+1;
!!   - m_glb+1 is not a multiple of num_procs_y, or n_glb+1 is not a multiple
!!     of num_procs_z, where the locals num_procs_y = (n_glb+1)/(n+1) and
!!     num_procs_z = (p_glb+1)/(p+1) are integer quotients computed here;
!!   - any of the bc_x, bc_y, bc_z beg/end values is below -1.
!! NOTE(review): presumably n and p hold the per-rank extents when this runs,
!! so the two quotients are the rank counts in y and z -- confirm at the call site.
!! NOTE(review): the messages name m_glb/n_glb/p_glb, but the tests are on
!! m_glb+1, n_glb+1 and p_glb+1.
!! NOTE(review): the "requires periodic BCs" test only rejects values below -1;
!! presumably -1 is the periodic code and larger values are acceptable -- confirm.
!! NOTE(review): PROHIBIT is a macro defined elsewhere; presumably it aborts
!! with the given message when its condition is true.
115+
impure subroutine s_check_inputs_fft
116+
integer :: num_procs_y, num_procs_z
117+
118+
@:PROHIBIT(fft_wrt .and. (n == 0 .or. p == 0), "FFT WRT only in 3D")
119+
@:PROHIBIT(fft_wrt .and. cyl_coord, "FFT WRT incompatible with cylindrical coordinates")
120+
@:PROHIBIT(fft_wrt .and. (MOD(m_glb+1,2) == 1 .or. MOD(n_glb+1,2) == 1 .or. MOD(p_glb+1,2) == 1), "FFT WRT requires global dimensions divisible by 2")
121+
@:PROHIBIT(fft_wrt .and. MOD(n_glb+1,n+1) /= 0, "FFT WRT requires n_glb to be divisible by num_procs_y")
122+
@:PROHIBIT(fft_wrt .and. MOD(p_glb+1,p+1) /= 0, "FFT WRT requires p_glb to be divisible by num_procs_z")
123+
num_procs_y = (n_glb + 1)/(n + 1)
124+
num_procs_z = (p_glb + 1)/(p + 1)
125+
@:PROHIBIT(fft_wrt .and. MOD(m_glb+1,num_procs_y) /= 0, "FFT WRT requires m_glb to be divisible by num_procs_y")
126+
@:PROHIBIT(fft_wrt .and. MOD(n_glb+1,num_procs_z) /= 0, "FFT WRT requires n_glb to be divisible by num_procs_z")
127+
@:PROHIBIT(fft_wrt .and. (bc_x%beg < -1 .or. bc_y%beg < -1 .or. bc_z%beg < -1 .or. bc_x%end < -1 .or. bc_y%end < -1 .or. bc_z%end < -1), "FFT WRT requires periodic BCs")
128+
end subroutine s_check_inputs_fft
129+
114130
!> Checks constraints on Q-criterion parameters
115131
impure subroutine s_check_inputs_qm
116132
@:PROHIBIT(n == 0 .and. qm_wrt)

src/post_process/m_global_parameters.fpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ module m_global_parameters
240240
integer :: flux_lim
241241
logical, dimension(3) :: flux_wrt
242242
logical :: E_wrt
243+
logical :: fft_wrt
243244
logical :: pres_wrt
244245
logical, dimension(num_fluids_max) :: alpha_wrt
245246
logical :: gamma_wrt
@@ -441,6 +442,7 @@ contains
441442
parallel_io = .false.
442443
file_per_process = .false.
443444
E_wrt = .false.
445+
fft_wrt = .false.
444446
pres_wrt = .false.
445447
alpha_wrt = .false.
446448
gamma_wrt = .false.

src/post_process/m_mpi_proxy.fpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ contains
105105
& 'adv_n', 'ib', 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', &
106106
& 'surface_tension', 'hyperelasticity', 'bubbles_lagrange', &
107107
& 'output_partial_domain', 'relativity', 'cont_damage', 'bc_io', &
108-
& 'down_sample' ]
108+
& 'down_sample','fft_wrt' ]
109109
call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
110110
#:endfor
111111

0 commit comments

Comments
 (0)