Skip to content

Commit 2d1f096

Browse files
committed
Merge remote-tracking branch 'upstream/master' into openmp_nvidia_merged_master
2 parents 4b67dfa + 4f8fb91 commit 2d1f096

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+3365
-135
lines changed

src/common/include/3dHardcodedIC.fpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#:def Hardcoded3DVariables()
22
! Place any declaration of intermediate variables here
3-
real(wp) :: rhoH, rhoL, pRef, pInt, h, lam, wl, amp, intH, alph
3+
real(wp) :: rhoH, rhoL, pRef, pInt, h, lam, wl, amp, intH, alph, Mach
44

55
real(wp) :: eps
66

@@ -94,10 +94,11 @@
9494
! This patch is hard-coded for test suite optimization used in the
9595
! 3D_TaylorGreenVortex case:
9696
! This analytic patch used geometry 9
97+
Mach = 0.1
9798
if (patch_id == 1) then
98-
q_prim_vf(E_idx)%sf(i, j, k) = 101325 + (1*37.6636429464809**2/16)*(cos(2*x_cc(i)/1) + cos(2*y_cc(j)/1))*(cos(2*z_cc(k)/1) + 2)
99-
q_prim_vf(momxb + 0)%sf(i, j, k) = 37.6636429464809*sin(x_cc(i)/1)*cos(y_cc(j)/1)*sin(z_cc(k)/1)
100-
q_prim_vf(momxb + 1)%sf(i, j, k) = -37.6636429464809*cos(x_cc(i)/1)*sin(y_cc(j)/1)*sin(z_cc(k)/1)
99+
q_prim_vf(E_idx)%sf(i, j, k) = 101325 + (Mach**2*376.636429464809**2/16)*(cos(2*x_cc(i)/1) + cos(2*y_cc(j)/1))*(cos(2*z_cc(k)/1) + 2)
100+
q_prim_vf(momxb + 0)%sf(i, j, k) = Mach*376.636429464809*sin(x_cc(i)/1)*cos(y_cc(j)/1)*sin(z_cc(k)/1)
101+
q_prim_vf(momxb + 1)%sf(i, j, k) = -Mach*376.636429464809*cos(x_cc(i)/1)*sin(y_cc(j)/1)*sin(z_cc(k)/1)
101102
end if
102103

103104
case default

src/common/m_mpi_common.fpp

Lines changed: 114 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,117 +1192,161 @@ contains
11921192
if (n > 0) then
11931193
11941194
if (p > 0) then
1195+
if (fft_wrt) then
11951196
1196-
if (cyl_coord .and. p > 0) then
1197-
! Implement pencil processor blocking if using cylindrical coordinates so
1198-
! that all cells in azimuthal direction are stored on a single processor.
1199-
! This is necessary for efficient application of Fourier filter near axis.
1200-
1201-
! Initial values of the processor factorization optimization
1197+
! Initial estimate of optimal processor topology
12021198
num_procs_x = 1
1203-
num_procs_y = num_procs
1204-
num_procs_z = 1
1199+
num_procs_y = 1
1200+
num_procs_z = num_procs
12051201
ierr = -1
12061202
1207-
! Computing minimization variable for these initial values
1208-
tmp_num_procs_x = num_procs_x
1203+
! Benchmarking the quality of this initial guess
12091204
tmp_num_procs_y = num_procs_y
12101205
tmp_num_procs_z = num_procs_z
1211-
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1212-
- (n + 1)/tmp_num_procs_y)
1206+
fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y &
1207+
- (p + 1)/tmp_num_procs_z)
12131208
1214-
! Searching for optimal computational domain distribution
1209+
! Optimization of the initial processor topology
12151210
do i = 1, num_procs
12161211
12171212
if (mod(num_procs, i) == 0 &
12181213
.and. &
1219-
(m + 1)/i >= num_stcls_min*recon_order) then
1214+
(n + 1)/i >= num_stcls_min*recon_order) then
12201215
1221-
tmp_num_procs_x = i
1222-
tmp_num_procs_y = num_procs/i
1216+
tmp_num_procs_y = i
1217+
tmp_num_procs_z = num_procs/i
12231218
1224-
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1225-
- (n + 1)/tmp_num_procs_y) &
1219+
if (fct_min >= abs((n + 1)/tmp_num_procs_y &
1220+
- (p + 1)/tmp_num_procs_z) &
12261221
.and. &
1227-
(n + 1)/tmp_num_procs_y &
1222+
(p + 1)/tmp_num_procs_z &
12281223
>= &
12291224
num_stcls_min*recon_order) then
12301225
1231-
num_procs_x = i
1232-
num_procs_y = num_procs/i
1233-
fct_min = abs((m + 1)/tmp_num_procs_x &
1234-
- (n + 1)/tmp_num_procs_y)
1226+
num_procs_y = i
1227+
num_procs_z = num_procs/i
1228+
fct_min = abs((n + 1)/tmp_num_procs_y &
1229+
- (p + 1)/tmp_num_procs_z)
12351230
ierr = 0
12361231
12371232
end if
12381233
12391234
end if
12401235
12411236
end do
1242-
12431237
else
12441238
1245-
! Initial estimate of optimal processor topology
1246-
num_procs_x = 1
1247-
num_procs_y = 1
1248-
num_procs_z = num_procs
1249-
ierr = -1
1239+
if (cyl_coord .and. p > 0) then
1240+
! Implement pencil processor blocking if using cylindrical coordinates so
1241+
! that all cells in azimuthal direction are stored on a single processor.
1242+
! This is necessary for efficient application of Fourier filter near axis.
12501243
1251-
! Benchmarking the quality of this initial guess
1252-
tmp_num_procs_x = num_procs_x
1253-
tmp_num_procs_y = num_procs_y
1254-
tmp_num_procs_z = num_procs_z
1255-
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1256-
- (n + 1)/tmp_num_procs_y) &
1257-
+ 10._wp*abs((n + 1)/tmp_num_procs_y &
1258-
- (p + 1)/tmp_num_procs_z)
1244+
! Initial values of the processor factorization optimization
1245+
num_procs_x = 1
1246+
num_procs_y = num_procs
1247+
num_procs_z = 1
1248+
ierr = -1
12591249
1260-
! Optimization of the initial processor topology
1261-
do i = 1, num_procs
1250+
! Computing minimization variable for these initial values
1251+
tmp_num_procs_x = num_procs_x
1252+
tmp_num_procs_y = num_procs_y
1253+
tmp_num_procs_z = num_procs_z
1254+
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1255+
- (n + 1)/tmp_num_procs_y)
12621256
1263-
if (mod(num_procs, i) == 0 &
1264-
.and. &
1265-
(m + 1)/i >= num_stcls_min*recon_order) then
1257+
! Searching for optimal computational domain distribution
1258+
do i = 1, num_procs
12661259
1267-
do j = 1, num_procs/i
1260+
if (mod(num_procs, i) == 0 &
1261+
.and. &
1262+
(m + 1)/i >= num_stcls_min*recon_order) then
12681263
1269-
if (mod(num_procs/i, j) == 0 &
1264+
tmp_num_procs_x = i
1265+
tmp_num_procs_y = num_procs/i
1266+
1267+
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1268+
- (n + 1)/tmp_num_procs_y) &
12701269
.and. &
1271-
(n + 1)/j >= num_stcls_min*recon_order) then
1270+
(n + 1)/tmp_num_procs_y &
1271+
>= &
1272+
num_stcls_min*recon_order) then
12721273
1273-
tmp_num_procs_x = i
1274-
tmp_num_procs_y = j
1275-
tmp_num_procs_z = num_procs/(i*j)
1274+
num_procs_x = i
1275+
num_procs_y = num_procs/i
1276+
fct_min = abs((m + 1)/tmp_num_procs_x &
1277+
- (n + 1)/tmp_num_procs_y)
1278+
ierr = 0
12761279
1277-
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1278-
- (n + 1)/tmp_num_procs_y) &
1279-
+ abs((n + 1)/tmp_num_procs_y &
1280-
- (p + 1)/tmp_num_procs_z) &
1280+
end if
1281+
1282+
end if
1283+
1284+
end do
1285+
1286+
else
1287+
1288+
! Initial estimate of optimal processor topology
1289+
num_procs_x = 1
1290+
num_procs_y = 1
1291+
num_procs_z = num_procs
1292+
ierr = -1
1293+
1294+
! Benchmarking the quality of this initial guess
1295+
tmp_num_procs_x = num_procs_x
1296+
tmp_num_procs_y = num_procs_y
1297+
tmp_num_procs_z = num_procs_z
1298+
fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1299+
- (n + 1)/tmp_num_procs_y) &
1300+
+ 10._wp*abs((n + 1)/tmp_num_procs_y &
1301+
- (p + 1)/tmp_num_procs_z)
1302+
1303+
! Optimization of the initial processor topology
1304+
do i = 1, num_procs
1305+
1306+
if (mod(num_procs, i) == 0 &
1307+
.and. &
1308+
(m + 1)/i >= num_stcls_min*recon_order) then
1309+
1310+
do j = 1, num_procs/i
1311+
1312+
if (mod(num_procs/i, j) == 0 &
12811313
.and. &
1282-
(p + 1)/tmp_num_procs_z &
1283-
>= &
1284-
num_stcls_min*recon_order) &
1285-
then
1286-
1287-
num_procs_x = i
1288-
num_procs_y = j
1289-
num_procs_z = num_procs/(i*j)
1290-
fct_min = abs((m + 1)/tmp_num_procs_x &
1291-
- (n + 1)/tmp_num_procs_y) &
1292-
+ abs((n + 1)/tmp_num_procs_y &
1293-
- (p + 1)/tmp_num_procs_z)
1294-
ierr = 0
1314+
(n + 1)/j >= num_stcls_min*recon_order) then
1315+
1316+
tmp_num_procs_x = i
1317+
tmp_num_procs_y = j
1318+
tmp_num_procs_z = num_procs/(i*j)
1319+
1320+
if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1321+
- (n + 1)/tmp_num_procs_y) &
1322+
+ abs((n + 1)/tmp_num_procs_y &
1323+
- (p + 1)/tmp_num_procs_z) &
1324+
.and. &
1325+
(p + 1)/tmp_num_procs_z &
1326+
>= &
1327+
num_stcls_min*recon_order) &
1328+
then
1329+
1330+
num_procs_x = i
1331+
num_procs_y = j
1332+
num_procs_z = num_procs/(i*j)
1333+
fct_min = abs((m + 1)/tmp_num_procs_x &
1334+
- (n + 1)/tmp_num_procs_y) &
1335+
+ abs((n + 1)/tmp_num_procs_y &
1336+
- (p + 1)/tmp_num_procs_z)
1337+
ierr = 0
12951338
1296-
end if
1339+
end if
12971340
1298-
end if
1341+
end if
12991342
1300-
end do
1343+
end do
13011344
1302-
end if
1345+
end if
13031346
1304-
end do
1347+
end do
13051348
1349+
end if
13061350
end if
13071351
13081352
! Verifying that a valid decomposition of the computational

src/post_process/m_checker.fpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ module m_checker
1717

1818
implicit none
1919

20-
private; public :: s_check_inputs
20+
private; public :: s_check_inputs, s_check_inputs_fft
2121

2222
contains
2323

@@ -111,6 +111,22 @@ contains
111111
@:PROHIBIT(any(omega_wrt) .and. fd_order == dflt_int, "fd_order must be set for omega_wrt")
112112
end subroutine s_check_inputs_vorticity
113113

114+
!> Checks constraints on fft_wrt
!! Each condition below is and-ed with fft_wrt, so no check in this routine
!! can trigger while fft_wrt is false.
!! NOTE(review): @:PROHIBIT is a fypp macro defined outside this view;
!! presumably it aborts with the given message when its condition is true - confirm.
115+
impure subroutine s_check_inputs_fft
116+
! Quotients (n_glb + 1)/(n + 1) and (p_glb + 1)/(p + 1), computed further down.
integer :: num_procs_y, num_procs_z
117+
118+
! Rejects the case where n or p is zero.
@:PROHIBIT(fft_wrt .and. (n == 0 .or. p == 0), "FFT WRT only in 3D")
119+
@:PROHIBIT(fft_wrt .and. cyl_coord, "FFT WRT incompatible with cylindrical coordinates")
120+
! MOD(x + 1, 2) == 1 holds when x + 1 is odd, so this rejects an odd value of
! m_glb + 1, n_glb + 1 or p_glb + 1.
@:PROHIBIT(fft_wrt .and. (MOD(m_glb+1,2) == 1 .or. MOD(n_glb+1,2) == 1 .or. MOD(p_glb+1,2) == 1), "FFT WRT requires global dimensions divisible by 2")
121+
! Requires n + 1 to divide n_glb + 1 exactly (the message words this in terms
! of num_procs_y, which is only computed below).
@:PROHIBIT(fft_wrt .and. MOD(n_glb+1,n+1) /= 0, "FFT WRT requires n_glb to be divisible by num_procs_y")
122+
! Same requirement for p + 1 and p_glb + 1.
@:PROHIBIT(fft_wrt .and. MOD(p_glb+1,p+1) /= 0, "FFT WRT requires p_glb to be divisible by num_procs_z")
123+
! Integer division; the two checks above test that these divisions leave no
! remainder when fft_wrt is set.
! NOTE(review): presumably n/p are per-rank extents and n_glb/p_glb global
! extents, making these the rank counts in y and z - confirm.
num_procs_y = (n_glb + 1)/(n + 1)
124+
num_procs_z = (p_glb + 1)/(p + 1)
125+
! The conditions test m_glb + 1 and n_glb + 1, not m_glb and n_glb as the
! messages are worded.
! NOTE(review): these look like the requirements of a transposed data layout
! for the FFT - confirm against the FFT implementation.
@:PROHIBIT(fft_wrt .and. MOD(m_glb+1,num_procs_y) /= 0, "FFT WRT requires m_glb to be divisible by num_procs_y")
126+
@:PROHIBIT(fft_wrt .and. MOD(n_glb+1,num_procs_z) /= 0, "FFT WRT requires n_glb to be divisible by num_procs_z")
127+
! Rejects any of the six boundary-condition codes that is below -1; codes of
! -1 or greater pass this check.
! NOTE(review): presumably -1 is the periodic BC code - confirm.
@:PROHIBIT(fft_wrt .and. (bc_x%beg < -1 .or. bc_y%beg < -1 .or. bc_z%beg < -1 .or. bc_x%end < -1 .or. bc_y%end < -1 .or. bc_z%end < -1), "FFT WRT requires periodic BCs")
128+
end subroutine s_check_inputs_fft
129+
114130
!> Checks constraints on Q-criterion parameters
115131
impure subroutine s_check_inputs_qm
116132
@:PROHIBIT(n == 0 .and. qm_wrt)

src/post_process/m_global_parameters.fpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ module m_global_parameters
240240
integer :: flux_lim
241241
logical, dimension(3) :: flux_wrt
242242
logical :: E_wrt
243+
logical :: fft_wrt
243244
logical :: pres_wrt
244245
logical, dimension(num_fluids_max) :: alpha_wrt
245246
logical :: gamma_wrt
@@ -441,6 +442,7 @@ contains
441442
parallel_io = .false.
442443
file_per_process = .false.
443444
E_wrt = .false.
445+
fft_wrt = .false.
444446
pres_wrt = .false.
445447
alpha_wrt = .false.
446448
gamma_wrt = .false.

src/post_process/m_mpi_proxy.fpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ contains
105105
& 'adv_n', 'ib', 'cfl_adap_dt', 'cfl_const_dt', 'cfl_dt', &
106106
& 'surface_tension', 'hyperelasticity', 'bubbles_lagrange', &
107107
& 'output_partial_domain', 'relativity', 'cont_damage', 'bc_io', &
108-
& 'down_sample' ]
108+
& 'down_sample','fft_wrt' ]
109109
call MPI_BCAST(${VAR}$, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
110110
#:endfor
111111

0 commit comments

Comments
 (0)