@@ -1192,117 +1192,161 @@ contains
11921192 if (n > 0) then
11931193
11941194 if (p > 0) then
1195+ if (fft_wrt) then
11951196
1196- if (cyl_coord .and. p > 0) then
1197- ! Implement pencil processor blocking if using cylindrical coordinates so
1198- ! that all cells in azimuthal direction are stored on a single processor.
1199- ! This is necessary for efficient application of Fourier filter near axis.
1200-
1201- ! Initial values of the processor factorization optimization
1197+ ! Initial estimate of optimal processor topology
12021198 num_procs_x = 1
1203- num_procs_y = num_procs
1204- num_procs_z = 1
1199+ num_procs_y = 1
1200+ num_procs_z = num_procs
12051201 ierr = -1
12061202
1207- ! Computing minimization variable for these initial values
1208- tmp_num_procs_x = num_procs_x
1203+ ! Benchmarking the quality of this initial guess
12091204 tmp_num_procs_y = num_procs_y
12101205 tmp_num_procs_z = num_procs_z
1211- fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1212- - (n + 1)/tmp_num_procs_y )
1206+ fct_min = 10._wp*abs((n + 1)/tmp_num_procs_y &
1207+ - (p + 1)/tmp_num_procs_z )
12131208
1214- ! Searching for optimal computational domain distribution
1209+ ! Optimization of the initial processor topology
12151210 do i = 1, num_procs
12161211
12171212 if (mod(num_procs, i) == 0 &
12181213 .and. &
1219- (m + 1)/i >= num_stcls_min*recon_order) then
1214+ (n + 1)/i >= num_stcls_min*recon_order) then
12201215
1221- tmp_num_procs_x = i
1222- tmp_num_procs_y = num_procs/i
1216+ tmp_num_procs_y = i
1217+ tmp_num_procs_z = num_procs/i
12231218
1224- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1225- - (n + 1)/tmp_num_procs_y ) &
1219+ if (fct_min >= abs((n + 1)/tmp_num_procs_y &
1220+ - (p + 1)/tmp_num_procs_z ) &
12261221 .and. &
1227- (n + 1)/tmp_num_procs_y &
1222+ (p + 1)/tmp_num_procs_z &
12281223 >= &
12291224 num_stcls_min*recon_order) then
12301225
1231- num_procs_x = i
1232- num_procs_y = num_procs/i
1233- fct_min = abs((m + 1)/tmp_num_procs_x &
1234- - (n + 1)/tmp_num_procs_y )
1226+ num_procs_y = i
1227+ num_procs_z = num_procs/i
1228+ fct_min = abs((n + 1)/tmp_num_procs_y &
1229+ - (p + 1)/tmp_num_procs_z )
12351230 ierr = 0
12361231
12371232 end if
12381233
12391234 end if
12401235
12411236 end do
1242-
12431237 else
12441238
1245- ! Initial estimate of optimal processor topology
1246- num_procs_x = 1
1247- num_procs_y = 1
1248- num_procs_z = num_procs
1249- ierr = -1
1239+ if (cyl_coord .and. p > 0) then
1240+ ! Implement pencil processor blocking if using cylindrical coordinates so
1241+ ! that all cells in azimuthal direction are stored on a single processor.
1242+ ! This is necessary for efficient application of Fourier filter near axis.
12501243
1251- ! Benchmarking the quality of this initial guess
1252- tmp_num_procs_x = num_procs_x
1253- tmp_num_procs_y = num_procs_y
1254- tmp_num_procs_z = num_procs_z
1255- fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1256- - (n + 1)/tmp_num_procs_y) &
1257- + 10._wp*abs((n + 1)/tmp_num_procs_y &
1258- - (p + 1)/tmp_num_procs_z)
1244+ ! Initial values of the processor factorization optimization
1245+ num_procs_x = 1
1246+ num_procs_y = num_procs
1247+ num_procs_z = 1
1248+ ierr = -1
12591249
1260- ! Optimization of the initial processor topology
1261- do i = 1, num_procs
1250+ ! Computing minimization variable for these initial values
1251+ tmp_num_procs_x = num_procs_x
1252+ tmp_num_procs_y = num_procs_y
1253+ tmp_num_procs_z = num_procs_z
1254+ fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1255+ - (n + 1)/tmp_num_procs_y)
12621256
1263- if (mod(num_procs, i) == 0 &
1264- .and. &
1265- (m + 1)/i >= num_stcls_min*recon_order) then
1257+ ! Searching for optimal computational domain distribution
1258+ do i = 1, num_procs
12661259
1267- do j = 1, num_procs/i
1260+ if (mod(num_procs, i) == 0 &
1261+ .and. &
1262+ (m + 1)/i >= num_stcls_min*recon_order) then
12681263
1269- if (mod(num_procs/i, j) == 0 &
1264+ tmp_num_procs_x = i
1265+ tmp_num_procs_y = num_procs/i
1266+
1267+ if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1268+ - (n + 1)/tmp_num_procs_y) &
12701269 .and. &
1271- (n + 1)/j >= num_stcls_min*recon_order) then
1270+ (n + 1)/tmp_num_procs_y &
1271+ >= &
1272+ num_stcls_min*recon_order) then
12721273
1273- tmp_num_procs_x = i
1274- tmp_num_procs_y = j
1275- tmp_num_procs_z = num_procs/(i*j)
1274+ num_procs_x = i
1275+ num_procs_y = num_procs/i
1276+ fct_min = abs((m + 1)/tmp_num_procs_x &
1277+ - (n + 1)/tmp_num_procs_y)
1278+ ierr = 0
12761279
1277- if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1278- - (n + 1)/tmp_num_procs_y) &
1279- + abs((n + 1)/tmp_num_procs_y &
1280- - (p + 1)/tmp_num_procs_z) &
1280+ end if
1281+
1282+ end if
1283+
1284+ end do
1285+
1286+ else
1287+
1288+ ! Initial estimate of optimal processor topology
1289+ num_procs_x = 1
1290+ num_procs_y = 1
1291+ num_procs_z = num_procs
1292+ ierr = -1
1293+
1294+ ! Benchmarking the quality of this initial guess
1295+ tmp_num_procs_x = num_procs_x
1296+ tmp_num_procs_y = num_procs_y
1297+ tmp_num_procs_z = num_procs_z
1298+ fct_min = 10._wp*abs((m + 1)/tmp_num_procs_x &
1299+ - (n + 1)/tmp_num_procs_y) &
1300+ + 10._wp*abs((n + 1)/tmp_num_procs_y &
1301+ - (p + 1)/tmp_num_procs_z)
1302+
1303+ ! Optimization of the initial processor topology
1304+ do i = 1, num_procs
1305+
1306+ if (mod(num_procs, i) == 0 &
1307+ .and. &
1308+ (m + 1)/i >= num_stcls_min*recon_order) then
1309+
1310+ do j = 1, num_procs/i
1311+
1312+ if (mod(num_procs/i, j) == 0 &
12811313 .and. &
1282- (p + 1)/tmp_num_procs_z &
1283- >= &
1284- num_stcls_min*recon_order) &
1285- then
1286-
1287- num_procs_x = i
1288- num_procs_y = j
1289- num_procs_z = num_procs/(i*j)
1290- fct_min = abs((m + 1)/tmp_num_procs_x &
1291- - (n + 1)/tmp_num_procs_y) &
1292- + abs((n + 1)/tmp_num_procs_y &
1293- - (p + 1)/tmp_num_procs_z)
1294- ierr = 0
1314+ (n + 1)/j >= num_stcls_min*recon_order) then
1315+
1316+ tmp_num_procs_x = i
1317+ tmp_num_procs_y = j
1318+ tmp_num_procs_z = num_procs/(i*j)
1319+
1320+ if (fct_min >= abs((m + 1)/tmp_num_procs_x &
1321+ - (n + 1)/tmp_num_procs_y) &
1322+ + abs((n + 1)/tmp_num_procs_y &
1323+ - (p + 1)/tmp_num_procs_z) &
1324+ .and. &
1325+ (p + 1)/tmp_num_procs_z &
1326+ >= &
1327+ num_stcls_min*recon_order) &
1328+ then
1329+
1330+ num_procs_x = i
1331+ num_procs_y = j
1332+ num_procs_z = num_procs/(i*j)
1333+ fct_min = abs((m + 1)/tmp_num_procs_x &
1334+ - (n + 1)/tmp_num_procs_y) &
1335+ + abs((n + 1)/tmp_num_procs_y &
1336+ - (p + 1)/tmp_num_procs_z)
1337+ ierr = 0
12951338
1296- end if
1339+ end if
12971340
1298- end if
1341+ end if
12991342
1300- end do
1343+ end do
13011344
1302- end if
1345+ end if
13031346
1304- end do
1347+ end do
13051348
1349+ end if
13061350 end if
13071351
13081352 ! Verifying that a valid decomposition of the computational
0 commit comments