Commit f596397

fix(par): avoid integer overflow of MPI message size (#2280)
* add autotest for coupling of overlapping models (using idomain)
* cleanup: initialize pointers
* use MPI_Type_size_x and extended integer kind to avoid overflow for truly large models
* add to release notes
* add comments
* fix develop.toml
1 parent 966c51a commit f596397
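
The overflow becomes possible once a single exchange message exceeds 2**31 - 1 bytes (about 2.1 GiB), because MPI_Type_size reports the size through a default (32-bit) integer. A rough back-of-the-envelope sketch of that limit follows; the number of double-precision values exchanged per node is an assumed figure for illustration, not the actual exchange layout:

# Back-of-the-envelope check of the 32-bit message-size limit.
# The payload per node is an assumed figure for illustration only.
nodes = 20_000_000        # subdomain size mentioned in the release note (~20M nodes)
doubles_per_node = 15     # assumption: double-precision values exchanged per node
bytes_per_double = 8

msg_bytes = nodes * doubles_per_node * bytes_per_double
print(f"message size: {msg_bytes / 1e9:.1f} GB")            # 2.4 GB
print(f"exceeds 2**31 - 1 bytes: {msg_bytes > 2**31 - 1}")  # True

Calling MPI_Type_size_x with an integer(kind=MPI_COUNT_KIND) result, as done in the MpiRouter.f90 change below, avoids this truncation.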

File tree

5 files changed, +268 -8 lines changed

autotest/test_gwf_ifmod_idomain02.py

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
"""
General test for the interface model approach.
It compares the skewed decomposition of the domain
to the trivial analytical result (constant gradient).
In this case with the use of idomain to deactivate half
of the sub-models. Note that the cells with idomain==0
overlap with the active cells of the other model.

      'leftmodel'        'rightmodel'

   1 1 1 0 0 0 0        1 1 1 1 1 1 1
   1 1 1 1 0 0 0        0 1 1 1 1 1 1
   1 1 1 1 1 0 0    +   0 0 1 1 1 1 1
   1 1 1 1 1 1 0        0 0 0 1 1 1 1
   1 1 1 1 1 1 1        0 0 0 0 1 1 1

We assert equality on the head values and check budgets.
"""

import os

import flopy
import numpy as np
import pytest
from flopy.mf6.utils import Mf6Splitter
from framework import TestFramework

cases = ["ifmod_skewed"]

# some global convenience...:
# model name
mname = "skewed"

# solver criterion
hclose_check = 1e-9
max_inner_it = 300
nper = 1

# model spatial discretization
nlay = 1
ncol = 10
nrow = 5

# idomain
idomain = np.ones((nlay, nrow, ncol))

delr = 1.0
delc = 1.0
area = delr * delc

# top/bot of the aquifer
tops = [1.0, 0.0]

# hydraulic conductivity
hk = 10.0

# boundary stress period data
h_left = 10.0
h_right = 1.0

# initial head
h_start = 0.0

# head boundaries
lchd = [[(ilay, irow, 0), h_left] for irow in range(nrow) for ilay in range(nlay)]
rchd = [
    [(ilay, irow, ncol - 1), h_right] for irow in range(nrow) for ilay in range(nlay)
]
chd = lchd + rchd

chd_spd = {0: chd}


def get_model(idx, dir):
    name = cases[idx]

    # parameters and spd
    # tdis
    tdis_rc = []
    for i in range(nper):
        tdis_rc.append((1.0, 1, 1))

    # solver data
    nouter, ninner = 100, max_inner_it
    hclose, rclose, relax = hclose_check, 1e-3, 0.97

    sim = flopy.mf6.MFSimulation(
        sim_name=name, version="mf6", exe_name="mf6", sim_ws=dir
    )

    tdis = flopy.mf6.ModflowTdis(sim, time_units="DAYS", nper=nper, perioddata=tdis_rc)

    ims = flopy.mf6.ModflowIms(
        sim,
        print_option="SUMMARY",
        outer_dvclose=hclose,
        outer_maximum=nouter,
        under_relaxation="NONE",
        inner_maximum=ninner,
        inner_dvclose=hclose,
        rcloserecord=rclose,
        linear_acceleration="CG",
        scaling_method="NONE",
        reordering_method="NONE",
        relaxation_factor=relax,
        filename="gwf.ims",
    )

    gwf = flopy.mf6.ModflowGwf(sim, modelname=mname, save_flows=True)

    dis = flopy.mf6.ModflowGwfdis(
        gwf,
        nlay=nlay,
        nrow=nrow,
        ncol=ncol,
        delr=delr,
        delc=delc,
        xorigin=0.0,
        yorigin=0.0,
        top=tops[0],
        botm=tops[1:],
        idomain=idomain,
    )

    # initial conditions
    ic = flopy.mf6.ModflowGwfic(gwf, strt=h_start)

    # node property flow
    npf = flopy.mf6.ModflowGwfnpf(
        gwf,
        save_specific_discharge=True,
        icelltype=0,
        k=hk,
    )

    # chd file
    chd = flopy.mf6.ModflowGwfchd(gwf, stress_period_data=chd_spd)

    # output control
    oc = flopy.mf6.ModflowGwfoc(
        gwf,
        head_filerecord=f"{mname}.hds",
        budget_filerecord=f"{mname}.cbc",
        headprintrecord=[("COLUMNS", 10, "WIDTH", 15, "DIGITS", 6, "GENERAL")],
        saverecord=[("HEAD", "LAST"), ("BUDGET", "LAST")],
    )

    # split the model
    splitter = Mf6Splitter(sim)
    mask = np.zeros(shape=(nrow, ncol))
    for irow in range(nrow):
        istart = irow + 3
        mask[irow, istart:] = 1
    split_sim = splitter.split_model(mask)
    split_sim.set_sim_path(dir)

    return split_sim


def build_models(idx, test):
    sim = get_model(idx, test.workspace)
    return sim, None


def check_output(idx, test):
    print("comparing heads to single model reference...")

    sim = flopy.mf6.MFSimulation.load(sim_ws=test.workspace)

    mname_left = sim.model_names[0]
    mname_right = sim.model_names[1]

    fpth = os.path.join(test.workspace, f"{mname_left}.hds")
    hds_left = flopy.utils.HeadFile(fpth).get_alldata()
    hds_left[hds_left == 1.0e30] = 0.0

    fpth = os.path.join(test.workspace, f"{mname_right}.hds")
    hds_right = flopy.utils.HeadFile(fpth).get_alldata()
    hds_right[hds_right == 1.0e30] = 0.0

    hds = np.zeros((nrow, ncol), dtype=float)
    hds[:, 0:7] = hds[:, 0:7] + hds_left[:, :]
    hds[:, 3:] = hds[:, 3:] + hds_right[:, :]

    cst_gradient = np.linspace(10.0, 1.0, ncol)
    for irow in range(nrow):
        assert hds[irow, :] == pytest.approx(cst_gradient, rel=10 * hclose_check), (
            f"Head values for row {irow} do not match analytical result. "
            f"Expected {cst_gradient}, but got {hds[irow, :]}"
        )

    # check budget error from .lst file
    for mname in [mname_left, mname_right]:
        fpth = os.path.join(test.workspace, f"{mname}.lst")
        for line in open(fpth):
            if line.lstrip().startswith("PERCENT"):
                cumul_balance_error = float(line.split()[3])
                assert abs(cumul_balance_error) < 0.00001, (
                    f"Cumulative balance error = {cumul_balance_error} for {mname}, "
                    "should equal 0.0"
                )


@pytest.mark.parametrize("idx, name", enumerate(cases))
@pytest.mark.developmode
def test_mf6model(idx, name, function_tmpdir, targets):
    test = TestFramework(
        name=name,
        workspace=function_tmpdir,
        build=lambda t: build_models(idx, t),
        check=lambda t: check_output(idx, t),
        targets=targets,
    )
    test.run()
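
The skewed decomposition above comes from the staircase mask handed to Mf6Splitter in get_model. A minimal standalone sketch, using the same nrow, ncol, and istart = irow + 3 rule as the test, shows which columns land in each submodel:

# Standalone sketch of the splitter mask built in get_model above
# (same rule: mask[irow, irow + 3:] = 1; 0 -> 'leftmodel', 1 -> 'rightmodel').
import numpy as np

nrow, ncol = 5, 10
mask = np.zeros((nrow, ncol), dtype=int)
for irow in range(nrow):
    mask[irow, irow + 3:] = 1

print(mask)
# [[0 0 0 1 1 1 1 1 1 1]
#  [0 0 0 0 1 1 1 1 1 1]
#  [0 0 0 0 0 1 1 1 1 1]
#  [0 0 0 0 0 0 1 1 1 1]
#  [0 0 0 0 0 0 0 1 1 1]]

This matches the docstring diagram: the left model keeps columns 0-6 and the right model columns 3-9, which is why check_output stitches the heads with hds[:, 0:7] and hds[:, 3:]. The overlapping cells are inactive (head 1e30, zeroed before summing) in exactly one of the two models, so the sum reconstructs the full grid.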

autotest/test_par_gwf_idomain02.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
"""
This test reuses the simulation data in test_gwf_ifmod_idomain02.py
and runs it in parallel on two CPUs.
"""

import pytest
from framework import TestFramework

cases = ["par_idomain_skewed"]


def build_models(idx, test):
    from test_gwf_ifmod_idomain02 import build_models as build

    sim, dummy = build(idx, test)
    return sim, dummy


def check_output(idx, test):
    from test_gwf_ifmod_idomain02 import check_output as check

    check(idx, test)


@pytest.mark.parallel
@pytest.mark.developmode
@pytest.mark.parametrize("idx, name", enumerate(cases))
def test_mf6model(idx, name, function_tmpdir, targets):
    test = TestFramework(
        name=name,
        workspace=function_tmpdir,
        targets=targets,
        build=lambda t: build_models(idx, t),
        check=lambda t: check_output(idx, t),
        compare=None,
        parallel=True,
        ncpus=2,
    )
    test.run()

doc/ReleaseNotes/develop.toml

Lines changed: 4 additions & 0 deletions
@@ -29,3 +29,7 @@ description = "Added interbed-compaction-pct observation to the CSUB package."
 [[items]]
 section = "fixes"
 description = "The mf6io.pdf guide for the SFE Package lists the availability of the STRMBD-COND observation type. However, the SFE Package did not actually support this observation type and if listed would cause the program to exit with error message. Functionality has been added to SFE for writing the amount of streambed conductive heat exchange to the CSV output file that contains the user-specified observations."
+
+[[items]]
+section = "fixes"
+description = "Fixed a variable overflow in the MPI communication for parallel simulations that could cause a memory exception when running a parallel simulation with truly large subdomains (~20M nodes or more)."

src/Distributed/MpiRouter.f90

Lines changed: 6 additions & 3 deletions
@@ -242,7 +242,8 @@ subroutine route_active(this, unit, stage)
     ! local
     integer(I4B) :: i
     integer(I4B) :: rnk
-    integer :: ierr, msg_size
+    integer :: ierr
+    integer(kind=MPI_COUNT_KIND) :: msg_size !< need a longer int here, msg size can be > 2^31
     logical(LGP) :: from_cache
     ! mpi handles
     integer, dimension(:), allocatable :: rcv_req
@@ -296,7 +297,8 @@ subroutine route_active(this, unit, stage)
         write (this%imon, '(4x,a,i0)') "receiving from process: ", rnk
       end if
 
-      call MPI_Type_size(body_rcv_t(i), msg_size, ierr)
+      ! call extended type size function (*_x) to avoid overflow for large submodels
+      call MPI_Type_size_x(body_rcv_t(i), msg_size, ierr)
       if (msg_size > 0) then
         call MPI_Irecv(MPI_BOTTOM, 1, body_rcv_t(i), rnk, stage, &
                        this%mpi_world%comm, rcv_req(i), ierr)
@@ -315,7 +317,8 @@ subroutine route_active(this, unit, stage)
         write (this%imon, '(4x,a,i0)') "sending to process: ", rnk
       end if
 
-      call MPI_Type_size(body_snd_t(i), msg_size, ierr)
+      ! call extended type size function (*_x) to avoid overflow for large submodels
+      call MPI_Type_size_x(body_snd_t(i), msg_size, ierr)
       if (msg_size > 0) then
         call MPI_Isend(MPI_Bottom, 1, body_snd_t(i), rnk, stage, &
                        this%mpi_world%comm, snd_req(i), ierr)

src/Distributed/VirtualBase.f90

Lines changed: 5 additions & 5 deletions
@@ -57,15 +57,15 @@ module VirtualBaseModule
   end type
 
   type, public, extends(VirtualDataType) :: VirtualIntType
-    integer(I4B), private, pointer :: intsclr
+    integer(I4B), private, pointer :: intsclr => null()
   contains
     procedure :: vm_allocate => vm_allocate_int
     procedure :: vm_deallocate => vm_deallocate_int
     procedure :: get => get_int
   end type
 
   type, public, extends(VirtualDataType) :: VirtualInt1dType
-    integer(I4B), dimension(:), pointer, contiguous :: int1d
+    integer(I4B), dimension(:), pointer, contiguous :: int1d => null()
   contains
     procedure :: vm_allocate => vm_allocate_int1d
     procedure :: vm_deallocate => vm_deallocate_int1d
@@ -74,15 +74,15 @@ module VirtualBaseModule
   end type
 
   type, public, extends(VirtualDataType) :: VirtualDblType
-    real(DP), private, pointer :: dblsclr
+    real(DP), private, pointer :: dblsclr => null()
   contains
     procedure :: vm_allocate => vm_allocate_dbl
     procedure :: vm_deallocate => vm_deallocate_dbl
     procedure :: get => get_dbl
   end type
 
   type, public, extends(VirtualDataType) :: VirtualDbl1dType
-    real(DP), dimension(:), pointer, contiguous :: dbl1d
+    real(DP), dimension(:), pointer, contiguous :: dbl1d => null()
   contains
     procedure :: vm_allocate => vm_allocate_dbl1d
     procedure :: vm_deallocate => vm_deallocate_dbl1d
@@ -91,7 +91,7 @@ module VirtualBaseModule
   end type
 
   type, public, extends(VirtualDataType) :: VirtualDbl2dType
-    real(DP), dimension(:, :), pointer, contiguous :: dbl2d
+    real(DP), dimension(:, :), pointer, contiguous :: dbl2d => null()
   contains
     procedure :: vm_allocate => vm_allocate_dbl2D
     procedure :: vm_deallocate => vm_deallocate_dbl2D