Skip to content

Commit da5bc91

Browse files
committed
fix inconsistent step sizes in snapshot configs
Make sure all configs written in snapshot have same, latest, step size. And add check for consistent step sizes when distributing initial configurations. Add pytest that times out when original bug is present.
1 parent ded1e39 commit da5bc91

File tree

4 files changed

+33
-12
lines changed

4 files changed

+33
-12
lines changed

.github/workflows/pytests.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ jobs:
1414
runs-on: ubuntu-latest
1515
strategy:
1616
matrix:
17-
python-version: [ 3.8 ]
17+
python-version: [ 3.9 ]
1818
max-parallel: 5
1919
env:
20-
coverage-on-version: 3.8
20+
coverage-on-version: 3.9
2121
use-mpi: True
2222

2323
steps:
@@ -39,7 +39,7 @@ jobs:
3939
run: conda install pip
4040

4141
- name: Install pytest requirements from pip (specific pymatnext dependencies will be automatically installed when it is)
42-
run: pip install wheel setuptools ruff pytest pytest-cov
42+
run: pip install wheel setuptools ruff pytest pytest-cov pytest-timeout
4343

4444
- name: Install latest ASE from gitlab
4545
run: pip install git+https://gitlab.com/ase/ase.git
@@ -72,7 +72,7 @@ jobs:
7272
- name: Test with pytest - coverage
7373
if: env.coverage-on-version == matrix.python-version
7474
run: |
75-
pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -rxXs
75+
pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -s -rxXs
7676
7777
# # DEBUGGING
7878
# - name: Setup tmate session
@@ -84,13 +84,13 @@ jobs:
8484
if: ${{ env.use-mpi && env.coverage-on-version != matrix.python-version}}
8585
run: |
8686
# envvar and test run - No coverage
87-
mpirun -n 2 pytest --with-mpi -k mpi
87+
mpirun -n 2 pytest --with-mpi -k mpi -rxXs
8888
8989
- name: MPI tests -- coverage
9090
if: ${{ env.use-mpi && env.coverage-on-version == matrix.python-version}}
9191
run: |
9292
# envvar and coverage Appended to the previous
93-
mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --with-mpi -k mpi --cov-append
93+
mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --cov-append --with-mpi -k mpi -s -rxXs
9494
9595
- name: 'Upload Coverage Data'
9696
uses: actions/upload-artifact@v4

pymatnext/ns.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def __init__(self, params_ns, comm, MPI, random_seed, params_configs, output_fil
7373
self.max_val = None
7474

7575
self.local_configs = []
76+
self.extra_config = False
7677

7778
old_state_files = NS._old_state_files(output_filename_prefix)
7879
if len(old_state_files) > 0:
@@ -202,9 +203,14 @@ def init_configs(self, params_configs, configs_file=None, extra=False):
202203
self.local_configs = []
203204
if self.comm.rank == 0:
204205
for config_i, new_config in enumerate(new_configs_generator()):
206+
if config_i == 0:
207+
first_config = new_config
205208
if config_i >= self.n_configs_global:
206209
raise RuntimeError(f"Got too many configs (expected {self.n_configs_global}) from new config generator {new_configs_generator}")
207210

211+
# Check that all step sizes are the same. Maybe instead we should just copy from first?
212+
assert new_config.step_size == first_config.step_size, f"Mismatched step size for config {config_i} {new_config.step_size} != 0 {first_config.step_size}"
213+
208214
target_rank = config_i // self.max_n_configs_local
209215
if target_rank == self.comm.rank:
210216
self.local_configs.append(new_config)
@@ -352,6 +358,13 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
352358
print("step_size_tune initial", name, "size", size, "max", max_size, "freq", freq)
353359
first_iter = False
354360

361+
# It looks like the following should always give the same values, hence exit
362+
# condition, on all MPI tasks, but this is not guaranteed and can lead to deadlocks
363+
# in the allreduce. The reason is that the value of done_i in the loop depends
364+
# on the value returned from _tune_from_accept_rate, which depends on the previous
365+
# step size, and if those are inconsistent between MPI tasks (as in
366+
# https://github.com/libAtoms/pymatnext/issues/20), a deadlock may occur.
367+
# Only fix is to make sure this doesn't happen (https://github.com/libAtoms/pymatnext/pull/23)
355368
done = []
356369
for param_i in range(n_params):
357370
if accept_freq[param_i][0] > 0:
@@ -369,6 +382,9 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
369382
new_step_size = {k: v * m for k, v, m in zip(step_size_names, step_size, max_step_size)}
370383
for ns_config in self.local_configs:
371384
ns_config.step_size = new_step_size
385+
# make sure that config used as buffer also has correct step_size
386+
if self.extra_config:
387+
self.extra_config.step_size = new_step_size
372388

373389
# if self.comm.rank == 0:
374390
# print("step_size_tune done", list(zip(done, accept_freq, step_size)))

tests/assets/do_Morse_ASE/params.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
output_filename_prefix = "Morse_ASE"
44
random_seed = 5
55

6-
max_iter = 100
6+
max_iter = 110
77

88
stdout_report_interval_s = 10
99

1010
snapshot_interval = 50
1111

1212
[global.step_size_tune]
1313

14-
interval = 1000
14+
interval = 50
1515

1616
[ns]
1717

tests/test_sample.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,13 @@ def test_Morse_ASE_mpi(mpi_tmp_path, monkeypatch):
4646
do_Morse_ASE(mpi_tmp_path, monkeypatch, using_mpi=True)
4747

4848
@pytest.mark.mpi
49+
# github CI working test takes ~90 s
50+
@pytest.mark.timeout(120, method="thread")
4951
def test_Morse_ASE_restart_mpi(mpi_tmp_path, monkeypatch):
52+
import time
53+
t0 = time.time()
5054
do_Morse_ASE_restart(mpi_tmp_path, monkeypatch, using_mpi=True)
55+
print("BOB time", time.time() - t0)
5156

5257
@pytest.mark.mpi
5358
def test_EAM_LAMMPS_mpi(mpi_tmp_path, monkeypatch):
@@ -151,13 +156,13 @@ def do_Morse_ASE(tmp_path, monkeypatch, using_mpi, max_iter=None):
151156
# files exist
152157
assert (tmp_path / 'Morse_ASE.test.NS_samples').is_file()
153158
assert len(list(tmp_path.glob('Morse_ASE.test.traj.*xyz'))) == 1
154-
assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == max_iter_use // traj_interval
159+
assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == int(np.ceil(max_iter_use / traj_interval))
155160

156-
# from test run 12/8/2022
161+
# from test run 6/13/2025, when max iter was extended to 110 to catch deadlock in old buggy snapshot step_size writing
157162
if using_mpi:
158-
samples_fields_ref = np.asarray([9.90000000e+01, 8.04691943e+00, 1.28925862e+04, 1.60000000e+01])
163+
samples_fields_ref = np.asarray([1.09000000e+02, 8.02719236e+00, 1.28609799e+04, 1.60000000e+01])
159164
else:
160-
samples_fields_ref = np.asarray([9.90000000e+01, 8.08011910e+00, 1.29457779e+04, 1.60000000e+01])
165+
samples_fields_ref = np.asarray([1.09000000e+02, 8.06422068e+00, 1.29203058e+04, 1.60000000e+01])
161166

162167
with open(tmp_path / 'Morse_ASE.test.NS_samples') as fin:
163168
for l in fin:

0 commit comments

Comments
 (0)