Skip to content

Commit 951c3b0

Browse files
authored
Merge pull request #23 from libAtoms/snapshot_wrong_step_sizes
make sure all configs written in snapshot have same, latest, step size
2 parents ded1e39 + da5bc91 commit 951c3b0

File tree

4 files changed

+33
-12
lines changed

4 files changed

+33
-12
lines changed

.github/workflows/pytests.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@ jobs:
1414
runs-on: ubuntu-latest
1515
strategy:
1616
matrix:
17-
python-version: [ 3.8 ]
17+
python-version: [ 3.9 ]
1818
max-parallel: 5
1919
env:
20-
coverage-on-version: 3.8
20+
coverage-on-version: 3.9
2121
use-mpi: True
2222

2323
steps:
@@ -39,7 +39,7 @@ jobs:
3939
run: conda install pip
4040

4141
- name: Install pytest requirements from pip (specific pymatnext dependencies will be automatically installed when it is)
42-
run: pip install wheel setuptools ruff pytest pytest-cov
42+
run: pip install wheel setuptools ruff pytest pytest-cov pytest-timeout
4343

4444
- name: Install latest ASE from gitlab
4545
run: pip install git+https://gitlab.com/ase/ase.git
@@ -72,7 +72,7 @@ jobs:
7272
- name: Test with pytest - coverage
7373
if: env.coverage-on-version == matrix.python-version
7474
run: |
75-
pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -rxXs
75+
pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -s -rxXs
7676
7777
# # DEBUGGING
7878
# - name: Setup tmate session
@@ -84,13 +84,13 @@ jobs:
8484
if: ${{ env.use-mpi && env.coverage-on-version != matrix.python-version}}
8585
run: |
8686
# envvar and test run - No coverage
87-
mpirun -n 2 pytest --with-mpi -k mpi
87+
mpirun -n 2 pytest --with-mpi -k mpi -rxXs
8888
8989
- name: MPI tests -- coverage
9090
if: ${{ env.use-mpi && env.coverage-on-version == matrix.python-version}}
9191
run: |
9292
# envvar and coverage Appended to the previous
93-
mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --with-mpi -k mpi --cov-append
93+
mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --cov-append --with-mpi -k mpi -s -rxXs
9494
9595
- name: 'Upload Coverage Data'
9696
uses: actions/upload-artifact@v4

pymatnext/ns.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def __init__(self, params_ns, comm, MPI, random_seed, params_configs, output_fil
7373
self.max_val = None
7474

7575
self.local_configs = []
76+
self.extra_config = False
7677

7778
old_state_files = NS._old_state_files(output_filename_prefix)
7879
if len(old_state_files) > 0:
@@ -202,9 +203,14 @@ def init_configs(self, params_configs, configs_file=None, extra=False):
202203
self.local_configs = []
203204
if self.comm.rank == 0:
204205
for config_i, new_config in enumerate(new_configs_generator()):
206+
if config_i == 0:
207+
first_config = new_config
205208
if config_i >= self.n_configs_global:
206209
raise RuntimeError(f"Got too many configs (expected {self.n_configs_global}) from new config generator {new_configs_generator}")
207210

211+
# Check that all step sizes are the same. Maybe instead we should just copy from first?
212+
assert new_config.step_size == first_config.step_size, f"Mismatched step size for config {config_i} {new_config.step_size} != 0 {first_config.step_size}"
213+
208214
target_rank = config_i // self.max_n_configs_local
209215
if target_rank == self.comm.rank:
210216
self.local_configs.append(new_config)
@@ -352,6 +358,13 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
352358
print("step_size_tune initial", name, "size", size, "max", max_size, "freq", freq)
353359
first_iter = False
354360

361+
# It looks like the following should always give the same values, hence exit
362+
# condition, on all MPI tasks, but this is not guaranteed and can lead to deadlocks
363+
# in the allreduce. The reason is that the value of done_i in the loop depends
364+
# on the value returned from _tune_from_accept_rate, which depends on the previous
365+
# step size, and if those are inconsistent between MPI tasks (as in
366+
# https://github.com/libAtoms/pymatnext/issues/20), a deadlock may occur.
367+
# Only fix is to make sure this doesn't happen (https://github.com/libAtoms/pymatnext/pull/23)
355368
done = []
356369
for param_i in range(n_params):
357370
if accept_freq[param_i][0] > 0:
@@ -369,6 +382,9 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
369382
new_step_size = {k: v * m for k, v, m in zip(step_size_names, step_size, max_step_size)}
370383
for ns_config in self.local_configs:
371384
ns_config.step_size = new_step_size
385+
# make sure that config used as buffer also has correct step_size
386+
if self.extra_config:
387+
self.extra_config.step_size = new_step_size
372388

373389
# if self.comm.rank == 0:
374390
# print("step_size_tune done", list(zip(done, accept_freq, step_size)))

tests/assets/do_Morse_ASE/params.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,15 @@
33
output_filename_prefix = "Morse_ASE"
44
random_seed = 5
55

6-
max_iter = 100
6+
max_iter = 110
77

88
stdout_report_interval_s = 10
99

1010
snapshot_interval = 50
1111

1212
[global.step_size_tune]
1313

14-
interval = 1000
14+
interval = 50
1515

1616
[ns]
1717

tests/test_sample.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,13 @@ def test_Morse_ASE_mpi(mpi_tmp_path, monkeypatch):
4646
do_Morse_ASE(mpi_tmp_path, monkeypatch, using_mpi=True)
4747

4848
@pytest.mark.mpi
49+
# github CI working test takes ~90 s
50+
@pytest.mark.timeout(120, method="thread")
4951
def test_Morse_ASE_restart_mpi(mpi_tmp_path, monkeypatch):
52+
import time
53+
t0 = time.time()
5054
do_Morse_ASE_restart(mpi_tmp_path, monkeypatch, using_mpi=True)
55+
print("BOB time", time.time() - t0)
5156

5257
@pytest.mark.mpi
5358
def test_EAM_LAMMPS_mpi(mpi_tmp_path, monkeypatch):
@@ -151,13 +156,13 @@ def do_Morse_ASE(tmp_path, monkeypatch, using_mpi, max_iter=None):
151156
# files exist
152157
assert (tmp_path / 'Morse_ASE.test.NS_samples').is_file()
153158
assert len(list(tmp_path.glob('Morse_ASE.test.traj.*xyz'))) == 1
154-
assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == max_iter_use // traj_interval
159+
assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == int(np.ceil(max_iter_use / traj_interval))
155160

156-
# from test run 12/8/2022
161+
# from test run 6/13/2025, when max iter was extended to 110 to catch deadlock in old buggy snapshot step_size writing
157162
if using_mpi:
158-
samples_fields_ref = np.asarray([9.90000000e+01, 8.04691943e+00, 1.28925862e+04, 1.60000000e+01])
163+
samples_fields_ref = np.asarray([1.09000000e+02, 8.02719236e+00, 1.28609799e+04, 1.60000000e+01])
159164
else:
160-
samples_fields_ref = np.asarray([9.90000000e+01, 8.08011910e+00, 1.29457779e+04, 1.60000000e+01])
165+
samples_fields_ref = np.asarray([1.09000000e+02, 8.06422068e+00, 1.29203058e+04, 1.60000000e+01])
161166

162167
with open(tmp_path / 'Morse_ASE.test.NS_samples') as fin:
163168
for l in fin:

0 commit comments

Comments
 (0)