fix inconsistent step sizes in snapshot configs

bernstei · bernstei · commit da5bc91d7c41 · 2025-06-16T13:25:44.000-04:00
Make sure all configs written in snapshot have same, latest, step size.
And add check for consistent step sizes when distributing initial configurations.
Add pytest that times out when original bug is present.
diff --git a/.github/workflows/pytests.yml b/.github/workflows/pytests.yml
@@ -14,10 +14,10 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.8 ]
+        python-version: [ 3.9 ]
       max-parallel: 5
     env:
-      coverage-on-version: 3.8
+      coverage-on-version: 3.9
       use-mpi: True
 
     steps:
@@ -39,7 +39,7 @@ jobs:
         run: conda install pip
 
       - name: Install pytest requirements from pip (specific pymatnext dependencies will be automatically installed when it is)
-        run: pip install wheel setuptools ruff pytest pytest-cov
+        run: pip install wheel setuptools ruff pytest pytest-cov pytest-timeout
 
       - name: Install latest ASE from gitlab
         run: pip install git+https://gitlab.com/ase/ase.git
@@ -72,7 +72,7 @@ jobs:
       - name: Test with pytest - coverage
         if: env.coverage-on-version == matrix.python-version
         run: |
-          pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -rxXs
+          pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -s -rxXs
 
       # # DEBUGGING
       # - name: Setup tmate session
@@ -84,13 +84,13 @@ jobs:
         if: ${{ env.use-mpi && env.coverage-on-version != matrix.python-version}}
         run: |
           # envvar and test run - No coverage
-          mpirun -n 2 pytest --with-mpi -k mpi
+          mpirun -n 2 pytest --with-mpi -k mpi -rxXs
 
       - name: MPI tests -- coverage
         if: ${{ env.use-mpi && env.coverage-on-version == matrix.python-version}}
         run: |
           # envvar and coverage Appended to the previous
-          mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --with-mpi -k mpi --cov-append
+          mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --cov-append --with-mpi -k mpi -s -rxXs
 
       - name: 'Upload Coverage Data'
         uses: actions/upload-artifact@v4
diff --git a/pymatnext/ns.py b/pymatnext/ns.py
@@ -73,6 +73,7 @@ def __init__(self, params_ns, comm, MPI, random_seed, params_configs, output_fil
         self.max_val = None
 
         self.local_configs = []
+        self.extra_config = False
 
         old_state_files = NS._old_state_files(output_filename_prefix)
         if len(old_state_files) > 0:
@@ -202,9 +203,14 @@ def init_configs(self, params_configs, configs_file=None, extra=False):
         self.local_configs = []
         if self.comm.rank == 0:
             for config_i, new_config in enumerate(new_configs_generator()):
+                if config_i == 0:
+                    first_config = new_config
                 if config_i >= self.n_configs_global:
                     raise RuntimeError(f"Got too many configs (expected {self.n_configs_global}) from new config generator {new_configs_generator}")
 
+                # Check that all step sizes are the same. Maybe instead we should just copy from first?
+                assert new_config.step_size == first_config.step_size, f"Mismatched step size for config {config_i} {new_config.step_size} != 0 {first_config.step_size}"
+
                 target_rank = config_i // self.max_n_configs_local
                 if target_rank == self.comm.rank:
                     self.local_configs.append(new_config)
@@ -352,6 +358,13 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
                     print("step_size_tune initial", name, "size", size, "max", max_size, "freq", freq)
                 first_iter = False
 
+            # It looks like the following should always give the same values, hence exit
+            # condition, on all MPI tasks, but this is not guaranteed and can lead to deadlocks
+            # in the allreduce. The reason is that the value of done_i in the loop depends
+            # on the value returned from _tune_from_accept_rate, which depends on the previous
+            # step size, and if those are inconsistent between MPI tasks (as in
+            # https://github.com/libAtoms/pymatnext/issues/20), a deadlock may occur.
+            # Only fix is to make sure this doesn't happen (https://github.com/libAtoms/pymatnext/pull/23)
             done = []
             for param_i in range(n_params):
                 if accept_freq[param_i][0] > 0:
@@ -369,6 +382,9 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
             new_step_size = {k: v * m for k, v, m in zip(step_size_names, step_size, max_step_size)}
             for ns_config in self.local_configs:
                 ns_config.step_size = new_step_size
+            # make sure that config used as buffer also has correct step_size
+            if self.extra_config:
+                self.extra_config.step_size = new_step_size
 
             # if self.comm.rank == 0:
                 # print("step_size_tune done", list(zip(done, accept_freq, step_size)))
diff --git a/tests/assets/do_Morse_ASE/params.toml b/tests/assets/do_Morse_ASE/params.toml
@@ -3,15 +3,15 @@
     output_filename_prefix = "Morse_ASE"
     random_seed = 5
 
-    max_iter = 100
+    max_iter = 110
 
     stdout_report_interval_s = 10
 
     snapshot_interval = 50
 
     [global.step_size_tune]
 
-        interval = 1000
+        interval = 50
 
 [ns]
 
diff --git a/tests/test_sample.py b/tests/test_sample.py
@@ -46,8 +46,13 @@ def test_Morse_ASE_mpi(mpi_tmp_path, monkeypatch):
     do_Morse_ASE(mpi_tmp_path, monkeypatch, using_mpi=True)
 
 @pytest.mark.mpi
+# github CI working test takes ~90 s
+@pytest.mark.timeout(120, method="thread")
 def test_Morse_ASE_restart_mpi(mpi_tmp_path, monkeypatch):
+    import time
+    t0 = time.time()
     do_Morse_ASE_restart(mpi_tmp_path, monkeypatch, using_mpi=True)
+    print("BOB time", time.time() - t0)
 
 @pytest.mark.mpi
 def test_EAM_LAMMPS_mpi(mpi_tmp_path, monkeypatch):
@@ -151,13 +156,13 @@ def do_Morse_ASE(tmp_path, monkeypatch, using_mpi, max_iter=None):
     # files exist
     assert (tmp_path / 'Morse_ASE.test.NS_samples').is_file()
     assert len(list(tmp_path.glob('Morse_ASE.test.traj.*xyz'))) == 1
-    assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == max_iter_use // traj_interval
+    assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == int(np.ceil(max_iter_use / traj_interval))
 
-    # from test run 12/8/2022
+    # from test run 6/13/2025, when max iter was extended to 110 to catch deadlock in old buggy snapshot step_size writing
     if using_mpi:
-        samples_fields_ref = np.asarray([9.90000000e+01, 8.04691943e+00, 1.28925862e+04, 1.60000000e+01])
+        samples_fields_ref = np.asarray([1.09000000e+02, 8.02719236e+00, 1.28609799e+04, 1.60000000e+01])
     else:
-        samples_fields_ref = np.asarray([9.90000000e+01, 8.08011910e+00, 1.29457779e+04, 1.60000000e+01])
+        samples_fields_ref = np.asarray([1.09000000e+02, 8.06422068e+00, 1.29203058e+04, 1.60000000e+01])
 
     with open(tmp_path / 'Morse_ASE.test.NS_samples') as fin:
         for l in fin: