Merge pull request #23 from libAtoms/snapshot_wrong_step_sizes

bernstei · web-flow · commit 951c3b071727 · 2025-06-16T14:01:23.000-04:00
make sure all configs written in snapshot have same, latest, step size
diff --git a/.github/workflows/pytests.yml b/.github/workflows/pytests.yml
@@ -14,10 +14,10 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.8 ]
+        python-version: [ 3.9 ]
       max-parallel: 5
     env:
-      coverage-on-version: 3.8
+      coverage-on-version: 3.9
       use-mpi: True
 
     steps:
@@ -39,7 +39,7 @@ jobs:
         run: conda install pip
 
       - name: Install pytest requirements from pip (specific pymatnext dependencies will be automatically installed when it is)
-        run: pip install wheel setuptools ruff pytest pytest-cov
+        run: pip install wheel setuptools ruff pytest pytest-cov pytest-timeout
 
       - name: Install latest ASE from gitlab
         run: pip install git+https://gitlab.com/ase/ase.git
@@ -72,7 +72,7 @@ jobs:
       - name: Test with pytest - coverage
         if: env.coverage-on-version == matrix.python-version
         run: |
-          pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -rxXs
+          pytest -v --cov=pymatnext --cov-report term --cov-report html --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered -s -rxXs
 
       # # DEBUGGING
       # - name: Setup tmate session
@@ -84,13 +84,13 @@ jobs:
         if: ${{ env.use-mpi && env.coverage-on-version != matrix.python-version}}
         run: |
           # envvar and test run - No coverage
-          mpirun -n 2 pytest --with-mpi -k mpi
+          mpirun -n 2 pytest --with-mpi -k mpi -rxXs
 
       - name: MPI tests -- coverage
         if: ${{ env.use-mpi && env.coverage-on-version == matrix.python-version}}
         run: |
           # envvar and coverage Appended to the previous
-          mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --with-mpi -k mpi --cov-append
+          mpirun -n 2 pytest --cov=pymatnext --cov-report term --cov-config=tests/.coveragerc --cov-report term-missing --cov-report term:skip-covered --cov-append --with-mpi -k mpi -s -rxXs
 
       - name: 'Upload Coverage Data'
         uses: actions/upload-artifact@v4
diff --git a/pymatnext/ns.py b/pymatnext/ns.py
@@ -73,6 +73,7 @@ def __init__(self, params_ns, comm, MPI, random_seed, params_configs, output_fil
         self.max_val = None
 
         self.local_configs = []
+        self.extra_config = False
 
         old_state_files = NS._old_state_files(output_filename_prefix)
         if len(old_state_files) > 0:
@@ -202,9 +203,14 @@ def init_configs(self, params_configs, configs_file=None, extra=False):
         self.local_configs = []
         if self.comm.rank == 0:
             for config_i, new_config in enumerate(new_configs_generator()):
+                if config_i == 0:
+                    first_config = new_config
                 if config_i >= self.n_configs_global:
                     raise RuntimeError(f"Got too many configs (expected {self.n_configs_global}) from new config generator {new_configs_generator}")
 
+                # Check that all step sizes are the same. Maybe instead we should just copy from first?
+                assert new_config.step_size == first_config.step_size, f"Mismatched step size for config {config_i} {new_config.step_size} != 0 {first_config.step_size}"
+
                 target_rank = config_i // self.max_n_configs_local
                 if target_rank == self.comm.rank:
                     self.local_configs.append(new_config)
@@ -352,6 +358,13 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
                     print("step_size_tune initial", name, "size", size, "max", max_size, "freq", freq)
                 first_iter = False
 
+            # It looks like the following should always give the same values, hence exit
+            # condition, on all MPI tasks, but this is not guaranteed and can lead to deadlocks
+            # in the allreduce. The reason is that the value of done_i in the loop depends
+            # on the value returned from _tune_from_accept_rate, which depends on the previous
+            # step size, and if those are inconsistent between MPI tasks (as in
+            # https://github.com/libAtoms/pymatnext/issues/20), a deadlock may occur.
+            # Only fix is to make sure this doesn't happen (https://github.com/libAtoms/pymatnext/pull/23)
             done = []
             for param_i in range(n_params):
                 if accept_freq[param_i][0] > 0:
@@ -369,6 +382,9 @@ def step_size_tune(self, n_configs=1, min_accept_rate=0.25, max_accept_rate=0.5,
             new_step_size = {k: v * m for k, v, m in zip(step_size_names, step_size, max_step_size)}
             for ns_config in self.local_configs:
                 ns_config.step_size = new_step_size
+            # make sure that config used as buffer also has correct step_size
+            if self.extra_config:
+                self.extra_config.step_size = new_step_size
 
             # if self.comm.rank == 0:
                 # print("step_size_tune done", list(zip(done, accept_freq, step_size)))
diff --git a/tests/assets/do_Morse_ASE/params.toml b/tests/assets/do_Morse_ASE/params.toml
@@ -3,15 +3,15 @@
     output_filename_prefix = "Morse_ASE"
     random_seed = 5
 
-    max_iter = 100
+    max_iter = 110
 
     stdout_report_interval_s = 10
 
     snapshot_interval = 50
 
     [global.step_size_tune]
 
-        interval = 1000
+        interval = 50
 
 [ns]
 
diff --git a/tests/test_sample.py b/tests/test_sample.py
@@ -46,8 +46,13 @@ def test_Morse_ASE_mpi(mpi_tmp_path, monkeypatch):
     do_Morse_ASE(mpi_tmp_path, monkeypatch, using_mpi=True)
 
 @pytest.mark.mpi
+# github CI working test takes ~90 s
+@pytest.mark.timeout(120, method="thread")
 def test_Morse_ASE_restart_mpi(mpi_tmp_path, monkeypatch):
+    import time
+    t0 = time.time()
     do_Morse_ASE_restart(mpi_tmp_path, monkeypatch, using_mpi=True)
+    print("BOB time", time.time() - t0)
 
 @pytest.mark.mpi
 def test_EAM_LAMMPS_mpi(mpi_tmp_path, monkeypatch):
@@ -151,13 +156,13 @@ def do_Morse_ASE(tmp_path, monkeypatch, using_mpi, max_iter=None):
     # files exist
     assert (tmp_path / 'Morse_ASE.test.NS_samples').is_file()
     assert len(list(tmp_path.glob('Morse_ASE.test.traj.*xyz'))) == 1
-    assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == max_iter_use // traj_interval
+    assert len(list(ase.io.read(tmp_path / 'Morse_ASE.test.traj.extxyz', ':'))) == int(np.ceil(max_iter_use / traj_interval))
 
-    # from test run 12/8/2022
+    # from test run 6/13/2025, when max iter was extended to 110 to catch deadlock in old buggy snapshot step_size writing
     if using_mpi:
-        samples_fields_ref = np.asarray([9.90000000e+01, 8.04691943e+00, 1.28925862e+04, 1.60000000e+01])
+        samples_fields_ref = np.asarray([1.09000000e+02, 8.02719236e+00, 1.28609799e+04, 1.60000000e+01])
     else:
-        samples_fields_ref = np.asarray([9.90000000e+01, 8.08011910e+00, 1.29457779e+04, 1.60000000e+01])
+        samples_fields_ref = np.asarray([1.09000000e+02, 8.06422068e+00, 1.29203058e+04, 1.60000000e+01])
 
     with open(tmp_path / 'Morse_ASE.test.NS_samples') as fin:
         for l in fin: