Merge pull request #23 from wehs7661/fix_poor_sampling

weitse-hsu · web-flow · commit a4bae8dc7030 · 2023-08-01T22:11:28.000-06:00
Resolve the issue of poor sampling
diff --git a/docs/simulations.rst b/docs/simulations.rst
@@ -127,7 +127,8 @@ by the single GRO file, but the user should also be able to initialize different
 configurations (represented by multiple GRO files) in the near future. Also, the MDP template should contain parameters 
 common across all replicas and define the coupling parmaeters for all possible intermediate states,
 so that we can cusotmize different MDP files by defining a subset of alchemical states in different 
-replicas. Importantly, to extend an EEXE simulation, one needs to additionally provide the following
+replicas. For EEXE simulations, some MDP parameters require additional care, as described in
+:ref:`doc_mdp_params`. Importantly, to extend an EEXE simulation, one needs to additionally provide the following
 two checkpoint files:
 
 * One NPY file containing the replica-space trajectories of different configurations saved by the previous run of EEXE simulation with a default name as :code:`rep_trajs.npy`.
@@ -206,9 +207,9 @@ iterations (:code:`n_iterations`) is reached.
 
 .. _doc_parameters:
 
-3. Simulation parameters
+3. Input YAML parameters
 ========================
-In the current implementation of the algorithm, 22 parameters can be specified in the input YAML file.
+In the current implementation of the algorithm, 27 parameters can be specified in the input YAML file.
 Note that the two CLIs :code:`run_EEXE` and :code:`analyze_EEXE` share the same input YAML file, so we also
 include parameters for data analysis here.
 
@@ -378,3 +379,41 @@ parameters left with a blank. Note that specifying :code:`null` is the same as l
     n_bootstrap: 50
     seed : null
 
+.. _doc_mdp_params:
+
+4. Input MDP parameters
+=======================
+As mentioned above, a template MDP file should have all the parameters that will be shared
+across all replicas. It should also define the coupling parameters for the whole range of
+states so that different MDP files can be customized for different replicas. For an EEXE simulation
+launched by the CLI :code:`run_EEXE`, any GROMACS MDP parameter that could potentially lead to issues
+in the EEXE simulation will raise a warning. If the number of warnings is larger than the value
+specified for the flag :code:`-m`/:code:`--maxwarn` in the CLI :code:`run_EEXE`, the simulation will error
+out. To avoid warnings arising from the MDP specification, we need to take extra care with the following
+MDP parameters:
+
+- We recommend setting :code:`lmc_seed = -1` so that a different random seed
+  for Monte Carlo moves in the state space will be used for each iteration. 
+- We recommend setting :code:`gen_vel = yes` to generate new velocities for each iteration to avoid
+  potential issues with detailed balance. 
+- We recommend setting :code:`gen_seed = -1` so that a different random seed for velocity generation
+  will be used for each iteration.
+- The MDP parameter :code:`nstlog` must be a factor of the YAML parameter :code:`nst_sim` so that the final status
+  of the simulation can be correctly parsed from the LOG file.
+- The MDP parameter :code:`nstdhdl` must be a factor of the YAML parameter :code:`nst_sim` so that the time series
+  of the state index can be correctly parsed from the DHDL file.
+- In EEXE, the MDP parameter :code:`nstdhdl` must be a factor of the MDP parameter :code:`nstexpanded`, or
+  the calculation of the acceptance ratio may be wrong. 
+- Be careful with the pull code specification if you want to apply a distance restraint between two pull groups.
+  Specifically, in an EEXE simulation, all iterations should use the same reference distance. Otherwise, poor sampling
+  can be observed in a fixed-weight EEXE simulation and the equilibration time may be much longer for a weight-updating
+  EEXE simulation. To ensure the same reference distance across all iterations in an EEXE simulation, consider the
+  following scenarios:
+    - If you would like to use the COM distance between the pull groups in the input GRO file as the reference distance
+      for all the iterations (whatever that value is), then specify :code:`pull_coord1_start = yes` with
+      :code:`pull_coord1_init = 0` in your input MDP template. In this case, :obj:`.get_ref_dist` will parse :code:`pullx.xvg`
+      from the first iteration to get the initial COM distance (:code:`d`), and :obj:`.update_MDP` will use it as the reference distance for all the following
+      iterations using :code:`pull_coord1_start = no` with :code:`pull_coord1_init = d`. Note that this implies that
+      the MDP parameter :code:`pull_nstxout` should not be 0.
+    - If you want to explicitly specify a reference distance (:code:`d`) to use for all iterations, simply use 
+      :code:`pull_coord1_start = no` with :code:`pull_coord1_init = d` in your input MDP template.
diff --git a/ensemble_md/cli/run_EEXE.py b/ensemble_md/cli/run_EEXE.py
@@ -130,6 +130,9 @@ def main():
 
         start_idx = comm.bcast(start_idx, root=0)  # so that all the ranks are aware of start_idx
 
+    # 2-3. Get the reference distance for the distance restraint specified in the pull code, if any.
+    EEXE.get_ref_dist()
+
     for i in range(start_idx, EEXE.n_iter):
         if rank == 0:
             # Step 3: Swap the coordinates
diff --git a/ensemble_md/ensemble_EXE.py b/ensemble_md/ensemble_EXE.py
@@ -305,7 +305,7 @@ def set_params(self, analysis):
             self.warnings.append('Warning: We recommend setting gen_seed as -1 so the random seed is different for each iteration.')  # noqa: E501
 
         if 'gen_vel' not in self.template or ('gen_vel' in self.template and self.template['gen_vel'] == 'no'):
-            self.warnings.append('Warning: We recommend generating new velocities for each iteration to avoid potential issues with the detailed balance.')  # noqa: E501
+            self.warnings.append('Warning: We recommend generating new velocities for each iteration to avoid potential issues with detailed balance.')  # noqa: E501
 
         if self.nst_sim % self.template['nstlog'] != 0:
             raise ParameterError(
@@ -341,6 +341,22 @@ def set_params(self, analysis):
                 raise ParameterError(
                     'In EEXE, the parameter "nstdhdl" must be a factor of the parameter "nstexpanded", or the calculation of acceptance ratios might be wrong.')  # noqa: E501
 
+        if 'pull' in self.template and self.template['pull'] == 'yes':
+            pull_ncoords = self.template['pull_ncoords']
+            self.set_ref_dist = []
+            for i in range(pull_ncoords):
+                if self.template[f'pull_coord{i+1}_geometry'] == 'distance':
+                    if self.template[f'pull_coord{i+1}_start'] == 'yes':
+                        self.set_ref_dist.append(True)  # starting from the second iteration, set pull_coord*_init.
+                        if 'pull_nstxout' not in self.template:
+                            self.warnings.append('A non-zero value should be specified for pull_nstxout if pull_coord*_start is set to yes.')  # noqa: E501
+                        if self.template['pull_nstxout'] == 0:
+                            self.warnings.append('A non-zero value should be specified for pull_nstxout if pull_coord*_start is set to yes.')  # noqa: E501
+                    else:
+                        self.set_ref_dist.append(False)  # Here we assume that the user know what reference distance to use.  # noqa: E501
+                else:
+                    self.set_ref_dist.append(False)  # we only deal with distance restraints for now.
+
         # Step 7: Set up derived parameters
         # 7-1. kT in kJ/mol
         self.kT = k * NA * self.temp / 1000  # 1 kT in kJ/mol
@@ -546,6 +562,21 @@ def initialize_MDP(self, idx):
 
         return MDP
 
+    def get_ref_dist(self):
+        """
+        Gets the reference distance(s) to use starting from the second iteration if distance restraint(s) are used.
+        Specifically, a reference distance determined here is the initial COM distance between the pull groups
+        in the input GRO file. This function initializes the attribute :code:`ref_dist`.
+        """
+        if hasattr(self, 'set_ref_dist'):  # only defined by set_params if pull = yes
+            self.ref_dist = []
+            pullx_file = 'sim_0/iteration_0/pullx.xvg'  # pull output of iteration 0 in sim_0
+            headers = get_headers(pullx_file)
+            for i in range(len(self.set_ref_dist)):
+                if self.set_ref_dist[i] is True:  # True only for distance coords with pull_coord*_start = yes
+                    dist = list(extract_dataframe(pullx_file, headers=headers)[f'{i+1}'])[0]
+                    self.ref_dist.append(dist)  # NOTE(review): compacted; update_MDP treats list index as coord index
+
     def update_MDP(self, new_template, sim_idx, iter_idx, states, wl_delta, weights, counts=None):
         """
         Updates the MDP file for a new iteration based on the new MDP template coming from the previous iteration.
@@ -598,6 +629,12 @@ def update_MDP(self, new_template, sim_idx, iter_idx, states, wl_delta, weights,
             MDP["lmc_weights_equil"] = ""
             MDP["weight_equil_wl_delta"] = ""
 
+        # Here we deal with the distance restraint in the pull code, if any.
+        if hasattr(self, 'ref_dist'):
+            for i in range(len(self.ref_dist)):
+                MDP[f'pull_coord{i+1}_start'] = "no"
+                MDP[f'pull_coord{i+1}_init'] = self.ref_dist[i]
+
         return MDP
 
     def extract_final_dhdl_info(self, dhdl_files):