Add ignore_equilibration flag to NAMD parser (#451)

jhenin · web-flow · commit bc0a67f71480 · 2026-01-21T10:36:12.000Z
* Add ignore_equilibration flag to NAMD parser

This is useful because equilibrium detection is often run after the fact
anyway.
Replace the parsing flag with in_window.
The code can now parse files obtained with alchEquilSteps 0.

* Remove overzealous check

* ruff format

* Add tests for new NAMD parser behavior

* Add flag to extract(), update CHANGES

* Ignore undefined attributes
diff --git a/CHANGES b/CHANGES
@@ -12,6 +12,11 @@ The rules for this file:
   * accompany each entry with github issue/PR number (Issue #xyz)
   * release numbers follow "Semantic Versioning" https://semver.org
 
+01/20/2026 jhenin
+
+  - Add ignore_equilibration flag to the NAMD parser to keep NAMD's equilibration samples (PR #451)
+  - Accept NAMD data with 0 equilibration steps (PR #451)
+
 10/21/2025 xiki-tempula, jaclark5, yuxuanzhuang. orbeckst
 
   * 2.5.0
diff --git a/src/alchemlyb/parsing/gmx.py b/src/alchemlyb/parsing/gmx.py
@@ -248,7 +248,7 @@ def extract_dHdl(xvg: str, T: float, filter: bool = True) -> pd.DataFrame:
     newind = ["time"] + lambdas
     dHdl = dHdl.reset_index().set_index(newind)
 
-    dHdl.name = "dH/dl"
+    dHdl.name = "dH/dl"  # type: ignore[attr-defined]
 
     return dHdl
 
diff --git a/src/alchemlyb/parsing/gomc.py b/src/alchemlyb/parsing/gomc.py
@@ -152,7 +152,7 @@ def extract_dHdl(filename: str, T: float) -> pd.DataFrame:
     newind = ["time"] + cols
     dHdl = dHdl.reset_index().set_index(newind)
 
-    dHdl.name = "dH/dl"
+    dHdl.name = "dH/dl"  # type: ignore[attr-defined]
 
     return dHdl
 
diff --git a/src/alchemlyb/parsing/lammps.py b/src/alchemlyb/parsing/lammps.py
@@ -656,7 +656,7 @@ def extract_u_nk_from_u_n(
                     )
 
     u_nk.set_index(["time", "fep-lambda"], inplace=True)
-    u_nk.name = "u_nk"
+    u_nk.name = "u_nk"  # type: ignore[attr-defined]
 
     return u_nk
 
@@ -1012,7 +1012,7 @@ def extract_u_nk(
         u_nk.set_index(["time", "fep-lambda"], inplace=True)
     else:
         u_nk.set_index(["time", "coul-lambda", "vdw-lambda"], inplace=True)
-    u_nk.name = "u_nk"
+    u_nk.name = "u_nk"  # type: ignore[attr-defined]
 
     u_nk = u_nk.dropna()
 
@@ -1115,7 +1115,7 @@ def extract_dHdl_from_u_n(
 
     dHdl.set_index(["time", "fep-lambda"], inplace=True)
     dHdl["fep"] = dHdl["fep"] * beta
-    dHdl.name = "dH_dl"
+    dHdl.name = "dH_dl"  # type: ignore[attr-defined]
 
     return dHdl
 
@@ -1311,7 +1311,7 @@ def extract_dHdl(
         else:
             dHdl["coul"] = dHdl["coul"] * beta
 
-    dHdl.name = "dH_dl"
+    dHdl.name = "dH_dl"  # type: ignore[attr-defined]
 
     return dHdl
 
diff --git a/src/alchemlyb/parsing/namd.py b/src/alchemlyb/parsing/namd.py
@@ -124,12 +124,14 @@ def _get_lambdas(fep_files: str | list[str]) -> None | list[float]:
 
 
 @_init_attrs
-def extract_u_nk(fep_files: str | list[str], T: float) -> pd.DataFrame:
+def extract_u_nk(
+    fep_files: str | list[str], T: float, ignore_equilibration: bool = False
+) -> pd.DataFrame:
     """Return reduced potentials `u_nk` from NAMD fepout file(s).
 
     Parameters
     ----------
-    fep_file : str or list of str
+    fep_files : str or list of str
         Path to fepout file(s) to extract data from. These are sorted by filename,
         not including the path, prior to processing, using natural-sort. This way,
         filenames including numbers without leading zeros are handled intuitively.
@@ -144,13 +146,21 @@ def extract_u_nk(fep_files: str | list[str], T: float) -> pd.DataFrame:
     T : float
         Temperature in Kelvin at which the simulation was sampled.
 
+    ignore_equilibration : bool, optional
+        If True, the parser will begin collecting energy data immediately from
+        the start of a window, ignoring the "#STARTING COLLECTION..." line.
+        This effectively includes the equilibration steps (defined by NAMD's
+        alchEquilSteps) in the resulting DataFrame. Default is False.
+
     Returns
     -------
     u_nk : :class:`pandas.DataFrame`
         Potential energy for each alchemical state (k) for each frame (n).
 
-    Note
-    ----
+    Notes
+    -----
+    By default, only samples collected after alchEquilSteps steps in each window are read.
+    For post-hoc equilibrium detection, use ignore_equilibration=True or alchEquilSteps 0 in NAMD.
     If the number of forward and backward samples in a given window are different,
     the extra sample(s) will be discarded. This is typically zero or one sample.
 
@@ -164,6 +174,10 @@ def extract_u_nk(fep_files: str | list[str], T: float) -> pd.DataFrame:
         robustness checks.
 
         :param:`fep_files` can now be a list of filenames.
+
+    .. versionchanged:: 2.6.0
+        Added parameter ignore_equilibration.
+
     """
     beta = 1 / (k_b * T)
 
@@ -176,8 +190,8 @@ def extract_u_nk(fep_files: str | list[str], T: float) -> pd.DataFrame:
     # create dataframe for results
     u_nk = pd.DataFrame(columns=["time", "fep-lambda"])
 
-    # boolean flag to parse data after equil time
-    parsing = False
+    # Flag to detect inconsistencies like truncated windows
+    in_window = False
 
     if type(fep_files) is str:
         fep_files = [fep_files]
@@ -209,14 +223,15 @@ def extract_u_nk(fep_files: str | list[str], T: float) -> pd.DataFrame:
                 # within the same file, then complain. This can happen if truncated fepout files
                 # are presented in the wrong order.
                 if line_split[0] == "#NEW":
-                    if parsing:
+                    if in_window:
                         logger.error(
                             f"Window with lambda1: {lambda1_at_start} lambda2: {lambda2_at_start} lambda_idws: {lambda_idws_at_start} appears truncated"
                         )
                         logger.error(
                             f"because a new window was encountered in {fep_file} before the previous one finished."
                         )
                         raise ValueError("New window begun after truncated window")
+                    in_window = True
 
                     lambda1_at_start, lambda2_at_start = (
                         float(line_split[6]),
@@ -229,6 +244,9 @@ def extract_u_nk(fep_files: str | list[str], T: float) -> pd.DataFrame:
 
                 # this line marks end of window; dump data into dataframe
                 if line_split[0] == "#Free":
+                    # Note: in_window might already be false if this window was restarted without another "#NEW" line
+                    in_window = False
+
                     # extract lambda values for finished window
                     # lambda1 = sampling lambda (row), lambda2 = comparison lambda (col)
                     lambda1 = float(line_split[7])
@@ -320,28 +338,30 @@ def extract_u_nk(fep_files: str | list[str], T: float) -> pd.DataFrame:
                     win_ts = []
                     win_de_back = []
                     win_ts_back = []
-                    parsing = False
                     has_idws = False
                     lambda1_at_start, lambda2_at_start, lambda_idws_at_start = (
                         None,
                         None,
                         None,
                     )
+                    # end of "#Free" line processing
 
                 # append work value from 'dE' column of fepout file
-                if parsing:
-                    if line_split[0] == "FepEnergy:":
-                        win_de.append(float(line_split[6]))
-                        win_ts.append(float(line_split[1]))
-                    elif line_split[0] == "FepE_back:":
-                        win_de_back.append(float(line_split[6]))
-                        win_ts_back.append(float(line_split[1]))
-
-                # Turn parsing on after line 'STARTING COLLECTION OF ENSEMBLE AVERAGE'
-                if "#STARTING" in line_split:
-                    parsing = True
-
-    if len(win_de) != 0 or len(win_de_back) != 0:  # pragma: no cover
+                if line_split[0] == "FepEnergy:":
+                    win_de.append(float(line_split[6]))
+                    win_ts.append(float(line_split[1]))
+                elif line_split[0] == "FepE_back:":
+                    win_de_back.append(float(line_split[6]))
+                    win_ts_back.append(float(line_split[1]))
+
+                # Unless ignore_equilibration is set, discard the (equilibration) samples gathered so far in this window at line 'STARTING COLLECTION OF ENSEMBLE AVERAGE'
+                if "#STARTING" in line_split and not ignore_equilibration:
+                    win_de = []
+                    win_ts = []
+                    win_de_back = []
+                    win_ts_back = []
+
+    if in_window or len(win_de) != 0 or len(win_de_back) != 0:  # pragma: no cover
         logger.warning(
             'Trailing data without footer line ("#Free energy..."). Interrupted run?'
         )
@@ -358,7 +378,9 @@ def extract_u_nk(fep_files: str | list[str], T: float) -> pd.DataFrame:
     return u_nk
 
 
-def extract(fep_files: str | list[str], T: float) -> dict[str, pd.DataFrame | None]:
+def extract(
+    fep_files: str | list[str], T: float, ignore_equilibration: bool = False
+) -> dict[str, pd.DataFrame | None]:
     r"""Return reduced potentials `u_nk` and gradients `dH/dl`
     from NAMD fepout file(s).
 
@@ -395,5 +417,5 @@ def extract(fep_files: str | list[str], T: float) -> dict[str, pd.DataFrame | No
     """
 
     return {
-        "u_nk": extract_u_nk(fep_files, T)
+        "u_nk": extract_u_nk(fep_files, T, ignore_equilibration)
     }  # NOTE: maybe we should also have 'dHdl': None
diff --git a/src/alchemlyb/tests/parsing/test_namd.py b/src/alchemlyb/tests/parsing/test_namd.py
@@ -441,3 +441,27 @@ def test_u_nk_restarted_last_window_truncated(restarted_dataset_last_window_trun
 
     with pytest.raises(ValueError, match="Last window is truncated"):
         extract_u_nk(restarted_dataset_last_window_truncated["data"]["both"], T=300)
+
+
+def test_u_nk_ignore_equilibration(dataset):
+    """Test that ignore_equilibration=True returns all data"""
+
+    original_file = dataset["data"]["forward"][0]
+    u_nk_original = extract_u_nk(original_file, T=300)
+
+    # New behavior: Should return the full dataset (including equilibration)
+    u_nk_ignored = extract_u_nk(original_file, T=300, ignore_equilibration=True)
+    assert len(u_nk_ignored) > len(u_nk_original)
+
+
+def test_u_nk_no_equilibration(dataset, tmp_path):
+    """Test that all data is read when the #STARTING line is missing from the fepout file"""
+
+    original_file = dataset["data"]["forward"][0]
+
+    no_starting_file = _corrupt_fepout(
+        original_file, [("#STARTING", lambda tokens: None)], tmp_path
+    )
+
+    u_nk_no_equilibration = extract_u_nk(no_starting_file, T=300)
+    assert len(u_nk_no_equilibration) > 0