diff --git a/package/AUTHORS b/package/AUTHORS index e23f2afcf2..5800893181 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -192,7 +192,7 @@ Chronological list of authors - Mingyi Xue - Meghan Osato - Anirvinya G - - Rishabh Shukla + - Rishabh Shukla - Manish Kumar - Aditi Tripathi - Sukeerti T @@ -247,14 +247,14 @@ Chronological list of authors - Jia-Xin Zhu - Tanish Yelgoe 2025 - - Joshua Raphael Uy + - Joshua Raphael Uy - Namir Oues - - Pavel Buslaev + - Pavel Buslaev - Lexi Xu - - BHM-Bob G + - BHM-Bob G - Yu-Yuan (Stuart) Yang - James Rowe - - Debasish Mohanty + - Debasish Mohanty - Abdulrahman Elbanna - Tulga-Erdene Sodjargal - Gareth Elliott @@ -262,9 +262,10 @@ Chronological list of authors - Sirsha Ganguly - Amruthesh Thirumalaiswamy - Ch Zhang - - Raúl Lois-Cuns + - Raúl Lois-Cuns - Pranay Pelapkar - Shreejan Dolai + - Brady Johnston External code ------------- diff --git a/package/CHANGELOG b/package/CHANGELOG index 642e717b85..86423811d4 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -15,7 +15,7 @@ The rules for this file: ------------------------------------------------------------------------------- ??/??/?? IAlibay, orbeckst, marinegor, tylerjereddy, ljwoods2, marinegor, - spyke7, talagayev + spyke7, talagayev, bradyajohnston * 2.11.0 @@ -30,6 +30,8 @@ Fixes DSSP by porting upstream PyDSSP 0.9.1 fix (Issue #4913) Enhancements + * Change `get_hbond_map` to use `distance_capped` to speedup finding H bonding pairs + and also take into account the periodic boundaries (PR #5182) * Enables parallelization for analysis.diffusionmap.DistanceMatrix (Issue #4679, PR #4745) diff --git a/package/MDAnalysis/analysis/dssp/dssp.py b/package/MDAnalysis/analysis/dssp/dssp.py index c3a76b2935..e46d0afe9f 100644 --- a/package/MDAnalysis/analysis/dssp/dssp.py +++ b/package/MDAnalysis/analysis/dssp/dssp.py @@ -144,6 +144,7 @@ .. autofunction:: translate """ +import warnings from typing import Optional, Union import numpy as np @@ -323,6 +324,12 @@ def __init__( for t in heavyatom_names } self._donor_mask: Optional[np.ndarray] = ag.residues.resnames != "PRO" + self._box = self._trajectory.ts.dimensions + if np.array_equal(self._box, (1, 1, 1, 90, 90, 90)): + self._box = None + warnings.warn( + "Box dimensions are (1, 1, 1, 90, 90, 90), not using periodic boundary conditions in DSSP calculations" + ) self._hydrogens: list["AtomGroup"] = [ res.atoms.select_atoms(f"name {hydrogen_name}") for res in ag.residues @@ -399,7 +406,11 @@ def _get_coords(self) -> np.ndarray: def _single_frame(self): coords = self._get_coords() - dssp = assign(coords, donor_mask=self._donor_mask) + dssp = assign( + coords, + donor_mask=self._donor_mask, + box=self._box, + ) self.results.dssp_ndarray.append(dssp) def _conclude(self): diff --git a/package/MDAnalysis/analysis/dssp/pydssp_numpy.py b/package/MDAnalysis/analysis/dssp/pydssp_numpy.py index 643e510f18..c30dbfa257 100644 --- a/package/MDAnalysis/analysis/dssp/pydssp_numpy.py +++ b/package/MDAnalysis/analysis/dssp/pydssp_numpy.py @@ -11,10 +11,13 @@ import numpy as np +from MDAnalysis.lib.distances import capped_distance, minimize_vectors + CONST_Q1Q2 = 0.084 CONST_F = 332 DEFAULT_CUTOFF = -0.5 DEFAULT_MARGIN = 1.0 +HBOND_SEARCH_CUTOFF = 5.0 def _upsample(a: np.ndarray, window: int) -> np.ndarray: @@ -118,10 +121,11 @@ def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray: def get_hbond_map( coord: np.ndarray, - donor_mask: np.ndarray = None, + donor_mask: np.ndarray | None = None, cutoff: float = DEFAULT_CUTOFF, margin: float = DEFAULT_MARGIN, return_e: bool = False, + box: np.ndarray | None = None, ) -> np.ndarray: """Returns hydrogen bond map @@ -145,6 +149,14 @@ def get_hbond_map( return_e : bool, optional if to return energy instead of hbond map, by default False + box : array_like, optional + The unitcell dimensions of the system, which can be orthogonal or + triclinic and must be provided in the same format as returned by + :attr:`MDAnalysis.coordinates.timestep.Timestep.dimensions`: + ``[lx, ly, lz, alpha, beta, gamma]``. + + .. versionadded:: 2.11.0 + Returns ------- np.ndarray @@ -156,6 +168,8 @@ def get_hbond_map( .. versionchanged:: 2.10.0 Support masking of hydrogen donors via `donor_mask` (especially needed for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1. + .. versionchanged:: 2.11.0 + Speedup with KDTree and support periodic boundary checking via `box` param. """ n_residues, n_atom_types, _xyz = coord.shape assert n_atom_types in ( @@ -176,58 +190,70 @@ def get_hbond_map( # h.shape == (n_residues, 3) # coord.shape == (n_residues, 4, 3) - # distance matrix - n_1, c_0, o_0 = coord[1:, 0], coord[0:-1, 2], coord[0:-1, 3] + n_atoms, c_atoms, o_atoms = coord[1:, 0], coord[:-1, 2], coord[:-1, 3] - n = n_residues - 1 - cmap = np.tile(c_0, (n, 1, 1)) - omap = np.tile(o_0, (n, 1, 1)) - nmap = np.tile(n_1, (1, 1, n)).reshape(n, n, 3) - hmap = np.tile(h_1, (1, 1, n)).reshape(n, n, 3) + # We can reduce the need for a complete pairwise distance matrix by first searching for + # candidate pairs within a certain distance cutoff and only computing the energy + # for these relevant pairs rather than "potential" HBonds between far apart residues + pairs, d_on = capped_distance( + n_atoms, + o_atoms, + max_cutoff=HBOND_SEARCH_CUTOFF, + box=box, + ) - d_on = np.linalg.norm(omap - nmap, axis=-1) - d_ch = np.linalg.norm(cmap - hmap, axis=-1) - d_oh = np.linalg.norm(omap - hmap, axis=-1) - d_cn = np.linalg.norm(cmap - nmap, axis=-1) + # Exclude local pairs (i, i), (i, i+1), (i, i+2) that are too close for SS HBonds + local_mask = abs(pairs[:, 0] - pairs[:, 1]) >= 2 + pairs = pairs[local_mask] + + # Exclude donor H absence (Proline) + if donor_mask is not None: + donor_indices = np.where(np.array(donor_mask) == 0)[0] + mask = ~np.isin(pairs[:, 0], donor_indices - 1) + pairs = pairs[mask] + + # compute distances and energy as previously but only for the potential pairs + # still returning the same energy matrix that would have otherwise been made + o_indices = pairs[:, 1] + n_indices = pairs[:, 0] + + # d_on = d_on[local_mask] + def _distances(x, y, box=None): + if box is None: + return np.linalg.norm(x[o_indices] - y[n_indices], axis=-1) + else: + return np.linalg.norm( + minimize_vectors(x[o_indices] - y[n_indices], box=box), + axis=-1, + ) + + d_on = _distances(o_atoms, n_atoms, box) + d_ch = _distances(c_atoms, h_1, box) + d_oh = _distances(o_atoms, h_1, box) + d_cn = _distances(c_atoms, n_atoms, box) # electrostatic interaction energy # e[i, j] = e(CO_i) - e(NH_j) - e = np.pad( + e = np.zeros((n_residues, n_residues)) + e[n_indices + 1, o_indices] = ( CONST_Q1Q2 * (1.0 / d_on + 1.0 / d_ch - 1.0 / d_oh - 1.0 / d_cn) - * CONST_F, - [[1, 0], [0, 1]], + * CONST_F ) if return_e: # pragma: no cover return e - # mask for local pairs (i,i), (i,i+1), (i,i+2) - local_mask = ~np.eye(n_residues, dtype=bool) - local_mask *= ~np.diag(np.ones(n_residues - 1, dtype=bool), k=-1) - local_mask *= ~np.diag(np.ones(n_residues - 2, dtype=bool), k=-2) - # mask for donor H absence (Proline) - donor_mask = ( - np.array(donor_mask).astype(float) - if donor_mask is not None - else np.ones(n_residues, dtype=float) - ) - donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_residues)) - # hydrogen bond map (continuous value extension of original definition) hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin) hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2 - assert hbond_map.shape == local_mask.shape == donor_mask.shape - - hbond_map *= local_mask - hbond_map *= donor_mask - return hbond_map def assign( coord: np.ndarray, - donor_mask: np.ndarray = None, + donor_mask: np.ndarray | None = None, + box: np.ndarray | None = None, ) -> np.ndarray: """Assigns secondary structure for a given coordinate array, either with or without assigned hydrogens @@ -240,12 +266,20 @@ def assign( (N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates (when k=5). - donor_mask : np.array + donor_mask : np.array | None, optional Mask out any hydrogens that should not be considered (in particular HN in PRO). If ``None`` then all H will be used (behavior up to 2.9.0). .. versionadded:: 2.10.0 + box : array_like, optional + The unitcell dimensions of the system, which can be orthogonal or + triclinic and must be provided in the same format as returned by + :attr:`MDAnalysis.coordinates.timestep.Timestep.dimensions`: + ``[lx, ly, lz, alpha, beta, gamma]``. + + .. versionadded:: 2.11.0 + Returns ------- np.ndarray @@ -257,10 +291,12 @@ def assign( .. versionchanged:: 2.10.0 Support masking of donors. + .. versionchanged:: 2.11.0 + Speedup with KDTree and support periodic boundary checking via `box` param. """ # get hydrogen bond map - hbmap = get_hbond_map(coord, donor_mask=donor_mask) + hbmap = get_hbond_map(coord, donor_mask=donor_mask, box=box) hbmap = np.swapaxes(hbmap, -1, -2) # convert into "i:C=O, j:N-H" form # identify turn 3, 4, 5 diff --git a/testsuite/MDAnalysisTests/analysis/test_dssp.py b/testsuite/MDAnalysisTests/analysis/test_dssp.py index 8ddf49db5c..4576915ad8 100644 --- a/testsuite/MDAnalysisTests/analysis/test_dssp.py +++ b/testsuite/MDAnalysisTests/analysis/test_dssp.py @@ -1,8 +1,9 @@ import glob import MDAnalysis as mda +import numpy as np import pytest -from MDAnalysis.analysis.dssp import DSSP, translate +from MDAnalysis.analysis.dssp import DSSP, assign, translate from MDAnalysisTests.datafiles import DSSP as DSSP_FOLDER from MDAnalysisTests.datafiles import TPR, XTC @@ -35,6 +36,19 @@ def test_trajectory(client_DSSP): ) +# ensure that we get different assigned results when using and not using the +# donor mask which filters out prolines from being potential HBond donors as they +# are missing hydrogens +def test_donor_mask(client_DSSP): + u = mda.Universe(TPR, XTC).select_atoms("protein").universe + dssp = DSSP(u) + coords = dssp._get_coords() + + assert not np.array_equal( + assign(coords), assign(coords, donor_mask=dssp._donor_mask) + ) + + def test_atomgroup(client_DSSP): protein = mda.Universe(TPR, XTC).select_atoms("protein") run = DSSP(protein).run(**client_DSSP, stop=10)