-
Notifications
You must be signed in to change notification settings - Fork 768
ENH: Change get_hbond_map to use KDTree for distance search
#5182
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
8f25085
a3f9ac4
617e7de
234ad87
93906df
f578aa9
d831ba2
86a045d
26c4ca8
2105bd6
e81d2e2
bdaadab
f0b2e23
4faa47c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,10 +11,13 @@ | |
|
|
||
| import numpy as np | ||
|
|
||
| from MDAnalysis.lib.distances import capped_distance, minimize_vectors | ||
|
|
||
| CONST_Q1Q2 = 0.084 | ||
| CONST_F = 332 | ||
| DEFAULT_CUTOFF = -0.5 | ||
| DEFAULT_MARGIN = 1.0 | ||
| HBOND_SEARCH_CUTOFF = 5.0 | ||
|
|
||
|
|
||
| def _upsample(a: np.ndarray, window: int) -> np.ndarray: | ||
|
|
@@ -118,10 +121,11 @@ def _get_hydrogen_atom_position(coord: np.ndarray) -> np.ndarray: | |
|
|
||
| def get_hbond_map( | ||
| coord: np.ndarray, | ||
| donor_mask: np.ndarray = None, | ||
| donor_mask: np.ndarray | None = None, | ||
| cutoff: float = DEFAULT_CUTOFF, | ||
| margin: float = DEFAULT_MARGIN, | ||
| return_e: bool = False, | ||
| box: np.ndarray | None = None, | ||
| ) -> np.ndarray: | ||
| """Returns hydrogen bond map | ||
|
|
||
|
|
@@ -145,6 +149,14 @@ def get_hbond_map( | |
| return_e : bool, optional | ||
| if to return energy instead of hbond map, by default False | ||
|
|
||
| box : array_like, optional | ||
| The unitcell dimensions of the system, which can be orthogonal or | ||
| triclinic and must be provided in the same format as returned by | ||
| :attr:`MDAnalysis.coordinates.timestep.Timestep.dimensions`: | ||
| ``[lx, ly, lz, alpha, beta, gamma]``. | ||
|
|
||
| .. versionadded:: 2.11.0 | ||
|
|
||
| Returns | ||
| ------- | ||
| np.ndarray | ||
|
|
@@ -156,6 +168,8 @@ def get_hbond_map( | |
| .. versionchanged:: 2.10.0 | ||
| Support masking of hydrogen donors via `donor_mask` (especially needed | ||
| for ignoring HN on proline residues). Backport of PRO fix from pydssp 0.9.1. | ||
| .. versionchanged:: 2.11.0 | ||
| Speedup with KDTree and support periodic boundary checking via `box` param. | ||
| """ | ||
| n_residues, n_atom_types, _xyz = coord.shape | ||
| assert n_atom_types in ( | ||
|
|
@@ -176,58 +190,70 @@ def get_hbond_map( | |
| # h.shape == (n_residues, 3) | ||
| # coord.shape == (n_residues, 4, 3) | ||
|
|
||
| # distance matrix | ||
| n_1, c_0, o_0 = coord[1:, 0], coord[0:-1, 2], coord[0:-1, 3] | ||
| n_atoms, c_atoms, o_atoms = coord[1:, 0], coord[:-1, 2], coord[:-1, 3] | ||
|
|
||
| n = n_residues - 1 | ||
| cmap = np.tile(c_0, (n, 1, 1)) | ||
| omap = np.tile(o_0, (n, 1, 1)) | ||
| nmap = np.tile(n_1, (1, 1, n)).reshape(n, n, 3) | ||
| hmap = np.tile(h_1, (1, 1, n)).reshape(n, n, 3) | ||
| # We can reduce the need for a complete pairwise distance matrix by first searching for | ||
| # candidate pairs within a certain distance cutoff and only computing the energy | ||
| # for these relevant pairs rather than "potential" HBonds between far apart residues | ||
| pairs, d_on = capped_distance( | ||
| n_atoms, | ||
| o_atoms, | ||
| max_cutoff=HBOND_SEARCH_CUTOFF, | ||
| box=box, | ||
| ) | ||
|
|
||
| d_on = np.linalg.norm(omap - nmap, axis=-1) | ||
| d_ch = np.linalg.norm(cmap - hmap, axis=-1) | ||
| d_oh = np.linalg.norm(omap - hmap, axis=-1) | ||
| d_cn = np.linalg.norm(cmap - nmap, axis=-1) | ||
| # Exclude local pairs (i, i), (i, i+1), (i, i+2) that are too close for SS HBonds | ||
| local_mask = abs(pairs[:, 0] - pairs[:, 1]) >= 2 | ||
| pairs = pairs[local_mask] | ||
|
|
||
| # Exclude donor H absence (Proline) | ||
| if donor_mask is not None: | ||
| donor_indices = np.where(np.array(donor_mask) == 0)[0] | ||
| mask = ~np.isin(pairs[:, 0], donor_indices - 1) | ||
| pairs = pairs[mask] | ||
|
|
||
| # compute distances and energy as previously but only for the potential pairs | ||
| # still returning the same energy matrix that would have otherwise been made | ||
| o_indices = pairs[:, 1] | ||
| n_indices = pairs[:, 0] | ||
|
|
||
| # d_on = d_on[local_mask] | ||
| def _distances(x, y, box=None): | ||
| if box is None: | ||
| return np.linalg.norm(x[o_indices] - y[n_indices], axis=-1) | ||
| else: | ||
| return np.linalg.norm( | ||
| minimize_vectors(x[o_indices] - y[n_indices], box=box), | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or use directly MDAnalysis.lib.distances.calc_bonds return calc_bonds(x[o_indices], y[n_indices], box=box)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah this is good to know! (and yeah I missed it from naming). Will update to use that instead of this combo. |
||
| axis=-1, | ||
| ) | ||
|
|
||
| d_on = _distances(o_atoms, n_atoms, box) | ||
| d_ch = _distances(c_atoms, h_1, box) | ||
| d_oh = _distances(o_atoms, h_1, box) | ||
| d_cn = _distances(c_atoms, n_atoms, box) | ||
|
|
||
| # electrostatic interaction energy | ||
| # e[i, j] = e(CO_i) - e(NH_j) | ||
| e = np.pad( | ||
| e = np.zeros((n_residues, n_residues)) | ||
| e[n_indices + 1, o_indices] = ( | ||
| CONST_Q1Q2 | ||
| * (1.0 / d_on + 1.0 / d_ch - 1.0 / d_oh - 1.0 / d_cn) | ||
| * CONST_F, | ||
| [[1, 0], [0, 1]], | ||
| * CONST_F | ||
| ) | ||
|
|
||
| if return_e: # pragma: no cover | ||
| return e | ||
|
|
||
| # mask for local pairs (i,i), (i,i+1), (i,i+2) | ||
| local_mask = ~np.eye(n_residues, dtype=bool) | ||
| local_mask *= ~np.diag(np.ones(n_residues - 1, dtype=bool), k=-1) | ||
| local_mask *= ~np.diag(np.ones(n_residues - 2, dtype=bool), k=-2) | ||
| # mask for donor H absence (Proline) | ||
| donor_mask = ( | ||
| np.array(donor_mask).astype(float) | ||
| if donor_mask is not None | ||
| else np.ones(n_residues, dtype=float) | ||
| ) | ||
| donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_residues)) | ||
| # hydrogen bond map (continuous value extension of original definition) | ||
| hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin) | ||
| hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2 | ||
|
|
||
| assert hbond_map.shape == local_mask.shape == donor_mask.shape | ||
|
|
||
| hbond_map *= local_mask | ||
| hbond_map *= donor_mask | ||
|
|
||
| return hbond_map | ||
|
|
||
|
|
||
| def assign( | ||
| coord: np.ndarray, | ||
| donor_mask: np.ndarray = None, | ||
| donor_mask: np.ndarray | None = None, | ||
| box: np.ndarray | None = None, | ||
| ) -> np.ndarray: | ||
| """Assigns secondary structure for a given coordinate array, | ||
| either with or without assigned hydrogens | ||
|
|
@@ -240,12 +266,20 @@ def assign( | |
| (N, CA, C, O) atoms coordinates (if k=4), or (N, CA, C, O, H) coordinates | ||
| (when k=5). | ||
|
|
||
| donor_mask : np.array | ||
| donor_mask : np.array | None, optional | ||
| Mask out any hydrogens that should not be considered (in particular HN | ||
| in PRO). If ``None`` then all H will be used (behavior up to 2.9.0). | ||
|
|
||
| .. versionadded:: 2.10.0 | ||
|
|
||
| box : array_like, optional | ||
| The unitcell dimensions of the system, which can be orthogonal or | ||
| triclinic and must be provided in the same format as returned by | ||
| :attr:`MDAnalysis.coordinates.timestep.Timestep.dimensions`: | ||
| ``[lx, ly, lz, alpha, beta, gamma]``. | ||
|
|
||
| .. versionadded:: 2.11.0 | ||
|
|
||
| Returns | ||
| ------- | ||
| np.ndarray | ||
|
|
@@ -257,10 +291,12 @@ def assign( | |
|
|
||
| .. versionchanged:: 2.10.0 | ||
| Support masking of donors. | ||
| .. versionchanged:: 2.11.0 | ||
| Speedup with KDTree and support periodic boundary checking via `box` param. | ||
|
|
||
| """ | ||
| # get hydrogen bond map | ||
| hbmap = get_hbond_map(coord, donor_mask=donor_mask) | ||
| hbmap = get_hbond_map(coord, donor_mask=donor_mask, box=box) | ||
| hbmap = np.swapaxes(hbmap, -1, -2) # convert into "i:C=O, j:N-H" form | ||
|
|
||
| # identify turn 3, 4, 5 | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not 100% on what the warning should be - but this is how we are imagining it? @IAlibay
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes that looks about right. With regards to the warning, your message is close enough to what I was thinking, i.e. something like "Default unit cells of 1 A cubed found. This likely comes from a cryo-em file. To avoid usses, we are disabling periodic boundary conditions in the DSSP calculation".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A warning is currently already raised twice: once when loading the universe and again when running the DSSP (not sure why it is happening for the
run(). As the unit cell dimensions are already set toNoneit's not triggering the new warning. In most cases the unit cell wouldn't make it "all the way" to DSSP but I'll change it to just be the same warning message.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah this is what I was trying to convey earlier in the conversation when I was saying that the PDB reader takes care of that edge case already. I'm not sure what the best approach is here, thoughts @marinegor ?