Skip to content
Open
68 changes: 35 additions & 33 deletions package/MDAnalysis/analysis/dssp/pydssp_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@
"""

import numpy as np
from scipy.spatial import KDTree

CONST_Q1Q2 = 0.084
CONST_F = 332
DEFAULT_CUTOFF = -0.5
DEFAULT_MARGIN = 1.0
HBOND_SEARCH_CUTOFF = 5.0


def _upsample(a: np.ndarray, window: int) -> np.ndarray:
Expand Down Expand Up @@ -176,52 +178,52 @@ def get_hbond_map(
# h.shape == (n_residues, 3)
# coord.shape == (n_residues, 4, 3)

# distance matrix
n_1, c_0, o_0 = coord[1:, 0], coord[0:-1, 2], coord[0:-1, 3]
n_atoms, c_atoms, o_atoms = coord[1:, 0], coord[:-1, 2], coord[:-1, 3]

n = n_residues - 1
cmap = np.tile(c_0, (n, 1, 1))
omap = np.tile(o_0, (n, 1, 1))
nmap = np.tile(n_1, (1, 1, n)).reshape(n, n, 3)
hmap = np.tile(h_1, (1, 1, n)).reshape(n, n, 3)
# We can use KDTree to find candidate pairs within cutoff distance without having to
# compute a full pairwise distance matrix, much faster especially for larger proteins
n_kdtree = KDTree(n_atoms)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know the original code was vendored from somewhere else, so it explains the heavy reliance on linalg.norm and the lack of PBC awareness - however, now that we're making changes, should we be using one of our lib.distances calls (maybe capped_distance) instead?

Copy link
Member Author

@BradyAJohnston BradyAJohnston Dec 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not at all experienced with lib.distances are you talking about for the KDTree / potential pair generation or for the actual distance computation? If just for computing distances then on the KDTree generation we would already be discarding some potential across-boundary bonds.

We should be able to provide simple periodic checking with KDTree you can specify a boxsize parameter. But this wouldn't take into account the more complex boundaries we might come across?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay I just dug in and found capped_distance which should do exactly what we are after with better periodic boundary checking.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance isn't as good but also as expected with more checks. Still a large improvement over current.

============================================================
Scaling benchmark:
Residues     Time (ms)       Time/Res (μs)
------------------------------------------------------------
50           0.142           2.835
100          0.291           2.906
150          0.489           3.257
200          0.736           3.682
250          1.056           4.226
300          1.629           5.429
============================================================

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've currently updated the pair search to use capped_distances - but the actual distance computation still isn't taking into account periodic boundaries. Is there a periodic boundary aware distance calculation that doesn't do "everything vs everything" which distance_array seems to do? Ideally we would avoid repeatedly calling capped_distance as we have already constructed the KDTree once and can just use those results for computation later on, but still taking into account periodic boundary crossing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pair-wise distances between two equal groups of atoms can be calculated with MDAnalysis.lib.distances.calc_bonds.

o_kdtree = KDTree(o_atoms)

d_on = np.linalg.norm(omap - nmap, axis=-1)
d_ch = np.linalg.norm(cmap - hmap, axis=-1)
d_oh = np.linalg.norm(omap - hmap, axis=-1)
d_cn = np.linalg.norm(cmap - nmap, axis=-1)
# returned is a list of lists, with each neighbors[i] containing a list[int] of
# indices in the other o_kdtree that were within the cutoff which we can change
# to an array of index pairs for slicing
neighbors = n_kdtree.query_ball_tree(o_kdtree, r=HBOND_SEARCH_CUTOFF)
pairs = np.array(
[(i, j) for i, js in enumerate(neighbors) for j in js], dtype=np.int64
)

# electrostatic interaction energy
# e[i, j] = e(CO_i) - e(NH_j)
e = np.pad(
# Exclude local pairs (i, i), (i, i+1), (i, i+2) that are too close for HBonds
pairs = pairs[abs(pairs[:, 0] - pairs[:, 1]) >= 2]

# Exclude donor H absence (Proline)
if donor_mask is not None:
donor_indices = np.where(np.array(donor_mask) == 0)[0]
mask = ~np.isin(pairs[:, 0], donor_indices - 1)
pairs = pairs[mask]

# compute distances and energy as previously but only for the potential pairs
# still returning the same energy matrix that would have otherwise been made
o_indices = pairs[:, 1]
n_indices = pairs[:, 0]
d_on = np.linalg.norm(o_atoms[o_indices] - n_atoms[n_indices], axis=-1)
d_ch = np.linalg.norm(c_atoms[o_indices] - h_1[n_indices], axis=-1)
d_oh = np.linalg.norm(o_atoms[o_indices] - h_1[n_indices], axis=-1)
d_cn = np.linalg.norm(c_atoms[o_indices] - n_atoms[n_indices], axis=-1)

e = np.zeros((n_residues, n_residues))
e[n_indices + 1, o_indices] = (
CONST_Q1Q2
* (1.0 / d_on + 1.0 / d_ch - 1.0 / d_oh - 1.0 / d_cn)
* CONST_F,
[[1, 0], [0, 1]],
* CONST_F
)

if return_e: # pragma: no cover
return e

# mask for local pairs (i,i), (i,i+1), (i,i+2)
local_mask = ~np.eye(n_residues, dtype=bool)
local_mask *= ~np.diag(np.ones(n_residues - 1, dtype=bool), k=-1)
local_mask *= ~np.diag(np.ones(n_residues - 2, dtype=bool), k=-2)
# mask for donor H absence (Proline)
donor_mask = (
np.array(donor_mask).astype(float)
if donor_mask is not None
else np.ones(n_residues, dtype=float)
)
donor_mask = np.tile(donor_mask[:, np.newaxis], (1, n_residues))
# hydrogen bond map (continuous value extension of original definition)
hbond_map = np.clip(cutoff - margin - e, a_min=-margin, a_max=margin)
hbond_map = (np.sin(hbond_map / margin * np.pi / 2) + 1.0) / 2

assert hbond_map.shape == local_mask.shape == donor_mask.shape

hbond_map *= local_mask
hbond_map *= donor_mask

return hbond_map


Expand Down
Loading