Skip to content

Commit 1a606f7

Browse files
committed
fix: struct_conn bond matching when auth_asym_id differs from label_asym_id
When ptnr_label_seq_id is "." for non-polymers, the code correctly falls back to ptnr_auth_seq_id to determine the residue ID. However, the matching logic was using atom_array.res_id (which contains label_seq_id values) instead of auth_seq_id, causing inter-chain bonds to be missed.

This fix:
- Extracts auth_seq_id from atom_array annotations
- Tracks which partner uses auth_seq_id fallback
- Uses the appropriate res_ids array for matching each partner

Fixes 35 affected PDB entries including 9b5c, 8an9, 1lgc.
1 parent 9b21130 commit 1a606f7

File tree

2 files changed

+51
-11
lines changed

2 files changed

+51
-11
lines changed

src/atomworks/io/utils/bonds.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,7 @@ def match_or_wildcard(array: np.ndarray, value: str) -> np.ndarray:
369369
global_atom_idx = np.arange(atom_array.array_length())
370370
alt_atom_ids = get_annotation(atom_array, "alt_atom_id", default=atom_names)
371371
uses_alt_atom_id = get_annotation(atom_array, "uses_alt_atom_id", default=np.zeros(len(atom_array), dtype=bool))
372+
auth_seq_ids = get_annotation(atom_array, "auth_seq_id", default=None)
372373

373374
all_res_names = np.append(np.unique(res_names), "*")
374375
all_chain_ids = np.unique(chain_ids)
@@ -424,35 +425,38 @@ def match_or_wildcard(array: np.ndarray, value: str) -> np.ndarray:
424425
# For non-polymers, we use the auth_seq_id if available and valid (i.e., not "." or "?"); otherwise we use the label_seq_id
425426
# (Required to avoid ambiguity, since if using `label` only we may have multiple residues within a
426427
# chain with the same label_seq_id and the same res_name; see: 6MUB)
427-
428-
res_id1 = int(
429-
row["ptnr1_label_seq_id"]
430-
if ((chain_id1 in relevant_polymer_chain_identifiers) or ("ptnr1_auth_seq_id" not in row))
428+
# We track which partner uses auth_seq_id so we can use the correct array for matching below.
429+
use_auth_seq_id1 = not (
430+
((chain_id1 in relevant_polymer_chain_identifiers) or ("ptnr1_auth_seq_id" not in row))
431431
and row["ptnr1_label_seq_id"] != "."
432-
else row["ptnr1_auth_seq_id"]
433432
)
434-
res_id2 = int(
435-
row["ptnr2_label_seq_id"]
436-
if ((chain_id2 in relevant_polymer_chain_identifiers) or ("ptnr2_auth_seq_id" not in row))
433+
use_auth_seq_id2 = not (
434+
((chain_id2 in relevant_polymer_chain_identifiers) or ("ptnr2_auth_seq_id" not in row))
437435
and row["ptnr2_label_seq_id"] != "."
438-
else row["ptnr2_auth_seq_id"]
439436
)
440437

438+
res_id1 = int(row["ptnr1_auth_seq_id"] if use_auth_seq_id1 else row["ptnr1_label_seq_id"])
439+
res_id2 = int(row["ptnr2_auth_seq_id"] if use_auth_seq_id2 else row["ptnr2_label_seq_id"])
440+
441441
ins_code1 = row.get("pdbx_ptnr1_PDB_ins_code", "")
442442
ins_code2 = row.get("pdbx_ptnr2_PDB_ins_code", "")
443443
ins_code1 = "" if ins_code1 in (".", "?") else ins_code1
444444
ins_code2 = "" if ins_code2 in (".", "?") else ins_code2
445445

446+
# Use auth_seq_ids for matching when auth_seq_id fallback was used above
447+
res_ids_for_match1 = auth_seq_ids.astype(int) if (use_auth_seq_id1 and auth_seq_ids is not None) else res_ids
448+
res_ids_for_match2 = auth_seq_ids.astype(int) if (use_auth_seq_id2 and auth_seq_ids is not None) else res_ids
449+
446450
# ... get masks for the residues to which atoms 1 & 2 belong
447451
in_res1 = (
448452
(relevant_chain_identifiers == chain_id1)
449-
& (res_ids == res_id1)
453+
& (res_ids_for_match1 == res_id1)
450454
& match_or_wildcard(res_names, res_name1)
451455
& (ins_codes == ins_code1)
452456
)
453457
in_res2 = (
454458
(relevant_chain_identifiers == chain_id2)
455-
& (res_ids == res_id2)
459+
& (res_ids_for_match2 == res_id2)
456460
& match_or_wildcard(res_names, res_name2)
457461
& (ins_codes == ins_code2)
458462
)
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""Regression test for struct_conn auth_seq_id fallback bug.
2+
3+
When ptnr_label_seq_id is "." for non-polymers, the code correctly falls back to
4+
ptnr_auth_seq_id to determine the residue ID. However, the matching logic was using
5+
atom_array.res_id (which contains label_seq_id values) instead of auth_seq_id,
6+
causing inter-chain bonds to be missed when auth_asym_id differs from label_asym_id.
7+
"""
8+
9+
import pytest
10+
11+
from atomworks.io.parser import parse
12+
from tests.io.conftest import get_pdb_path
13+
14+
# Representative PDB entries affected by the bug
15+
AFFECTED_PDB_IDS = ["9b5c", "8an9", "1lgc"]
16+
17+
18+
@pytest.mark.parametrize("pdb_id", AFFECTED_PDB_IDS)
19+
def test_struct_conn_auth_seq_id_fallback(pdb_id: str):
20+
"""Verify inter-chain bonds are found when label_seq_id='.' requires auth_seq_id fallback."""
21+
path = get_pdb_path(pdb_id)
22+
result = parse(filename=path)
23+
# Use asym_unit (the raw parsed structure) for bond checking
24+
atom_array = result["asym_unit"][0]
25+
26+
assert atom_array.bonds is not None
27+
bonds = atom_array.bonds.as_array()
28+
chain_ids = atom_array.chain_id
29+
30+
# Check for inter-chain bonds
31+
inter_chain_bonds = [(chain_ids[b[0]], chain_ids[b[1]]) for b in bonds if chain_ids[b[0]] != chain_ids[b[1]]]
32+
assert len(inter_chain_bonds) > 0, f"No inter-chain bonds found for {pdb_id}"
33+
34+
35+
if __name__ == "__main__":
36+
pytest.main([__file__])

0 commit comments

Comments
 (0)