fix: struct_conn bond matching when auth_asym_id differs from label_asym_id

N283T · N283T · commit 1a606f785fbd · 2026-01-17T21:04:51.000+09:00
When ptnr_label_seq_id is "." for non-polymers, the code correctly falls
back to ptnr_auth_seq_id to determine the residue ID. However, the
matching logic was using atom_array.res_id (which contains label_seq_id
values) instead of auth_seq_id, causing inter-chain bonds to be missed.

This fix:
- Extracts auth_seq_id from atom_array annotations
- Tracks which partner uses auth_seq_id fallback
- Uses the appropriate res_ids array for matching each partner

Fixes 35 affected PDB entries including 9b5c, 8an9, 1lgc.
diff --git a/src/atomworks/io/utils/bonds.py b/src/atomworks/io/utils/bonds.py
@@ -369,6 +369,7 @@ def match_or_wildcard(array: np.ndarray, value: str) -> np.ndarray:
     global_atom_idx = np.arange(atom_array.array_length())
     alt_atom_ids = get_annotation(atom_array, "alt_atom_id", default=atom_names)
     uses_alt_atom_id = get_annotation(atom_array, "uses_alt_atom_id", default=np.zeros(len(atom_array), dtype=bool))
+    auth_seq_ids = get_annotation(atom_array, "auth_seq_id", default=None)
 
     all_res_names = np.append(np.unique(res_names), "*")
     all_chain_ids = np.unique(chain_ids)
@@ -424,35 +425,38 @@ def match_or_wildcard(array: np.ndarray, value: str) -> np.ndarray:
         # For non-polymers, we use the auth_seq_id if available and valid (i.e., not "." or "?"); otherwise we use the label_seq_id
         # (Required to avoid ambiguity, since if using `label` only we may have multiple residue within a
         # chain with the same label_seq_id and the same res_name; see: 6MUB)
-
-        res_id1 = int(
-            row["ptnr1_label_seq_id"]
-            if ((chain_id1 in relevant_polymer_chain_identifiers) or ("ptnr1_auth_seq_id" not in row))
+        # We track which partner uses auth_seq_id so we can use the correct array for matching below.
+        use_auth_seq_id1 = not (
+            ((chain_id1 in relevant_polymer_chain_identifiers) or ("ptnr1_auth_seq_id" not in row))
             and row["ptnr1_label_seq_id"] != "."
-            else row["ptnr1_auth_seq_id"]
         )
-        res_id2 = int(
-            row["ptnr2_label_seq_id"]
-            if ((chain_id2 in relevant_polymer_chain_identifiers) or ("ptnr2_auth_seq_id" not in row))
+        use_auth_seq_id2 = not (
+            ((chain_id2 in relevant_polymer_chain_identifiers) or ("ptnr2_auth_seq_id" not in row))
             and row["ptnr2_label_seq_id"] != "."
-            else row["ptnr2_auth_seq_id"]
         )
 
+        res_id1 = int(row["ptnr1_auth_seq_id"] if use_auth_seq_id1 else row["ptnr1_label_seq_id"])
+        res_id2 = int(row["ptnr2_auth_seq_id"] if use_auth_seq_id2 else row["ptnr2_label_seq_id"])
+
         ins_code1 = row.get("pdbx_ptnr1_PDB_ins_code", "")
         ins_code2 = row.get("pdbx_ptnr2_PDB_ins_code", "")
         ins_code1 = "" if ins_code1 in (".", "?") else ins_code1
         ins_code2 = "" if ins_code2 in (".", "?") else ins_code2
 
+        # Match against auth_seq_ids when the auth_seq_id fallback was used above; if that annotation is absent, keep matching on res_ids
+        res_ids_for_match1 = auth_seq_ids.astype(int) if (use_auth_seq_id1 and auth_seq_ids is not None) else res_ids
+        res_ids_for_match2 = auth_seq_ids.astype(int) if (use_auth_seq_id2 and auth_seq_ids is not None) else res_ids
+
         # ... get masks for the residues to which atoms 1 & 2 belong
         in_res1 = (
             (relevant_chain_identifiers == chain_id1)
-            & (res_ids == res_id1)
+            & (res_ids_for_match1 == res_id1)
             & match_or_wildcard(res_names, res_name1)
             & (ins_codes == ins_code1)
         )
         in_res2 = (
             (relevant_chain_identifiers == chain_id2)
-            & (res_ids == res_id2)
+            & (res_ids_for_match2 == res_id2)
             & match_or_wildcard(res_names, res_name2)
             & (ins_codes == ins_code2)
         )
diff --git a/tests/io/components/test_struct_conn_auth_seq_id_fallback.py b/tests/io/components/test_struct_conn_auth_seq_id_fallback.py
@@ -0,0 +1,36 @@
+"""Regression test for struct_conn auth_seq_id fallback bug.
+
+When ptnr_label_seq_id is "." for non-polymers, the code correctly falls back to
+ptnr_auth_seq_id to determine the residue ID. However, the matching logic was using
+atom_array.res_id (which contains label_seq_id values) instead of auth_seq_id,
+causing inter-chain bonds to be missed when auth_asym_id differs from label_asym_id.
+"""
+
+import pytest
+
+from atomworks.io.parser import parse
+from tests.io.conftest import get_pdb_path
+
+# Representative PDB entries affected by the bug
+AFFECTED_PDB_IDS = ["9b5c", "8an9", "1lgc"]
+
+
+@pytest.mark.parametrize("pdb_id", AFFECTED_PDB_IDS)
+def test_struct_conn_auth_seq_id_fallback(pdb_id: str):
+    """Verify inter-chain bonds are found when label_seq_id='.' requires auth_seq_id fallback."""
+    path = get_pdb_path(pdb_id)
+    result = parse(filename=path)
+    # Use asym_unit (the raw parsed structure) for bond checking
+    atom_array = result["asym_unit"][0]
+
+    assert atom_array.bonds is not None
+    bonds = atom_array.bonds.as_array()
+    chain_ids = atom_array.chain_id
+
+    # Collect the chain-ID pair of every bond whose two atoms lie in different chains
+    inter_chain_bonds = [(chain_ids[b[0]], chain_ids[b[1]]) for b in bonds if chain_ids[b[0]] != chain_ids[b[1]]]
+    assert len(inter_chain_bonds) > 0, f"No inter-chain bonds found for {pdb_id}"
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])