Skip to content

Commit a4b5265

Browse files
committed
refactor from_cif_file; add from_custom_cif_file docs
1 parent c6bf7a2 commit a4b5265

File tree

1 file changed

+131
-103
lines changed

1 file changed

+131
-103
lines changed

src/plinder/data/utils/annotations/aggregate_annotations.py

Lines changed: 131 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,61 @@ def from_json(
891891
max_ligand_chains=max_ligand_chains,
892892
)
893893

894+
def _populate_chains(self, ent: ty.Any, info: ty.Any) -> None:
895+
"""Set entry.chains and entry.water_chains from a loaded OST entity."""
896+
self.chains = {
897+
chain.name: Chain.from_ost_chain(
898+
chain, info, len(self.chain_to_seqres.get(chain.name, ""))
899+
)
900+
for chain in ent.chains
901+
if chain.type != mol.CHAINTYPE_WATER
902+
}
903+
self.water_chains = [
904+
chain.name for chain in ent.chains if chain.type == mol.CHAINTYPE_WATER
905+
]
906+
907+
def _collect_ligands_from_biounit(
908+
self,
909+
biounit: ty.Any,
910+
biounit_id: str,
911+
interface_proximal_gaps: dict[str, ty.Any],
912+
plip_complex_threshold: float,
913+
neighboring_residue_threshold: float,
914+
neighboring_ligand_threshold: float,
915+
data_dir: Path | None,
916+
) -> dict[str, "Ligand"]:
917+
"""Create Ligand objects for every ligand chain in a single biounit."""
918+
ligands: dict[str, Ligand] = {}
919+
biounit_ligand_chains = [
920+
chain.name
921+
for chain in biounit.chains
922+
if chain.name.split(".")[1] in self.ligand_like_chains
923+
]
924+
for ligand_chain in biounit_ligand_chains:
925+
ligand_instance, ligand_asym_id = ligand_chain.split(".")
926+
residue_numbers = [
927+
residue.number.num
928+
for residue in biounit.FindChain(ligand_chain).residues
929+
]
930+
ligand = Ligand.from_pli(
931+
pdb_id=self.pdb_id,
932+
biounit_id=biounit_id,
933+
biounit=biounit,
934+
ligand_instance=int(ligand_instance),
935+
ligand_chain=self.chains[ligand_asym_id],
936+
residue_numbers=residue_numbers,
937+
ligand_like_chains=self.ligand_like_chains,
938+
interface_proximal_gaps=interface_proximal_gaps,
939+
all_covalent_dict=self.covalent_bonds,
940+
plip_complex_threshold=plip_complex_threshold,
941+
neighboring_residue_threshold=neighboring_residue_threshold,
942+
neighboring_ligand_threshold=neighboring_ligand_threshold,
943+
data_dir=data_dir,
944+
)
945+
if ligand is not None:
946+
ligands[ligand.id] = ligand
947+
return ligands
948+
894949
@classmethod
895950
def from_cif_file(
896951
cls,
@@ -909,32 +964,29 @@ def from_cif_file(
909964
symmetry_mate_contact_threshold: float = 5.0,
910965
) -> Entry:
911966
"""
912-
Load an entry object from mmcif files
967+
Load an entry object from mmCIF files in the pipeline
913968
914969
Parameters
915970
----------
916971
cif_file : Path
917-
mmcif files of interest
972+
mmCIF file of interest
918973
neighboring_residue_threshold : float
919-
Distance from ligand for protein \
920-
residues to be considered a ligand
974+
Distance from ligand for protein residues to be considered a ligand
921975
neighboring_ligand_threshold : float
922-
Distance from ligand for other ligans \
923-
to be considered a ligand
976+
Distance from ligand for other ligands to be considered a ligand
924977
min_polymer_size : int = 10
925-
Minimum number of residues for chain to be seen as a \
926-
polymer, or Maximum number of residues for chain to be seen as a ligand \
978+
Minimum number of residues for chain to be seen as a polymer,
979+
or maximum number of residues for chain to be seen as a ligand
927980
max_non_small_mol_ligand_length: int = 20
928-
Maximum length of polymer that should be assessed for potentially being ligand
981+
Maximum length of polymer to be assessed for potentially being ligand
929982
save_folder : Path
930983
Path to save files
931984
max_protein_chains_to_save : int
932985
Maximum number of protein chains to save
933986
max_ligand_chains_to_save : int
934987
Maximum number of ligand chains to save
935988
plip_complex_threshold=10
936-
Maximum distance from ligand to residues to be
937-
included for plip calculations.
989+
Maximum distance from ligand to residues to be included for plip calculations
938990
skip_save_systems: bool = False
939991
skips saving system files
940992
skip_posebusters: bool = False
@@ -954,9 +1006,6 @@ def from_cif_file(
9541006
)
9551007
entry_info = get_entry_info(cif_data)
9561008
per_chain = get_chain_external_mappings(cif_data)
957-
# TODO: annotate_interface_gaps does not use the same ligand chain definitions as the rest
958-
# move this to later after protein/ligand chain assignment?
959-
interface_proximal_gaps = annotate_interface_gaps(cif_file)
9601009
resolution = entry_info.get("entry_resolution")
9611010
r = None
9621011
if resolution is not None:
@@ -984,16 +1033,7 @@ def from_cif_file(
9841033
chain_to_seqres={c.name: c.string for c in seqres},
9851034
symmetry_mate_contacts=symmetry_mate_contacts,
9861035
)
987-
entry.chains = {
988-
chain.name: Chain.from_ost_chain(
989-
chain, info, len(entry.chain_to_seqres.get(chain.name, ""))
990-
)
991-
for chain in ent.chains
992-
if chain.type != mol.CHAINTYPE_WATER
993-
}
994-
entry.water_chains = [
995-
chain.name for chain in ent.chains if chain.type == mol.CHAINTYPE_WATER
996-
]
1036+
entry._populate_chains(ent, info)
9971037

9981038
if save_folder is not None and data_dir is None:
9991039
data_dir = save_folder.parent.parent
@@ -1006,44 +1046,29 @@ def from_cif_file(
10061046
entry.ligand_like_chains = detect_ligand_chains(
10071047
ent, entry, min_polymer_size, max_non_small_mol_ligand_length
10081048
)
1009-
ligands = {}
1049+
protein_chains = [c for c in entry.chains if c not in entry.ligand_like_chains]
1050+
interface_proximal_gaps = annotate_interface_gaps(
1051+
cif_file,
1052+
protein_chains=protein_chains,
1053+
ligand_chains=list(entry.ligand_like_chains.keys()),
1054+
)
1055+
ligands: dict[str, Ligand] = {}
10101056
biounits = {}
10111057
for biounit_info in info.biounits:
1012-
biounit = mol.alg.CreateBU(ent, biounit_info)
10131058
# note, biounit chains are renamed to 1.A, 1.B, etc.
1014-
biounit_ligand_chains = [
1015-
chain.name
1016-
for chain in biounit.chains
1017-
if chain.name.split(".")[1] in entry.ligand_like_chains
1018-
]
1019-
for ligand_chain in biounit_ligand_chains:
1020-
ligand_instance, ligand_asym_id = ligand_chain.split(".")
1021-
if save_folder is not None and data_dir is None:
1022-
data_dir = save_folder.parent.parent
1023-
residue_numbers = [
1024-
residue.number.num
1025-
for residue in biounit.FindChain(ligand_chain).residues
1026-
]
1027-
ligand = Ligand.from_pli(
1028-
pdb_id=entry.pdb_id,
1029-
biounit_id=biounit_info.id,
1030-
biounit=biounit,
1031-
ligand_instance=int(ligand_instance),
1032-
ligand_chain=entry.chains[ligand_asym_id],
1033-
residue_numbers=residue_numbers,
1034-
ligand_like_chains=entry.ligand_like_chains,
1035-
interface_proximal_gaps=interface_proximal_gaps,
1036-
all_covalent_dict=entry.covalent_bonds,
1037-
plip_complex_threshold=plip_complex_threshold,
1038-
neighboring_residue_threshold=neighboring_residue_threshold,
1039-
neighboring_ligand_threshold=neighboring_ligand_threshold,
1040-
data_dir=data_dir,
1041-
)
1042-
if ligand is not None:
1043-
ligands[ligand.id] = ligand
1044-
# label crystal contacts
1045-
ligand.label_crystal_contacts(entry.symmetry_mate_contacts)
1046-
1059+
biounit = mol.alg.CreateBU(ent, biounit_info)
1060+
new_ligands = entry._collect_ligands_from_biounit(
1061+
biounit,
1062+
biounit_info.id,
1063+
interface_proximal_gaps,
1064+
plip_complex_threshold,
1065+
neighboring_residue_threshold,
1066+
neighboring_ligand_threshold,
1067+
data_dir,
1068+
)
1069+
for ligand in new_ligands.values():
1070+
ligand.label_crystal_contacts(entry.symmetry_mate_contacts)
1071+
ligands.update(new_ligands)
10471072
biounits[biounit_info.id] = biounit
10481073
entry.set_systems(ligands)
10491074
entry.label_chains()
@@ -1081,63 +1106,66 @@ def from_custom_cif_file(
10811106
max_protein_chains_to_save: int = 5,
10821107
max_ligand_chains_to_save: int = 5,
10831108
) -> Entry:
1109+
"""
1110+
Create an entry from an external mmCIF file
1111+
1112+
Parameters
1113+
----------
1114+
pdb_id : str
1115+
Identifier to be used in the PDB ID column
1116+
cif_file : Path
1117+
mmCIF file of interest
1118+
neighboring_residue_threshold : float, optional
1119+
Distance from ligand for protein residues to be considered a ligand,
1120+
by default 6.0
1121+
neighboring_ligand_threshold : float, optional
1122+
Distance from ligand for other ligands to be considered a ligand,
1123+
by default 4.0
1124+
min_polymer_size : int, optional
1125+
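Minimum number of residues for chain to be seen as a polymer, or maximum number of residues for chain to be seen as a ligand, by default 10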
1126+
save_folder : Path | None, optional
1127+
Path to save files, by default None
1128+
max_protein_chains_to_save : int, optional
1129+
Maximum number of protein chains to save, by default 5
1130+
max_ligand_chains_to_save : int, optional
1131+
Maximum number of ligand chains to save, by default 5
1132+
1133+
Returns
1134+
-------
1135+
Entry
1136+
Entry object for the given pdb_id
1137+
"""
10841138
ent, seqres, info = io.LoadMMCIF(
10851139
str(cif_file), seqres=True, info=True, remote=False
10861140
)
10871141
entry = cls(
10881142
pdb_id=pdb_id,
10891143
chain_to_seqres={c.name: c.string for c in seqres},
10901144
)
1091-
entry.chains = {
1092-
chain.name: Chain.from_ost_chain(
1093-
chain, info, len(entry.chain_to_seqres.get(chain.name, ""))
1094-
)
1095-
for chain in ent.chains
1096-
if chain.type != mol.CHAINTYPE_WATER
1097-
}
1098-
entry.water_chains = [
1099-
chain.name for chain in ent.chains if chain.type == mol.CHAINTYPE_WATER
1100-
]
1145+
entry._populate_chains(ent, info)
11011146
entry.ligand_like_chains = detect_ligand_chains(
11021147
ent, entry, min_polymer_size, max_non_small_mol_ligand_length
11031148
)
1149+
protein_chains = [c for c in entry.chains if c not in entry.ligand_like_chains]
1150+
interface_proximal_gaps = annotate_interface_gaps(
1151+
cif_file,
1152+
protein_chains=protein_chains,
1153+
ligand_chains=list(entry.ligand_like_chains.keys()),
1154+
)
11041155
biounit = ent.Copy()
11051156
edi = biounit.EditXCS(mol.BUFFERED_EDIT)
11061157
for chain in biounit.chains:
11071158
edi.RenameChain(chain, f"1.{chain.name}")
11081159
edi.UpdateICS()
1109-
biounit_ligand_chains = [
1110-
chain.name
1111-
for chain in biounit.chains
1112-
if chain.name.split(".")[1] in entry.ligand_like_chains
1113-
]
1114-
ligands = {}
1115-
for ligand_chain in biounit_ligand_chains:
1116-
ligand_instance, ligand_asym_id = ligand_chain.split(".")
1117-
residue_numbers = [
1118-
residue.number.num
1119-
for residue in biounit.FindChain(ligand_chain).residues
1120-
]
1121-
ligand = Ligand.from_pli(
1122-
pdb_id=entry.pdb_id,
1123-
biounit_id="1",
1124-
biounit=biounit,
1125-
ligand_instance=int(ligand_instance),
1126-
ligand_chain=entry.chains[ligand_asym_id],
1127-
residue_numbers=residue_numbers,
1128-
ligand_like_chains=entry.ligand_like_chains,
1129-
interface_proximal_gaps={
1130-
"ppi_interface_gap_annotation": {},
1131-
"ligand_interface_gap_annotation": {},
1132-
},
1133-
all_covalent_dict=entry.covalent_bonds,
1134-
plip_complex_threshold=plip_complex_threshold,
1135-
neighboring_residue_threshold=neighboring_residue_threshold,
1136-
neighboring_ligand_threshold=neighboring_ligand_threshold,
1137-
data_dir=None,
1138-
)
1139-
if ligand is not None:
1140-
ligands[ligand.id] = ligand
1160+
ligands = entry._collect_ligands_from_biounit(
1161+
biounit,
1162+
"1", # TODO: @JAY - is this necessary to be different from `from_cif_file` ?
1163+
interface_proximal_gaps,
1164+
plip_complex_threshold,
1165+
neighboring_residue_threshold,
1166+
neighboring_ligand_threshold,
1167+
data_dir=None,
1168+
)
11411169
entry.set_systems(ligands)
11421170
entry.label_chains()
11431171
if save_folder is not None:
@@ -1317,8 +1345,8 @@ def format(
13171345
self, criteria: QualityCriteria = QualityCriteria()
13181346
) -> dict[str, ty.Any]:
13191347
"""
1320-
Format label for entry-level annotations by prepending \
1321-
label with "entry_"
1348+
Format label for entry-level annotations by prepending label with "entry_"
1349+
13221350
Parameters
13231351
----------
13241352
self : Entry

0 commit comments

Comments
 (0)