44
55import logging
66import sys
7+ import typing
78import warnings
89from pathlib import Path
910from typing import Optional , Union
1011
12+ from Bio .PDB .Atom import DisorderedAtom
1113from Bio .PDB .Chain import Chain
1214from Bio .PDB .MMCIFParser import MMCIFParser
15+ from Bio .PDB .Model import Model
1316from Bio .PDB .PDBExceptions import PDBConstructionWarning
1417from Bio .PDB .PDBParser import PDBParser
1518from Bio .PDB .Polypeptide import PPBuilder , is_aa
@@ -26,112 +29,110 @@ def get_parser(input_f: Path) -> Union[PDBParser, MMCIFParser]:
2629 return PDBParser ()
2730
2831
32+ def ignore (r ):
33+ return r .id [0 ][0 ] == "W" or r .id [0 ][0 ] == "H"
34+
35+
2936def validate_structure (
30- s : Structure , selection : Optional [list [str ]] = None , clean : bool = True
31- ) -> Structure :
32-
33- # setup logging
34- logger = logging .getLogger ("Prodigy" )
35-
36- # Keep first model only
37- if len (s ) > 1 :
38- logger .warning (
39- (
40- "[!] Structure contains more than one model."
41- " Only the first one will be kept"
42- )
43- )
44- model_one = s [0 ].id
45- for m in s .child_list [:]:
46- if m .id != model_one :
47- s .detach_child (m .id )
48-
49- # process selected chains
50- chains : list [Chain ] = list (s .get_chains ())
51- chain_ids = set ([c .id for c in chains ])
52-
53- if selection :
54- sel_chains = []
55- # Match selected chain with structure
56- for sel in selection :
57- for c_str in sel .split ("," ):
58- sel_chains .append (c_str )
59- if c_str not in chain_ids :
37+ input_strcture_obj : Structure ,
38+ selection : Optional [list [str ]] = None ,
39+ clean : bool = True ,
40+ ) -> list [Model ]:
41+
42+ result : list [Model ] = []
43+ for model in [m for m in input_strcture_obj .child_list ]:
44+
45+ # process selected chains
46+ chains : list [Chain ] = list (model .get_chains ())
47+ chain_ids = set ([c .id for c in chains ])
48+
49+ if selection :
50+ sel_chains = []
51+ # Match selected chain with structure
52+ for sel in selection :
53+ for c_str in sel .split ("," ):
54+ sel_chains .append (c_str )
55+ if c_str not in chain_ids :
56+ raise ValueError (
57+ f"Selected chain not present in provided structure: { c_str } "
58+ )
59+
60+ # Remove unselected chains
61+ def _ignore_helper (x ) -> bool :
62+ return x .id not in sel_chains
63+
64+ for c in chains :
65+ if _ignore_helper (c ):
66+ if c .parent is not None :
67+ c .parent .detach_child (c .id )
68+
69+ # Double occupancy check
70+ for atom in list (model .get_atoms ()):
71+ if atom .is_disordered ():
72+ atom = typing .cast (DisorderedAtom , atom )
73+ residue = atom .parent
74+ assert residue is not None
75+ sel_at = atom .selected_child
76+ assert sel_at is not None
77+ sel_at .altloc = " "
78+ sel_at .disordered_flag = 0
79+ residue .detach_child (atom .id )
80+ residue .add (sel_at )
81+
82+ # Insertion code check
83+ for c in chains :
84+ for residue in c .get_residues ():
85+ if residue .get_id ()[2 ] != " " :
86+ c .detach_child (residue .id )
87+
88+ if clean :
89+ # Remove HETATMs and solvent
90+ res_list = list (model .get_residues ())
91+
92+ for res in res_list :
93+ if ignore (res ):
94+ chain = res .parent
95+ assert chain is not None
96+ chain .detach_child (res .id )
97+ elif not is_aa (res , standard = True ):
6098 raise ValueError (
61- f"Selected chain not present in provided structure: { c_str } "
99+ "Unsupported non-standard amino acid found: {0}" .format (
100+ res .resname
101+ )
62102 )
63103
64- # Remove unselected chains
65- def _ignore_helper (x ) -> bool :
66- return x .id not in sel_chains
67-
68- for c in chains :
69- if _ignore_helper (c ):
70- if c .parent is not None :
71- c .parent .detach_child (c .id )
72-
73- # Double occupancy check
74- for atom in list (s .get_atoms ()):
75- if atom .is_disordered ():
76- residue = atom .parent
77- sel_at = atom .selected_child
78- sel_at .altloc = " "
79- sel_at .disordered_flag = 0
80- residue .detach_child (atom .id )
81- residue .add (sel_at )
82-
83- # Insertion code check
84- for c in chains :
85- for residue in c .get_residues ():
86- if residue .get_id ()[2 ] != " " :
87- c .detach_child (residue .id )
88-
89- if clean :
90- # Remove HETATMs and solvent
91- res_list = list (s .get_residues ())
92-
93- def _ignore (r ):
94- return r .id [0 ][0 ] == "W" or r .id [0 ][0 ] == "H"
95-
96- for res in res_list :
97- if _ignore (res ):
98- chain = res .parent
99- chain .detach_child (res .id )
100- elif not is_aa (res , standard = True ):
101- raise ValueError (
102- "Unsupported non-standard amino acid found: {0}" .format (res .resname )
104+ # Remove Hydrogens
105+ atom_list = list (model .get_atoms ())
106+
107+ def _ignore (x ):
108+ return x .element == "H"
109+
110+ for atom in atom_list :
111+ if _ignore (atom ):
112+ residue = atom .parent
113+ assert residue is not None
114+ residue .detach_child (atom .name )
115+
116+ # Detect gaps and compare with no. of chains
117+ pep_builder = PPBuilder ()
118+ peptides = pep_builder .build_peptides (model )
119+ n_peptides = len (peptides )
120+
121+ if n_peptides != len (chain_ids ):
122+ message = "[!] Structure contains gaps:\n "
123+ for i_pp , pp in enumerate (peptides ):
124+ message += (
125+ "\t {1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > "
126+ "{2.parent.id} {2.resname}{2.id[1]}\n " .format (i_pp , pp [0 ], pp [- 1 ])
103127 )
128+ log .warning (message )
104129
105- # Remove Hydrogens
106- atom_list = list (s .get_atoms ())
130+ result .append (model )
107131
108- def _ignore (x ):
109- return x .element == "H"
132+ return result
110133
111- for atom in atom_list :
112- if _ignore (atom ):
113- residue = atom .parent
114- residue .detach_child (atom .name )
115134
116- # Detect gaps and compare with no. of chains
117- pep_builder = PPBuilder ()
118- peptides = pep_builder .build_peptides (s )
119- n_peptides = len (peptides )
120-
121- if n_peptides != len (chain_ids ):
122- message = "[!] Structure contains gaps:\n "
123- for i_pp , pp in enumerate (peptides ):
124- message += (
125- "\t {1.parent.id} {1.resname}{1.id[1]} < Fragment {0} > "
126- "{2.parent.id} {2.resname}{2.id[1]}\n " .format (i_pp , pp [0 ], pp [- 1 ])
127- )
128- logger .warning (message )
129- # raise Exception(message)
130-
131- return s
132-
133-
134- def parse_structure (path : str ) -> tuple [Structure , int , int ]:
135+ def parse_structure (path : str ) -> tuple [list [Model ], int , int ]:
135136 """Return a validated `Structure`, number of chains and number of residues"""
136137
137138 extension = Path (path ).suffix
@@ -154,13 +155,33 @@ def parse_structure(path: str) -> tuple[Structure, int, int]:
154155
155156 assert isinstance (original_structure , Structure )
156157
157- structure = validate_structure (original_structure )
158+ models : list [ Model ] = validate_structure (original_structure )
158159
159160 # Get number of chains
160- number_of_chains = len (set ([c .id for c in structure .get_chains ()]))
161+ chain_dict = {}
162+ res_dict = {}
163+ for model in models :
164+ chain_dict .update ({c .id : c for c in model .get_chains ()})
165+ res_dict .update ({r .id : r for r in model .get_residues ()})
166+
167+ ## Make sure all models have the same chains
168+ # Get chain sets for all models
169+ chain_sets = [set (chain .id for chain in model .get_chains ()) for model in models ]
170+
171+ # Check if all sets are identical
172+ if not all (chain_set == chain_sets [0 ] for chain_set in chain_sets ):
173+ raise ValueError (
174+ "Not all models have the same chains. Found chain sets: "
175+ + ", " .join (str (s ) for s in chain_sets )
176+ )
177+
178+ res_sets = [set (res .id for res in model .get_residues ()) for model in models ]
161179
162- # Get number of residues
163- number_of_residues = len (list (structure .get_residues ()))
180+ if not all (res_set == res_sets [0 ] for res_set in res_sets ):
181+ raise ValueError (
182+ "Not all models have the same residues. Found residue sets: "
183+ + ", " .join (str (s ) for s in res_sets )
184+ )
164185
165186 # structure, n_chains, n_res = parse_structure(path=str(struct_path))
166- return (structure , number_of_chains , number_of_residues )
187+ return (models , len ( chain_sets [ 0 ]), len ( res_sets [ 0 ]) )
0 commit comments