3434from tqdm import tqdm
3535
3636from alphafold3_pytorch .tensor_typing import IntType , typecheck
37- from alphafold3_pytorch .utils .utils import exists
37+ from alphafold3_pytorch .utils .utils import exists , np_mode
3838from scripts .filter_pdb_mmcifs import parse_mmcif_object
3939
4040# Constants
6060 "A" : "A" ,
6161 "C" : "C" ,
6262 "G" : "G" ,
63+ "T" : "U" , # NOTE: This mapping is required for PDBs such as `41Od`
6364 "U" : "U" ,
6465}
6566DNA_LETTERS_1TO3 = {
6667 "A" : "DA" ,
6768 "C" : "DC" ,
6869 "G" : "DG" ,
6970 "T" : "DT" ,
71+ "U" : "DT" , # NOTE: This mapping is present as a precaution based on outlier PDBs such as `410d`
7072}
7173
7274
@@ -177,7 +179,7 @@ def parse_chain_sequences_and_interfaces_from_mmcif_file(
177179 interface_chain_ids = set ()
178180 for chain in model :
179181 one_letter_seq_tokens = []
180- token_molecule_types = set ()
182+ token_molecule_types = []
181183
182184 for res_index , res in enumerate (chain ):
183185 # Convert each residue to a one-letter code if applicable
@@ -195,7 +197,7 @@ def parse_chain_sequences_and_interfaces_from_mmcif_file(
195197 sequences [f"{ chain .id } :{ clustering_molecule_type } -{ res_id } " ] = one_letter_residue
196198 else :
197199 one_letter_seq_tokens .append (one_letter_residue )
198- token_molecule_types .add (clustering_molecule_type )
200+ token_molecule_types .append (clustering_molecule_type )
199201
200202 # Find all interfaces defined as pairs of chains with minimum heavy atom (i.e. non-hydrogen) separation less than 5 Å
201203 for atom in res :
@@ -245,12 +247,20 @@ def parse_chain_sequences_and_interfaces_from_mmcif_file(
245247 len (one_letter_seq_tokens ) > 0
246248 ), f"No residues found in chain { chain .id } within the mmCIF file { filepath } ."
247249
248- token_molecule_types = list (token_molecule_types )
249- assert (
250- len (token_molecule_types ) == 1
251- ), f"More than one molecule type found (i.e., { token_molecule_types } ) in chain { chain .id } within the mmCIF file { filepath } ."
250+ unique_token_molecule_types = set (token_molecule_types )
251+ if len (unique_token_molecule_types ) > 1 :
252+ # Handle cases where a chain contains multiple polymer molecule types, such as in PDB `5a0f`
253+ molecule_type = np_mode (np .array (token_molecule_types ))[0 ].item ()
254+ logger .warning (
255+ f"More than one molecule type found (i.e., { unique_token_molecule_types } ) in chain { chain .id } within the mmCIF file { filepath } ."
256+ f" Assigning the most common molecule type to the chain (i.e., { molecule_type } ), and setting the type of all outlier residues to the unknown residue type (i.e., X)."
257+ )
258+ for token_index in range (len (one_letter_seq_tokens )):
259+ if token_molecule_types [token_index ] != molecule_type :
260+ one_letter_seq_tokens [token_index ] = "X"
261+ else :
262+ molecule_type = token_molecule_types [0 ]
252263
253- molecule_type = token_molecule_types [0 ]
254264 if (
255265 molecule_type == "protein"
256266 and len (one_letter_seq_tokens ) < min_num_residues_for_protein_classification
@@ -276,12 +286,18 @@ def parse_chain_sequences_and_interfaces_from_mmcif_directory(
276286 mmcif_filepaths = list (glob .glob (os .path .join (mmcif_dir , "*" , "*.cif" )))
277287 for cif_filepath in tqdm (mmcif_filepaths , desc = "Parsing chain sequences" ):
278288 structure_id = os .path .splitext (os .path .basename (cif_filepath ))[0 ]
279- (
280- chain_sequences ,
281- interface_chain_ids ,
282- ) = parse_chain_sequences_and_interfaces_from_mmcif_file (
283- cif_filepath , assume_one_based_residue_ids = assume_one_based_residue_ids
284- )
289+ try :
290+ (
291+ chain_sequences ,
292+ interface_chain_ids ,
293+ ) = parse_chain_sequences_and_interfaces_from_mmcif_file (
294+ cif_filepath , assume_one_based_residue_ids = assume_one_based_residue_ids
295+ )
296+ except Exception as e :
297+ logger .warning (
298+ f"Failed to parse chain sequences and interfaces from mmCIF file '{ cif_filepath } ' due to: { e } "
299+ )
300+ continue
285301 all_chain_sequences .append ({structure_id : chain_sequences })
286302 all_interface_chain_ids [structure_id ] = list (interface_chain_ids )
287303
0 commit comments