@@ -298,6 +298,9 @@ def _from_mmcif_object(
298298 ValueError: If insertion code is detected at a residue.
299299 """
300300 structure = mmcif_object .structure
301+ # Resolve alternative locations for atoms/residues by taking the one with the largest occupancy.
302+ # NOTE: For `DisorderedAtom` objects, selecting the highest-occupancy atom is already the default behavior in Biopython.
303+ # Reference: https://biopython-tutorial.readthedocs.io/en/latest/notebooks/11%20-%20Going%203D%20-%20The%20PDB%20module.html#Disordered-atoms
301304 if isinstance (structure , Model ):
302305 model = structure
303306 else :
@@ -333,6 +336,7 @@ def _from_mmcif_object(
333336 f" { res_chem_comp_details .id } in the mmCIF chemical component dictionary for { mmcif_object .file_id } ."
334337 )
335338 is_polymer_residue = is_polymer (res_chem_comp_details .type )
339+ is_peptide_residue = "peptide" in res_chem_comp_details .type .lower ()
336340 residue_constants = get_residue_constants (res_chem_type = res_chem_comp_details .type )
337341 res_shortname = residue_constants .restype_3to1 .get (res .resname , "X" )
338342 restype_idx = residue_constants .restype_order .get (
@@ -345,12 +349,36 @@ def _from_mmcif_object(
345349 for atom in res :
346350 if is_polymer_residue and atom .name not in residue_constants .atom_types_set :
347351 continue
348- pos [residue_constants .atom_order [atom .name ]] = atom .coord
349- mask [residue_constants .atom_order [atom .name ]] = 1.0
350- res_b_factors [residue_constants .atom_order [atom .name ]] = atom .bfactor
352+ elif is_peptide_residue and atom .name .upper () == "SE" and res .get_resname () == "MSE" :
353+ # Put the coords of the selenium atom in the sulphur column.
354+ pos [residue_constants .atom_order ["SD" ]] = atom .coord
355+ mask [residue_constants .atom_order ["SD" ]] = 1.0
356+ res_b_factors [residue_constants .atom_order ["SD" ]] = atom .bfactor
357+ else :
358+ pos [residue_constants .atom_order [atom .name ]] = atom .coord
359+ mask [residue_constants .atom_order [atom .name ]] = 1.0
360+ res_b_factors [residue_constants .atom_order [atom .name ]] = atom .bfactor
351361 if np .sum (mask ) < 0.5 :
352362 # If no known atom positions are reported for a polymer residue then skip it.
353363 continue
364+ if is_peptide_residue :
365+ # Fix naming errors in arginine residues where NH2 is incorrectly
366+ # assigned to be closer to CD than NH1
367+ cd = residue_constants .atom_order ["CD" ]
368+ nh1 = residue_constants .atom_order ["NH1" ]
369+ nh2 = residue_constants .atom_order ["NH2" ]
370+ if (
371+ res .get_resname () == "ARG"
372+ and all (mask [atom_index ] for atom_index in (cd , nh1 , nh2 ))
373+ and (np .linalg .norm (pos [nh1 ] - pos [cd ]) > np .linalg .norm (pos [nh2 ] - pos [cd ]))
374+ ):
375+ pos [nh1 ], pos [nh2 ] = pos [nh2 ].copy (), pos [nh1 ].copy ()
376+ mask [nh1 ], mask [nh2 ] = mask [nh2 ].copy (), mask [nh1 ].copy ()
377+ res_b_factors [nh1 ], res_b_factors [nh2 ] = (
378+ res_b_factors [nh2 ].copy (),
379+ res_b_factors [nh1 ].copy (),
380+ )
381+ # Collect the residue's features.
354382 restype .append (restype_idx )
355383 chemid .append (res_chem_comp_details .id )
356384 chemtype .append (residue_constants .chemtype_num )
@@ -379,6 +407,7 @@ def _from_mmcif_object(
379407 # into a single ligand residue using indexing operations
380408 # working jointly on chain_index and residue_index.
381409 for atom in res :
410+ # NOTE: This code assumes water residues have previously been filtered out.
382411 pos = np .zeros ((residue_constants .atom_type_num , 3 ))
383412 mask = np .zeros ((residue_constants .atom_type_num ,))
384413 res_b_factors = np .zeros ((residue_constants .atom_type_num ,))
0 commit comments