Skip to content

Commit e8cbec4

Browse files
authored
Handle a few edge cases in cluster_pdb_mmcifs.py (#58)
1 parent d156d49 commit e8cbec4

File tree

1 file changed

+30
-14
lines changed

1 file changed

+30
-14
lines changed

scripts/cluster_pdb_mmcifs.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from tqdm import tqdm
3535

3636
from alphafold3_pytorch.tensor_typing import IntType, typecheck
37-
from alphafold3_pytorch.utils.utils import exists
37+
from alphafold3_pytorch.utils.utils import exists, np_mode
3838
from scripts.filter_pdb_mmcifs import parse_mmcif_object
3939

4040
# Constants
@@ -60,13 +60,15 @@
6060
"A": "A",
6161
"C": "C",
6262
"G": "G",
63+
"T": "U", # NOTE: This mapping is required for PDBs such as `41Od`
6364
"U": "U",
6465
}
6566
DNA_LETTERS_1TO3 = {
6667
"A": "DA",
6768
"C": "DC",
6869
"G": "DG",
6970
"T": "DT",
71+
"U": "DT", # NOTE: This mapping is present as a precaution based on outlier PDBs such as `410d`
7072
}
7173

7274

@@ -177,7 +179,7 @@ def parse_chain_sequences_and_interfaces_from_mmcif_file(
177179
interface_chain_ids = set()
178180
for chain in model:
179181
one_letter_seq_tokens = []
180-
token_molecule_types = set()
182+
token_molecule_types = []
181183

182184
for res_index, res in enumerate(chain):
183185
# Convert each residue to a one-letter code if applicable
@@ -195,7 +197,7 @@ def parse_chain_sequences_and_interfaces_from_mmcif_file(
195197
sequences[f"{chain.id}:{clustering_molecule_type}-{res_id}"] = one_letter_residue
196198
else:
197199
one_letter_seq_tokens.append(one_letter_residue)
198-
token_molecule_types.add(clustering_molecule_type)
200+
token_molecule_types.append(clustering_molecule_type)
199201

200202
# Find all interfaces defined as pairs of chains with minimum heavy atom (i.e. non-hydrogen) separation less than 5 Å
201203
for atom in res:
@@ -245,12 +247,20 @@ def parse_chain_sequences_and_interfaces_from_mmcif_file(
245247
len(one_letter_seq_tokens) > 0
246248
), f"No residues found in chain {chain.id} within the mmCIF file {filepath}."
247249

248-
token_molecule_types = list(token_molecule_types)
249-
assert (
250-
len(token_molecule_types) == 1
251-
), f"More than one molecule type found (i.e., {token_molecule_types}) in chain {chain.id} within the mmCIF file {filepath}."
250+
unique_token_molecule_types = set(token_molecule_types)
251+
if len(unique_token_molecule_types) > 1:
252+
# Handle cases where a chain contains multiple polymer molecule types, such as in PDB `5a0f`
253+
molecule_type = np_mode(np.array(token_molecule_types))[0].item()
254+
logger.warning(
255+
f"More than one molecule type found (i.e., {unique_token_molecule_types}) in chain {chain.id} within the mmCIF file {filepath}."
256+
f" Assigning the most common molecule type to the chain (i.e., {molecule_type}), and setting the type of all outlier residues to the unknown residue type (i.e., X)."
257+
)
258+
for token_index in range(len(one_letter_seq_tokens)):
259+
if token_molecule_types[token_index] != molecule_type:
260+
one_letter_seq_tokens[token_index] = "X"
261+
else:
262+
molecule_type = token_molecule_types[0]
252263

253-
molecule_type = token_molecule_types[0]
254264
if (
255265
molecule_type == "protein"
256266
and len(one_letter_seq_tokens) < min_num_residues_for_protein_classification
@@ -276,12 +286,18 @@ def parse_chain_sequences_and_interfaces_from_mmcif_directory(
276286
mmcif_filepaths = list(glob.glob(os.path.join(mmcif_dir, "*", "*.cif")))
277287
for cif_filepath in tqdm(mmcif_filepaths, desc="Parsing chain sequences"):
278288
structure_id = os.path.splitext(os.path.basename(cif_filepath))[0]
279-
(
280-
chain_sequences,
281-
interface_chain_ids,
282-
) = parse_chain_sequences_and_interfaces_from_mmcif_file(
283-
cif_filepath, assume_one_based_residue_ids=assume_one_based_residue_ids
284-
)
289+
try:
290+
(
291+
chain_sequences,
292+
interface_chain_ids,
293+
) = parse_chain_sequences_and_interfaces_from_mmcif_file(
294+
cif_filepath, assume_one_based_residue_ids=assume_one_based_residue_ids
295+
)
296+
except Exception as e:
297+
logger.warning(
298+
f"Failed to parse chain sequences and interfaces from mmCIF file '{cif_filepath}' due to: {e}"
299+
)
300+
continue
285301
all_chain_sequences.append({structure_id: chain_sequences})
286302
all_interface_chain_ids[structure_id] = list(interface_chain_ids)
287303

0 commit comments

Comments
 (0)