Skip to content

Commit b7ca0e5

Browse files
committed
scope: map invalid amino acids to "X"
1 parent 191c979 commit b7ca0e5

File tree

1 file changed

+8
-3
lines changed
  • chebai/preprocessing/datasets/scope

1 file changed

+8
-3
lines changed

chebai/preprocessing/datasets/scope/scope.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212

1313
import gzip
1414
import os
15+
import re
1516
import shutil
1617
from abc import ABC, abstractmethod
1718
from tempfile import NamedTemporaryFile
@@ -441,14 +442,18 @@ def _parse_pdb_sequence_file(self) -> Dict[str, Dict[str, str]]:
441442
and values are dictionaries mapping chain IDs (lowercase) to their corresponding sequences.
442443
"""
443444
pdb_chain_seq_mapping: Dict[str, Dict[str, str]] = {}
445+
valid_amino_acids = "".join(ProteinDataReader.AA_LETTER)
446+
444447
for record in SeqIO.parse(
445448
os.path.join(self.scope_root_dir, self.raw_file_names_dict["PDB"]), "fasta"
446449
):
447450
pdb_id, chain = record.id.split("_")
448451
if str(record.seq):
449-
pdb_chain_seq_mapping.setdefault(pdb_id.lower(), {})[chain.lower()] = (
450-
str(record.seq)
451-
)
452+
sequence = re.sub(f"[^{valid_amino_acids}]", "X", str(record.seq))
453+
454+
pdb_chain_seq_mapping.setdefault(pdb_id.lower(), {})[
455+
chain.lower()
456+
] = sequence
452457
return pdb_chain_seq_mapping
453458

454459
@staticmethod

0 commit comments

Comments
 (0)