migration: replace invalid amino acid with "X" notation

aditya0by0 · aditya0by0 · commit 85c47a05aa36 · 2024-12-04T15:55:57.000+01:00
- #64 (comment)
diff --git a/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py b/chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py
@@ -1,11 +1,13 @@
 import os
+import re
 from collections import OrderedDict
 from typing import List, Literal, Optional
 
 import pandas as pd
 from jsonargparse import CLI
 
 from chebai.preprocessing.datasets.deepGO.go_uniprot import DeepGO2MigratedData
+from chebai.preprocessing.reader import ProteinDataReader
 
 
 class DeepGo2DataMigration:
@@ -88,17 +90,25 @@ def _load_data(self) -> None:
 
         try:
             print(f"Loading data from directory: {self._data_dir}......")
-            self._test_df = self._truncate_sequences(
+
+            print(
+                "Pre-processing the data before loading them into instance variables\n"
+                f"2-Steps preprocessing: \n"
+                f"\t 1: Truncating every sequence to {self._max_len}\n"
+                f"\t 2: Replacing every amino acid which is not in {ProteinDataReader.AA_LETTER}"
+            )
+
+            self._test_df = self._pre_process_data(
                 pd.DataFrame(
                     pd.read_pickle(os.path.join(self._data_dir, "test_data.pkl"))
                 )
             )
-            self._train_df = self._truncate_sequences(
+            self._train_df = self._pre_process_data(
                 pd.DataFrame(
                     pd.read_pickle(os.path.join(self._data_dir, "train_data.pkl"))
                 )
             )
-            self._validation_df = self._truncate_sequences(
+            self._validation_df = self._pre_process_data(
                 pd.DataFrame(
                     pd.read_pickle(os.path.join(self._data_dir, "valid_data.pkl"))
                 )
@@ -114,6 +124,21 @@ def _load_data(self) -> None:
                 "Please ensure all required files are available in the specified directory."
             )
 
+    def _pre_process_data(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Runs the two migration pre-processing steps on a dataframe: sequences
+        are first truncated, then invalid amino acids are replaced.
+
+        Args:
+            df (pd.DataFrame): Dataframe handed to both pre-processing steps.
+
+        Returns:
+            pd.DataFrame: The dataframe returned by the second step.
+        """
+        # Truncate first so the replacement step scans shorter sequences.
+        truncated_df = self._truncate_sequences(df)
+        return self._replace_invalid_amino_acids(truncated_df)
+
     def _truncate_sequences(
         self, df: pd.DataFrame, column: str = "sequences"
     ) -> pd.DataFrame:
@@ -133,6 +158,30 @@ def _truncate_sequences(
         df[column] = df[column].apply(lambda x: x[: self._max_len])
         return df
 
+    @staticmethod
+    def _replace_invalid_amino_acids(
+        df: pd.DataFrame, column: str = "sequences"
+    ) -> pd.DataFrame:
+        """
+        Replaces invalid amino acids in a sequence with 'X' using regex.
+
+        https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L26-L33
+        https://github.com/ChEB-AI/python-chebai/pull/64#issuecomment-2517067073
+
+        Args:
+            df (pd.DataFrame): Dataframe whose `column` is rewritten in place.
+            column (str, optional): The column containing the sequences. Defaults to "sequences".
+
+        Returns:
+            pd.DataFrame: The same dataframe, with invalid amino acids replaced by 'X'.
+        """
+        # Escape the alphabet so no symbol is read as character-class syntax,
+        # and compile the pattern once rather than per row.
+        valid_amino_acids = re.escape("".join(ProteinDataReader.AA_LETTER))
+        invalid_aa = re.compile(f"[^{valid_amino_acids}]")
+        df[column] = df[column].apply(lambda seq: invalid_aa.sub("X", seq))
+        return df
+
     def _record_splits(self) -> pd.DataFrame:
         """
         Creates a DataFrame that stores the IDs and their corresponding data splits.