11import os
2+ import re
23from collections import OrderedDict
34from typing import List , Literal , Optional
45
56import pandas as pd
67from jsonargparse import CLI
78
89from chebai .preprocessing .datasets .deepGO .go_uniprot import DeepGO2MigratedData
10+ from chebai .preprocessing .reader import ProteinDataReader
911
1012
1113class DeepGo2DataMigration :
@@ -88,17 +90,25 @@ def _load_data(self) -> None:
8890
8991 try :
9092 print (f"Loading data from directory: { self ._data_dir } ......" )
91- self ._test_df = self ._truncate_sequences (
93+
94+ print (
95+ "Pre-processing the data before loading them into instance variables\n "
96+ f"2-Steps preprocessing: \n "
97+ f"\t 1: Truncating every sequence to { self ._max_len } \n "
98+ f"\t 2: Replacing every amino acid which is not in { ProteinDataReader .AA_LETTER } "
99+ )
100+
101+ self ._test_df = self ._pre_process_data (
92102 pd .DataFrame (
93103 pd .read_pickle (os .path .join (self ._data_dir , "test_data.pkl" ))
94104 )
95105 )
96- self ._train_df = self ._truncate_sequences (
106+ self ._train_df = self ._pre_process_data (
97107 pd .DataFrame (
98108 pd .read_pickle (os .path .join (self ._data_dir , "train_data.pkl" ))
99109 )
100110 )
101- self ._validation_df = self ._truncate_sequences (
111+ self ._validation_df = self ._pre_process_data (
102112 pd .DataFrame (
103113 pd .read_pickle (os .path .join (self ._data_dir , "valid_data.pkl" ))
104114 )
@@ -114,6 +124,21 @@ def _load_data(self) -> None:
114124 "Please ensure all required files are available in the specified directory."
115125 )
116126
def _pre_process_data(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the two-step clean-up on a dataframe of protein sequences.

    The dataframe is first handed to ``_truncate_sequences`` (cutting each
    sequence down to the maximum length) and the result is then handed to
    ``_replace_invalid_amino_acids`` (substituting 'X' for invalid amino
    acids).

    Args:
        df (pd.DataFrame): The dataframe to preprocess.

    Returns:
        pd.DataFrame: The dataframe produced by the second step.
    """
    # Step 1: shorten the sequences; step 2 then works on the shortened data.
    truncated = self._truncate_sequences(df)
    return self._replace_invalid_amino_acids(truncated)
141+
117142 def _truncate_sequences (
118143 self , df : pd .DataFrame , column : str = "sequences"
119144 ) -> pd .DataFrame :
@@ -133,6 +158,30 @@ def _truncate_sequences(
133158 df [column ] = df [column ].apply (lambda x : x [: self ._max_len ])
134159 return df
135160
@staticmethod
def _replace_invalid_amino_acids(
    df: pd.DataFrame, column: str = "sequences"
) -> pd.DataFrame:
    """
    Replaces invalid amino acids in a sequence with 'X' using regex.

    Every character of each sequence in ``column`` that is not one of the
    letters listed in ``ProteinDataReader.AA_LETTER`` is substituted with
    'X'. The column is reassigned on the dataframe that was passed in, and
    that same dataframe is returned.

    https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L26-L33
    https://github.com/ChEB-AI/python-chebai/pull/64#issuecomment-2517067073

    Args:
        df (pd.DataFrame): The dataframe containing the sequences to be processed.
        column (str, optional): The column containing the sequences. Defaults to "sequences".

    Returns:
        pd.DataFrame: The dataframe with invalid amino acids replaced by 'X'.
    """
    # Escape every letter before placing it inside the character class so
    # that a regex-significant symbol in the alphabet (e.g. '-', ']', '^')
    # is matched literally instead of altering the pattern.
    valid_amino_acids = "".join(map(re.escape, ProteinDataReader.AA_LETTER))
    # Build the pattern once; it is the same for every row.
    invalid_pattern = re.compile(f"[^{valid_amino_acids}]")
    # Replace any character not in the valid set with 'X'
    df[column] = df[column].apply(lambda seq: invalid_pattern.sub("X", seq))
    return df
184+
136185 def _record_splits (self ) -> pd .DataFrame :
137186 """
138187 Creates a DataFrame that stores the IDs and their corresponding data splits.
0 commit comments