Skip to content

Commit 85c47a0

Browse files
committed
migration: replace invalid amino acid with "X" notation
- #64 (comment)
1 parent bcda11c commit 85c47a0

File tree

1 file changed

+52
-3
lines changed

1 file changed

+52
-3
lines changed

chebai/preprocessing/migration/deep_go/migrate_deep_go_2_data.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import os
2+
import re
23
from collections import OrderedDict
34
from typing import List, Literal, Optional
45

56
import pandas as pd
67
from jsonargparse import CLI
78

89
from chebai.preprocessing.datasets.deepGO.go_uniprot import DeepGO2MigratedData
10+
from chebai.preprocessing.reader import ProteinDataReader
911

1012

1113
class DeepGo2DataMigration:
@@ -88,17 +90,25 @@ def _load_data(self) -> None:
8890

8991
try:
9092
print(f"Loading data from directory: {self._data_dir}......")
91-
self._test_df = self._truncate_sequences(
93+
94+
print(
95+
"Pre-processing the data before loading them into instance variables\n"
96+
f"2-Steps preprocessing: \n"
97+
f"\t 1: Truncating every sequence to {self._max_len}\n"
98+
f"\t 2: Replacing every amino acid which is not in {ProteinDataReader.AA_LETTER}"
99+
)
100+
101+
self._test_df = self._pre_process_data(
92102
pd.DataFrame(
93103
pd.read_pickle(os.path.join(self._data_dir, "test_data.pkl"))
94104
)
95105
)
96-
self._train_df = self._truncate_sequences(
106+
self._train_df = self._pre_process_data(
97107
pd.DataFrame(
98108
pd.read_pickle(os.path.join(self._data_dir, "train_data.pkl"))
99109
)
100110
)
101-
self._validation_df = self._truncate_sequences(
111+
self._validation_df = self._pre_process_data(
102112
pd.DataFrame(
103113
pd.read_pickle(os.path.join(self._data_dir, "valid_data.pkl"))
104114
)
@@ -114,6 +124,21 @@ def _load_data(self) -> None:
114124
"Please ensure all required files are available in the specified directory."
115125
)
116126

127+
def _pre_process_data(self, df: pd.DataFrame) -> pd.DataFrame:
    """
    Runs the preprocessing pipeline on a single split's dataframe.

    The steps are applied in a fixed order: sequences are truncated first,
    then every invalid amino acid in the truncated sequences is replaced
    with 'X'.

    Args:
        df (pd.DataFrame): The dataframe to preprocess.

    Returns:
        pd.DataFrame: The dataframe returned by the last preprocessing step.
    """
    truncated_df = self._truncate_sequences(df)
    return self._replace_invalid_amino_acids(truncated_df)
141+
117142
def _truncate_sequences(
118143
self, df: pd.DataFrame, column: str = "sequences"
119144
) -> pd.DataFrame:
@@ -133,6 +158,30 @@ def _truncate_sequences(
133158
df[column] = df[column].apply(lambda x: x[: self._max_len])
134159
return df
135160

161+
@staticmethod
def _replace_invalid_amino_acids(
    df: pd.DataFrame, column: str = "sequences"
) -> pd.DataFrame:
    """
    Replaces invalid amino acids in a sequence with 'X' using regex.

    Every character that is not one of ``ProteinDataReader.AA_LETTER`` is
    substituted by 'X'. The column is overwritten on the given dataframe and
    that same dataframe is returned.

    https://github.com/bio-ontology-research-group/deepgo2/blob/main/deepgo/aminoacids.py#L26-L33
    https://github.com/ChEB-AI/python-chebai/pull/64#issuecomment-2517067073

    Args:
        df (pd.DataFrame): The dataframe containing the sequences to be processed.
        column (str, optional): The column containing the sequences. Defaults to "sequences".

    Returns:
        pd.DataFrame: The dataframe with invalid amino acids replaced by 'X'.
    """
    # Escape the alphabet so that a regex metacharacter among the valid
    # letters (e.g. '-', '^', ']' or '\') is matched literally instead of
    # silently changing the meaning of the character class.
    valid_amino_acids = re.escape("".join(ProteinDataReader.AA_LETTER))
    # Compile once up front: the pattern is identical for every row.
    invalid_amino_acid = re.compile(f"[^{valid_amino_acids}]")
    # Replace any character not in the valid set with 'X'
    df[column] = df[column].apply(
        lambda sequence: invalid_amino_acid.sub("X", sequence)
    )
    return df
184+
136185
def _record_splits(self) -> pd.DataFrame:
137186
"""
138187
Creates a DataFrame that stores the IDs and their corresponding data splits.

0 commit comments

Comments
 (0)