ignore proteins exceeding max len in preprocessing

aditya0by0 · aditya0by0 · commit 710d703fd54d · 2024-10-12T11:51:39.000+02:00
diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
@@ -415,8 +415,8 @@ def _get_swiss_to_go_mapping(self) -> pd.DataFrame:
                 # To consider only manually-annotated swiss data
                 continue
 
-            if not record.sequence:
-                # Consider protein with only sequence representation
+            if not record.sequence or record.sequence > self.max_sequence_length:
+                # Consider protein with only sequence representation and seq. length not greater than max seq. length
                 continue
 
             if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
@@ -537,39 +537,6 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 
         return df_train, df_val, df_test
 
-    # ------------------------------ Phase: DataLoaders -----------------------------------
-    def dataloader(self, kind: str, **kwargs) -> DataLoader:
-        """
-        Returns a DataLoader object with truncated sequences for the specified kind of data (train, val, or test).
-
-        This method overrides the dataloader method from the superclass. After fetching the dataset from the
-        superclass, it truncates the 'features' of each data instance to a maximum length specified by
-        `self.max_sequence_length`. The truncation is adjusted based on the value of `n_gram` to ensure that
-        the correct number of amino acids is preserved in the truncated sequences.
-
-        Args:
-            kind (str): The kind of data to load (e.g., 'train', 'val', 'test').
-            **kwargs: Additional keyword arguments passed to the superclass dataloader method.
-
-        Returns:
-            DataLoader: A DataLoader object with the truncated sequences.
-        """
-        dataloader = super().dataloader(kind, **kwargs)
-
-        if self.reader.n_gram is None:
-            # Truncate the 'features' to max_sequence_length for each instance
-            truncate_index = self.max_sequence_length
-        else:
-            # If n_gram is given, adjust truncation to ensure maximum sequence length refers to the maximum number of
-            # amino acids in sequence rather than number of n-grams. Eg, Sequence "ABCDEFGHIJ" can form 8 trigrams,
-            # if max length is 5, then only first 3 trigrams should be considered as they are formed by first 5 letters.
-            truncate_index = self.max_sequence_length - (self.reader.n_gram - 1)
-
-        for instance in dataloader.dataset:
-            instance["features"] = instance["features"][:truncate_index]
-
-        return dataloader
-
     # ------------------------------ Phase: Raw Properties -----------------------------------
     @property
     def base_dir(self) -> str:
@@ -617,13 +584,16 @@ def _name(self) -> str:
         """
         Returns the name of the dataset.
 
+        'max_sequence_length' in the name indicates that proteins with sequence lengths exceeding  are ignored
+        in the dataset.
+
         Returns:
             str: The dataset name, formatted with the current threshold value and/or given go_branch.
         """
         if self.go_branch != self._ALL_GO_BRANCHES:
-            return f"GO{self.THRESHOLD}_{self.go_branch}"
+            return f"GO{self.THRESHOLD}_{self.go_branch}_{self.max_sequence_length}"
 
-        return f"GO{self.THRESHOLD}"
+        return f"GO{self.THRESHOLD}_{self.max_sequence_length}"
 
     def select_classes(
         self, g: nx.DiGraph, *args: Any, **kwargs: Dict[str, Any]