Merge branch 'protein_prediction' into additional_unit_tests

aditya0by0 · aditya0by0 · commit e38d1abb179b · 2024-10-12T13:11:06.000+02:00
diff --git a/chebai/preprocessing/datasets/go_uniprot.py b/chebai/preprocessing/datasets/go_uniprot.py
@@ -73,13 +73,14 @@ class _GOUniProtDataExtractor(_DynamicDataset, ABC):
 
     def __init__(self, **kwargs):
         self.go_branch: str = self._get_go_branch(**kwargs)
-        super(_GOUniProtDataExtractor, self).__init__(**kwargs)
 
         self.max_sequence_length: int = int(kwargs.get("max_sequence_length", 1002))
         assert (
             self.max_sequence_length >= 1
         ), "Max sequence length should be greater than or equal to 1."
 
+        super(_GOUniProtDataExtractor, self).__init__(**kwargs)
+
         if self.reader.n_gram is not None:
             assert self.max_sequence_length >= self.reader.n_gram, (
                 f"max_sequence_length ({self.max_sequence_length}) must be greater than "
@@ -415,8 +416,8 @@ def _get_swiss_to_go_mapping(self) -> pd.DataFrame:
                 # To consider only manually-annotated swiss data
                 continue
 
-            if not record.sequence:
-                # Consider protein with only sequence representation
+            if not record.sequence or len(record.sequence) > self.max_sequence_length:
+                # Skip proteins that have no sequence or whose sequence is longer than max_sequence_length
                 continue
 
             if any(aa in AMBIGUOUS_AMINO_ACIDS for aa in record.sequence):
@@ -537,39 +538,6 @@ def _get_data_splits(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
 
         return df_train, df_val, df_test
 
-    # ------------------------------ Phase: DataLoaders -----------------------------------
-    def dataloader(self, kind: str, **kwargs) -> DataLoader:
-        """
-        Returns a DataLoader object with truncated sequences for the specified kind of data (train, val, or test).
-
-        This method overrides the dataloader method from the superclass. After fetching the dataset from the
-        superclass, it truncates the 'features' of each data instance to a maximum length specified by
-        `self.max_sequence_length`. The truncation is adjusted based on the value of `n_gram` to ensure that
-        the correct number of amino acids is preserved in the truncated sequences.
-
-        Args:
-            kind (str): The kind of data to load (e.g., 'train', 'val', 'test').
-            **kwargs: Additional keyword arguments passed to the superclass dataloader method.
-
-        Returns:
-            DataLoader: A DataLoader object with the truncated sequences.
-        """
-        dataloader = super().dataloader(kind, **kwargs)
-
-        if self.reader.n_gram is None:
-            # Truncate the 'features' to max_sequence_length for each instance
-            truncate_index = self.max_sequence_length
-        else:
-            # If n_gram is given, adjust truncation to ensure maximum sequence length refers to the maximum number of
-            # amino acids in sequence rather than number of n-grams. Eg, Sequence "ABCDEFGHIJ" can form 8 trigrams,
-            # if max length is 5, then only first 3 trigrams should be considered as they are formed by first 5 letters.
-            truncate_index = self.max_sequence_length - (self.reader.n_gram - 1)
-
-        for instance in dataloader.dataset:
-            instance["features"] = instance["features"][:truncate_index]
-
-        return dataloader
-
     # ------------------------------ Phase: Raw Properties -----------------------------------
     @property
     def base_dir(self) -> str:
@@ -617,13 +585,16 @@ def _name(self) -> str:
         """
         Returns the name of the dataset.
 
+        'max_sequence_length' in the name indicates that proteins with sequence lengths exceeding this value are
+        ignored in the dataset.
+
         Returns:
             str: The dataset name, formatted with the current threshold value and/or given go_branch.
         """
         if self.go_branch != self._ALL_GO_BRANCHES:
-            return f"GO{self.THRESHOLD}_{self.go_branch}"
+            return f"GO{self.THRESHOLD}_{self.go_branch}_{self.max_sequence_length}"
 
-        return f"GO{self.THRESHOLD}"
+        return f"GO{self.THRESHOLD}_{self.max_sequence_length}"
 
     def select_classes(
         self, g: nx.DiGraph, *args: Any, **kwargs: Dict[str, Any]