chebi tests : docstring + typehints

aditya0by0 · aditya0by0 · commit 4d37e3b0796d · 2024-07-07T13:58:13.000+02:00
diff --git a/tests/testChebiData.py b/tests/testChebiData.py
@@ -1,17 +1,36 @@
 import unittest
+from typing import List
+
+import pandas as pd
 
 from chebai.preprocessing.datasets.chebi import ChEBIOver50
 
 
 class TestChebiData(unittest.TestCase):
+    """
+    Test case for ChEBIOver50 dataset integrity, focusing on data splits and overlaps.
+
+    Attributes:
+        overlaps_train_val (List): List of overlapping entities between train and validation splits based on SMILES.
+        overlaps_train_test (List): List of overlapping entities between train and test splits based on SMILES.
+        overlaps_val_test (List): List of overlapping entities between validation and test splits based on SMILES.
+        overlaps_train_val_ids (List): List of overlapping entity IDs between train and validation splits.
+        overlaps_train_test_ids (List): List of overlapping entity IDs between train and test splits.
+        overlaps_val_test_ids (List): List of overlapping entity IDs between validation and test splits.
+    """
 
     @classmethod
     def setUpClass(cls) -> None:
+        """
+        Set up class method to initialize ChEBIOver50 instance and generate data splits and overlaps.
+        """
         cls.getDataSplitsOverlaps()
 
     @classmethod
-    def getDataSplitsOverlaps(cls):
-        """Get the overlap between data splits"""
+    def getDataSplitsOverlaps(cls) -> None:
+        """
+        Get the overlap between data splits based on SMILES and IDs.
+        """
         chebi_class_obj = ChEBIOver50()
         # Get the raw/processed data if missing
         chebi_class_obj.prepare_data()
@@ -25,37 +44,56 @@ def getDataSplitsOverlaps(cls):
         val_smiles, val_smiles_ids = cls.get_features_ids(val_set)
         test_smiles, test_smiles_ids = cls.get_features_ids(test_set)
 
-        # ----- Get the overlap between data splits based on smiles tokens/features -----
+        # Get the overlap between data splits based on smiles tokens/features
         cls.overlaps_train_val = cls.get_overlaps(train_smiles, val_smiles)
         cls.overlaps_train_test = cls.get_overlaps(train_smiles, test_smiles)
         cls.overlaps_val_test = cls.get_overlaps(val_smiles, test_smiles)
 
-        # ----- Get the overlap between data splits based on IDs -----
+        # Get the overlap between data splits based on IDs
         cls.overlaps_train_val_ids = cls.get_overlaps(train_smiles_ids, val_smiles_ids)
         cls.overlaps_train_test_ids = cls.get_overlaps(
             train_smiles_ids, test_smiles_ids
         )
         cls.overlaps_val_test_ids = cls.get_overlaps(val_smiles_ids, test_smiles_ids)
 
     @staticmethod
-    def get_features_ids(data_split_df):
-        """Returns SMILES features/tokens and SMILES IDs from the data"""
+    def get_features_ids(data_split_df: pd.DataFrame) -> tuple[List, List]:
+        """
+        Returns SMILES features/tokens and SMILES IDs from the data.
+
+        Args:
+            data_split_df: DataFrame containing data to extract features and IDs from.
+
+        Returns:
+            Tuple of lists: SMILES features/tokens list and SMILES IDs list.
+        """
         smiles_features = data_split_df["features"].tolist()
         smiles_ids = data_split_df["ident"].tolist()
 
         return smiles_features, smiles_ids
 
     @staticmethod
-    def get_overlaps(list_1, list_2):
+    def get_overlaps(list_1: List, list_2: List) -> List:
+        """
+        Get overlaps between two lists.
+
+        Args:
+            list_1: First list.
+            list_2: Second list.
+
+        Returns:
+            List: List of elements present in both lists.
+        """
         overlap = []
         for element in list_1:
             if element in list_2:
                 overlap.append(element)
         return overlap
 
     @unittest.expectedFailure
-    def test_train_val_overlap_based_on_smiles(self):
-        """Check that train-val splits are performed correctly i.e.every entity
+    def test_train_val_overlap_based_on_smiles(self) -> None:
+        """
+        Check that train-val splits are performed correctly, i.e. every entity
         only appears in one of the train and validation set based on smiles tokens/features
         """
         self.assertEqual(
@@ -65,18 +103,21 @@ def test_train_val_overlap_based_on_smiles(self):
         )
 
     @unittest.expectedFailure
-    def test_train_test_overlap_based_on_smiles(self):
-        """Check that train-test splits are performed correctly i.e.every entity
-        only appears in one of the train and test set based on smiles tokens/features"""
+    def test_train_test_overlap_based_on_smiles(self) -> None:
+        """
+        Check that train-test splits are performed correctly, i.e. every entity
+        only appears in one of the train and test set based on smiles tokens/features
+        """
         self.assertEqual(
             len(self.overlaps_train_test),
             0,
             "Duplicate entities present in Train and Test set based on SMILES",
         )
 
     @unittest.expectedFailure
-    def test_val_test_overlap_based_on_smiles(self):
-        """Check that val-test splits are performed correctly i.e.every entity
+    def test_val_test_overlap_based_on_smiles(self) -> None:
+        """
+        Check that val-test splits are performed correctly, i.e. every entity
         only appears in one of the validation and test set based on smiles tokens/features
         """
         self.assertEqual(
@@ -85,27 +126,33 @@ def test_val_test_overlap_based_on_smiles(self):
             "Duplicate entities present in Validation and Test set based on SMILES",
         )
 
-    def test_train_val_overlap_based_on_ids(self):
-        """Check that train-val splits are performed correctly i.e.every entity
-        only appears in one of the train and validation set based on smiles IDs"""
+    def test_train_val_overlap_based_on_ids(self) -> None:
+        """
+        Check that train-val splits are performed correctly, i.e. every entity
+        only appears in one of the train and validation set based on smiles IDs
+        """
         self.assertEqual(
             len(self.overlaps_train_val_ids),
             0,
             "Duplicate entities present in Train and Validation set based on IDs",
         )
 
-    def test_train_test_overlap_based_on_ids(self):
-        """Check that train-test splits are performed correctly i.e.every entity
-        only appears in one of the train and test set based on smiles IDs"""
+    def test_train_test_overlap_based_on_ids(self) -> None:
+        """
+        Check that train-test splits are performed correctly, i.e. every entity
+        only appears in one of the train and test set based on smiles IDs
+        """
         self.assertEqual(
             len(self.overlaps_train_test_ids),
             0,
             "Duplicate entities present in Train and Test set based on IDs",
         )
 
-    def test_val_test_overlap_based_on_ids(self):
-        """Check that val-test splits are performed correctly i.e.every entity
-        only appears in one of the validation and test set based on smiles IDs"""
+    def test_val_test_overlap_based_on_ids(self) -> None:
+        """
+        Check that val-test splits are performed correctly, i.e. every entity
+        only appears in one of the validation and test set based on smiles IDs
+        """
         self.assertEqual(
             len(self.overlaps_val_test_ids),
             0,
diff --git a/tests/testChebiDynamicDataSplits.py b/tests/testChebiDynamicDataSplits.py
@@ -1,5 +1,6 @@
 import hashlib
 import unittest
+from typing import Any, List, Tuple
 
 import numpy as np
 import pandas as pd
@@ -8,20 +9,30 @@
 
 
 class TestChebiDynamicDataSplits(unittest.TestCase):
-    """Test dynamic splits implementation's consistency"""
+    """
+    Test dynamic splits implementation's consistency for ChEBIOver50 dataset.
+
+    Attributes:
+        chebi_50_v231 (ChEBIOver50): Instance of ChEBIOver50 with ChEBI version 231.
+        chebi_50_v231_vt200 (ChEBIOver50): Instance of ChEBIOver50 with ChEBI version 231 and train version 200.
+    """
 
     @classmethod
-    def setUpClass(cls):
+    def setUpClass(cls) -> None:
+        """
+        Set up class method to initialize instances of ChEBIOver50 and generate data.
+        """
         cls.chebi_50_v231 = ChEBIOver50(chebi_version=231)
         cls.chebi_50_v231_vt200 = ChEBIOver50(
             chebi_version=231, chebi_version_train=200
         )
         cls._generate_chebi_class_data(cls.chebi_50_v231)
         cls._generate_chebi_class_data(cls.chebi_50_v231_vt200)
 
-    def testDynamicDataSplitsConsistency(self):
-        """Test Dynamic Data Splits consistency across every run"""
-
+    def testDynamicDataSplitsConsistency(self) -> None:
+        """
+        Test Dynamic Data Splits consistency across multiple runs.
+        """
         # Dynamic Data Splits in Run 1
         train_hash_1, val_hash_1, test_hash_1 = self._get_hashed_splits()
 
@@ -34,9 +45,10 @@ def testDynamicDataSplitsConsistency(self):
         self.assertEqual(val_hash_1, val_hash_2, "Validation data hashes do not match.")
         self.assertEqual(test_hash_1, test_hash_2, "Test data hashes do not match.")
 
-    def test_same_ids_and_in_test_sets(self):
-        """Check if test sets of both classes have same IDs"""
-
+    def test_same_ids_and_in_test_sets(self) -> None:
+        """
+        Check if test sets of both classes have the same IDs.
+        """
         v231_ids = set(self.chebi_50_v231.dynamic_split_dfs["test"]["ident"])
         v231_vt200_ids = set(
             self.chebi_50_v231_vt200.dynamic_split_dfs["test"]["ident"]
@@ -46,9 +58,10 @@ def test_same_ids_and_in_test_sets(self):
             v231_ids, v231_vt200_ids, "Test sets do not have the same IDs."
         )
 
-    def test_labels_vector_size_in_test_sets(self):
-        """Check if test sets of both classes have different size/shape of labels"""
-
+    def test_labels_vector_size_in_test_sets(self) -> None:
+        """
+        Check that the test sets of both classes have the same size/shape of labels.
+        """
         v231_labels_shape = len(
             self.chebi_50_v231.dynamic_split_dfs["test"]["labels"].iloc[0]
         )
@@ -59,11 +72,13 @@ def test_labels_vector_size_in_test_sets(self):
         self.assertEqual(
             v231_labels_shape,
             v231_vt200_label_shape,
-            "Test sets have the different size of labels",
+            "Test sets have different sizes of labels",
         )
 
-    def test_no_overlaps_in_chebi_v231_vt200(self):
-        """Test the overlaps for the ChEBIOver50(chebi_version=231, chebi_version_train=200)"""
+    def test_no_overlaps_in_chebi_v231_vt200(self) -> None:
+        """
+        Test that the ChEBIOver50(chebi_version=231, chebi_version_train=200) splits have no overlapping IDs.
+        """
         train_set = self.chebi_50_v231_vt200.dynamic_split_dfs["train"]
         val_set = self.chebi_50_v231_vt200.dynamic_split_dfs["validation"]
         test_set = self.chebi_50_v231_vt200.dynamic_split_dfs["test"]
@@ -72,7 +87,7 @@ def test_no_overlaps_in_chebi_v231_vt200(self):
         val_set_ids = val_set["ident"].tolist()
         test_set_ids = test_set["ident"].tolist()
 
-        # ----- Get the overlap between data splits based on IDs -----
+        # Get the overlap between data splits based on IDs
         self.overlaps_train_val_ids = self.get_overlaps(train_set_ids, val_set_ids)
         self.overlaps_train_test_ids = self.get_overlaps(train_set_ids, test_set_ids)
         self.overlaps_val_test_ids = self.get_overlaps(val_set_ids, test_set_ids)
@@ -93,10 +108,13 @@ def test_no_overlaps_in_chebi_v231_vt200(self):
             "Duplicate entities present in Validation and Test set based on IDs",
         )
 
-    def _get_hashed_splits(self):
-        """Returns hashed dynamic data splits"""
+    def _get_hashed_splits(self) -> Tuple[str, str, str]:
+        """
+        Returns hashed dynamic data splits.
 
-        # Get the raw/processed data if missing
+        Returns:
+            Tuple[str, str, str]: Hashes for train, validation, and test data splits.
+        """
         chebi_class_obj = self.chebi_50_v231
 
         # Get dynamic splits from class variables
@@ -112,16 +130,32 @@ def _get_hashed_splits(self):
         return train_hash, val_hash, test_hash
 
     @staticmethod
-    def compute_hash(data):
-        """Returns hash for the given data partition"""
+    def compute_hash(data: pd.DataFrame) -> str:
+        """
+        Returns hash for the given data partition.
+
+        Args:
+            data (pd.DataFrame): DataFrame containing data to be hashed.
+
+        Returns:
+            str: Hash computed for the DataFrame.
+        """
         data_for_hashing = data.map(TestChebiDynamicDataSplits.convert_to_hashable)
         return hashlib.md5(
             pd.util.hash_pandas_object(data_for_hashing, index=True).values
         ).hexdigest()
 
     @staticmethod
-    def convert_to_hashable(item):
-        """To Convert lists and numpy arrays within the DataFrame to tuples for hashing"""
+    def convert_to_hashable(item: Any) -> Any:
+        """
+        Convert lists and numpy arrays within the DataFrame to tuples for hashing.
+
+        Args:
+            item (Any): Item to convert to a hashable form.
+
+        Returns:
+            Any: Hashable representation of the input item.
+        """
         if isinstance(item, list):
             return tuple(item)
         elif isinstance(item, np.ndarray):
@@ -130,13 +164,28 @@ def convert_to_hashable(item):
             return item
 
     @staticmethod
-    def _generate_chebi_class_data(chebi_class_obj):
-        # Get the raw/processed data if missing
+    def _generate_chebi_class_data(chebi_class_obj: ChEBIOver50) -> None:
+        """
+        Generate ChEBI class data if not already generated.
+
+        Args:
+            chebi_class_obj (ChEBIOver50): Instance of ChEBIOver50 class.
+        """
         chebi_class_obj.prepare_data()
         chebi_class_obj.setup()
 
     @staticmethod
-    def get_overlaps(list_1, list_2):
+    def get_overlaps(list_1: List[Any], list_2: List[Any]) -> List[Any]:
+        """
+        Get overlaps between two lists.
+
+        Args:
+            list_1 (List[Any]): First list.
+            list_2 (List[Any]): Second list.
+
+        Returns:
+            List[Any]: List of elements present in both lists.
+        """
         overlap = []
         for element in list_1:
             if element in list_2: