Merge pull request #120 from ChEB-AI/fix/avoid_iterrows

sfluegel05 · web-flow · commit 44c6e7e12e8d · 2025-10-17T14:28:15.000+02:00
Avoid using iterrows, use vectorization wherever possible
diff --git a/chebai/preprocessing/datasets/chebi.py b/chebai/preprocessing/datasets/chebi.py
@@ -17,6 +17,7 @@
 from itertools import cycle, permutations, product
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union
 
+import numpy as np
 import pandas as pd
 import torch
 from rdkit import Chem
@@ -332,7 +333,7 @@ def _graph_to_raw_dataset(self, g: "nx.DiGraph") -> pd.DataFrame:
 
         data = pd.DataFrame(data)
         data = data[~data["SMILES"].isnull()]
-        data = data[[name not in CHEBI_BLACKLIST for name, _ in data.iterrows()]]
+        data = data[~data["name"].isin(CHEBI_BLACKLIST)]
 
         return data
 
@@ -459,18 +460,18 @@ def _load_dict(self, input_file_path: str) -> Generator[dict[str, Any], None, No
         """
         with open(input_file_path, "rb") as input_file:
             df = pd.read_pickle(input_file)
-            if self.single_class is not None:
-                single_cls_index = list(df.columns).index(int(self.single_class))
-            for row in df.values:
-                if self.single_class is None:
-                    labels = row[self._LABELS_START_IDX :].astype(bool)
-                else:
-                    labels = [bool(row[single_cls_index])]
-                yield dict(
-                    features=row[self._DATA_REPRESENTATION_IDX],
-                    labels=labels,
-                    ident=row[self._ID_IDX],
-                )
+
+            if self.single_class is None:
+                all_labels = df.iloc[:, self._LABELS_START_IDX :].to_numpy(dtype=bool)
+            else:
+                single_cls_index = df.columns.get_loc(int(self.single_class))
+                all_labels = df.iloc[:, [single_cls_index]].to_numpy(dtype=bool)
+
+            features = df.iloc[:, self._DATA_REPRESENTATION_IDX].to_numpy()
+            idents = df.iloc[:, self._ID_IDX].to_numpy()
+
+            for feat, labels, ident in zip(features, all_labels, idents):
+                yield dict(features=feat, labels=labels, ident=ident)
 
     # ------------------------------ Phase: Dynamic Splits -----------------------------------
     def _get_data_splits(self) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
@@ -566,43 +567,43 @@ def _setup_pruned_test_set(
         Returns:
             pd.DataFrame: The pruned test dataset.
         """
-        # TODO: find a more efficient way to do this
-        filename_old = "classes.txt"
-        # filename_new = f"classes_v{self.chebi_version_train}.txt"
-        # dataset = torch.load(os.path.join(self.processed_dir, "test.pt"))
-
-        # Load original classes (from the current ChEBI version - chebi_version)
-        with open(os.path.join(self.processed_dir_main, filename_old), "r") as file:
-            orig_classes = file.readlines()
+        classes_file_name = "classes.txt"
 
-        # Load new classes (from the training ChEBI version - chebi_version_train)
+        # Load original and new classes
+        with open(os.path.join(self.processed_dir_main, classes_file_name), "r") as f:
+            orig_classes = f.readlines()
         with open(
             os.path.join(
-                self._chebi_version_train_obj.processed_dir_main, filename_old
+                self._chebi_version_train_obj.processed_dir_main, classes_file_name
             ),
             "r",
-        ) as file:
-            new_classes = file.readlines()
-
-        # Create a mapping which give index of a class from chebi_version, if the corresponding
-        # class exists in chebi_version_train, Size = Number of classes in chebi_version
-        mapping = [
-            None if or_class not in new_classes else new_classes.index(or_class)
-            for or_class in orig_classes
-        ]
+        ) as f:
+            new_classes = f.readlines()
+
+        # Mapping array (-1 means no match in new classes); the dict keeps the
+        # first index of each class (same as list.index) with O(1) lookups
+        first_index = {}
+        for idx, nc in enumerate(new_classes):
+            first_index.setdefault(nc, idx)
+        mapping_array = np.array(
+            [first_index.get(oc, -1) for oc in orig_classes], dtype=int
+        )
+
+        # Convert labels column to 2D NumPy array
+        labels_matrix = np.array(df_test_chebi_version["labels"].tolist(), dtype=bool)
+
+        # Allocate new labels matrix
+        num_new_classes = len(new_classes)
+        new_labels_matrix = np.zeros(
+            (labels_matrix.shape[0], num_new_classes), dtype=bool
+        )
 
-        # Iterate over each data instance in the test set which is derived from chebi_version
-        for _, row in df_test_chebi_version.iterrows():
-            # Size = Number of classes in chebi_version_train
-            new_labels = [False for _ in new_classes]
-            for ind, label in enumerate(row["labels"]):
-                # If the chebi_version class exists in the chebi_version_train and has a True label,
-                # set the corresponding label in new_labels to True
-                if mapping[ind] is not None and label:
-                    new_labels[mapping[ind]] = label
-            # Update the labels from test instance from chebi_version to the new labels, which are compatible to both versions
-            row["labels"] = new_labels
+        # Copy only valid columns
+        valid_mask = mapping_array != -1
+        new_labels_matrix[:, mapping_array[valid_mask]] = labels_matrix[:, valid_mask]
 
+        # Assign back
+        df_test_chebi_version["labels"] = new_labels_matrix.tolist()
         return df_test_chebi_version
 
     # ------------------------------ Phase: Raw Properties -----------------------------------
diff --git a/chebai/preprocessing/datasets/pubchem.py b/chebai/preprocessing/datasets/pubchem.py
@@ -628,8 +628,8 @@ def download(self):
                     if not os.path.exists(os.path.join(self.raw_dir, f"{name}.txt")):
                         open(os.path.join(self.raw_dir, f"{name}.txt"), "x").close()
                     with open(os.path.join(self.raw_dir, f"{name}.txt"), "w") as f:
-                        for id, row in splits[i].iterrows():
-                            f.writelines(f"{id}\t{row['smiles']}\n")
+                        for row in splits[i].itertuples(index=True):
+                            f.write(f"{row.Index}\t{row.smiles}\n")
 
 
 class PubChemDissimilarSMILES(PubChemDissimilar):
@@ -809,12 +809,12 @@ def download(self):
         csv_path = os.path.join(self.raw_dir, "pubchem_hazardous_compound_list.csv")
         compounds = pd.read_csv(csv_path)
         smiles_list = []
-        for id, compound in compounds.iterrows():
+        for compound in compounds.itertuples(index=False):
             if (
-                not isinstance(compound["cmpdsynonym"], str)
-                or "CHEBI" not in compound["cmpdsynonym"]
+                not isinstance(compound.cmpdsynonym, str)
+                or "CHEBI" not in compound.cmpdsynonym
             ):
-                smiles_list.append(f"{compound['cid']}\t{compound['isosmiles']}")
+                smiles_list.append(f"{compound.cid}\t{compound.isosmiles}")
         with open(os.path.join(self.raw_dir, "smiles.txt"), "w") as f:
             f.write("\n".join(smiles_list))
 
diff --git a/chebai/train.py b/chebai/train.py
@@ -246,11 +246,11 @@ def prepare_data(infile: pickle.Pickler) -> pd.DataFrame:
         data_frame[col] = data_frame[col].astype(int)
 
     train_data = []
-    for index, row in data_frame.iterrows():
+    for row in data_frame.itertuples(index=False, name=None):
         train_data.append(
             [
-                data_frame.iloc[index].values[1],
-                data_frame.iloc[index].values[2:502].tolist(),
+                row[1],
+                list(row[2:502]),
             ]
         )
 
@@ -309,28 +309,28 @@ def load_data() -> (
         train_dataset = []
         train_actual_labels = []
 
-        for index, row in prepare_data(train_infile).iterrows():
+        for row in prepare_data(train_infile).itertuples(index=False):
             try:
-                mol = Molecule(row["SMILES"], True)
+                mol = Molecule(row.SMILES, True)
 
                 # DAGs_meta_info = mol.dag_to_node
                 train_dataset.append(mol)
-                train_actual_labels.append(torch.tensor(row["LABELS"]).float())
+                train_actual_labels.append(torch.tensor(row.LABELS).float())
             except Exception:
                 pass
 
         print("prepare validation data!")
         validation_dataset = []
         validation_actual_labels = []
 
-        for index, row in prepare_data(validation_infile).iterrows():
+        for row in prepare_data(validation_infile).itertuples(index=False):
             try:
-                mol = Molecule(row["SMILES"], True)
+                mol = Molecule(row.SMILES, True)
 
                 # DAGs_meta_info = mol.dag_to_node
 
                 validation_dataset.append(mol)
-                validation_actual_labels.append(torch.tensor(row["LABELS"]).float())
+                validation_actual_labels.append(torch.tensor(row.LABELS).float())
             except Exception:
                 pass