Commit 4ab760e

add label filter
1 parent eb86e3f commit 4ab760e

2 files changed (+23 −1 lines)


chebai/preprocessing/datasets/base.py

Lines changed: 18 additions & 0 deletions
@@ -4,6 +4,7 @@
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple, Union

 import lightning as pl
+import numpy as np
 import pandas as pd
 import torch
 import tqdm
@@ -708,11 +709,14 @@ class _DynamicDataset(XYBaseDataModule, ABC):
     Args:
         dynamic_data_split_seed (int, optional): The seed for random data splitting. Defaults to 42.
         splits_file_path (str, optional): Path to the splits CSV file. Defaults to None.
+        apply_label_filter (Optional[str]): Path to a classes.txt file - only labels that are in the label filter
+            file will be used (in that order). All labels in the label filter have to be present in the dataset.
         **kwargs: Additional keyword arguments passed to XYBaseDataModule.

     Attributes:
         dynamic_data_split_seed (int): The seed for random data splitting, default is 42.
         splits_file_path (Optional[str]): Path to the CSV file containing split assignments.
+        apply_label_filter (Optional[str]): Path to a classes.txt file for label filtering.
     """

     # ---- Index for columns of processed `data.pkl` (should be derived from `_graph_to_raw_dataset` method) ------
@@ -722,6 +726,7 @@ class _DynamicDataset(XYBaseDataModule, ABC):

     def __init__(
         self,
+        apply_label_filter: Optional[str] = None,
         **kwargs,
     ):
         super(_DynamicDataset, self).__init__(**kwargs)
@@ -735,6 +740,7 @@ def __init__(
         self.splits_file_path = self._validate_splits_file_path(
             kwargs.get("splits_file_path", None)
         )
+        self.apply_label_filter = apply_label_filter

     @staticmethod
     def _validate_splits_file_path(splits_file_path: Optional[str]) -> Optional[str]:
@@ -1134,6 +1140,18 @@ def _retrieve_splits_from_csv(self) -> None:
         )
         df_data = pd.DataFrame(data)

+        if self.apply_label_filter:
+            print(f"Applying label filter from {self.apply_label_filter}...")
+            with open(self.apply_label_filter, "r") as f:
+                label_filter = [line.strip() for line in f]
+            with open(os.path.join(self.processed_dir_main, "classes.txt"), "r") as cf:
+                classes = [line.strip() for line in cf]
+            # reorder labels
+            old_labels = np.stack(df_data["labels"])
+            label_mapping = [classes.index(lbl) for lbl in label_filter]
+            new_labels = old_labels[:, label_mapping]
+            df_data["labels"] = list(new_labels)
+
         train_ids = splits_df[splits_df["split"] == "train"]["id"]
         validation_ids = splits_df[splits_df["split"] == "validation"]["id"]
         test_ids = splits_df[splits_df["split"] == "test"]["id"]
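
Editor's sketch (not part of the commit): the new block in _retrieve_splits_from_csv reorders the label matrix by column so that it matches the class order given in the filter file. A minimal, self-contained illustration of that indexing step, with made-up class names and label values:

# Sketch of the column-reordering logic; class names and labels are made up.
import numpy as np

classes = ["CHEBI:1", "CHEBI:2", "CHEBI:3"]      # column order in the existing dataset (classes.txt)
label_filter = ["CHEBI:3", "CHEBI:1"]            # desired subset and order from the filter file

old_labels = np.array([[1, 0, 1],
                       [0, 1, 0]])               # one row per sample, one column per class

# classes.index(...) raises ValueError for an unknown name, which enforces the
# requirement that every entry of the label filter is present in the dataset.
label_mapping = [classes.index(lbl) for lbl in label_filter]   # -> [2, 0]
new_labels = old_labels[:, label_mapping]                       # columns filtered and reordered
print(new_labels)                                               # [[1 1], [0 0]]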

chebai/result/generate_class_properties.py

Lines changed: 5 additions & 1 deletion
@@ -168,7 +168,10 @@ def generate_props(
             raise ValueError(f"Unknown data partition: {data_partition}")
         print(f"Running inference on {data_partition} data...")

-        classes_file = Path(data_module.processed_dir_main) / "classes.txt"
+        if data_module.apply_label_filter is not None:
+            classes_file = data_module.apply_label_filter
+        else:
+            classes_file = Path(data_module.processed_dir_main) / "classes.txt"
         class_names = self.load_class_labels(classes_file)
         num_classes = len(class_names)
         metrics_obj_dict: dict[str, torchmetrics.Metric] = {
@@ -181,6 +184,7 @@ def generate_props(
         }

         for batch_idx, batch in enumerate(data_loader):
+            batch = batch.to(device=model.device)
             data = model._process_batch(batch, batch_idx=batch_idx)
             labels = data["labels"].to(device=model.device)
             data["features"][0].to(device=model.device)
