Add argument linking for data_extractor and propagate the data_extractor from the semantic loss to its base loss

sfluegel · sfluegel · commit 3851ee895058 · 2024-05-07T18:10:26.000+02:00
diff --git a/chebai/cli.py b/chebai/cli.py
@@ -19,6 +19,9 @@ def add_arguments_to_parser(self, parser: LightningArgumentParser):
         parser.link_arguments(
             "model.init_args.out_dim", "trainer.callbacks.init_args.num_labels"
         )
+        parser.link_arguments(
+            "data", "model.init_args.criterion.init_args.data_extractor"
+        )
 
     @staticmethod
     def subcommands() -> Dict[str, Set[str]]:
diff --git a/chebai/loss/bce_weighted.py b/chebai/loss/bce_weighted.py
@@ -1,5 +1,6 @@
 import torch
-from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor
+from chebai.preprocessing.datasets.base import XYBaseDataModule
+from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
 import pandas as pd
 import os
 import pickle
@@ -10,9 +11,16 @@ class BCEWeighted(torch.nn.BCEWithLogitsLoss):
     https://openaccess.thecvf.com/content_CVPR_2019/papers/Cui_Class-Balanced_Loss_Based_on_Effective_Number_of_Samples_CVPR_2019_paper.pdf)
     """
 
-    def __init__(self, beta: float = None, data_extractor: _ChEBIDataExtractor = None):
+    def __init__(
+        self,
+        beta: float = None,
+        data_extractor: XYBaseDataModule = None,
+    ):
         self.beta = beta
+        if isinstance(data_extractor, LabeledUnlabeledMixed):
+            data_extractor = data_extractor.labeled
         self.data_extractor = data_extractor
+
         super().__init__()
 
     def set_pos_weight(self, input):
@@ -31,12 +39,12 @@ def set_pos_weight(self, input):
                         open(
                             os.path.join(
                                 self.data_extractor.raw_dir,
-                                self.data_extractor.raw_file_names_dict[set],
+                                raw_file_name,
                             ),
                             "rb",
                         )
                     )
-                    for set in ["train", "validation", "test"]
+                    for raw_file_name in self.data_extractor.raw_file_names
                 ]
             )
             value_counts = []
diff --git a/chebai/loss/semantic.py b/chebai/loss/semantic.py
@@ -7,12 +7,14 @@
 from typing import Literal
 
 from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor, ChEBIOver100
+from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
+from chebai.loss.bce_weighted import BCEWeighted
 
 
 class ImplicationLoss(torch.nn.Module):
     def __init__(
         self,
-        data_extractor: _ChEBIDataExtractor,
+        data_extractor: _ChEBIDataExtractor | LabeledUnlabeledMixed,
         base_loss: torch.nn.Module = None,
         tnorm: Literal["product", "lukasiewicz", "xu19"] = "product",
         impl_loss_weight=0.1,  # weight of implication loss in relation to base_loss
@@ -21,7 +23,13 @@ def __init__(
         multiply_by_softmax=False,
     ):
         super().__init__()
+        # automatically choose labeled subset for implication filter in case of mixed dataset
+        if isinstance(data_extractor, LabeledUnlabeledMixed):
+            data_extractor = data_extractor.labeled
         self.data_extractor = data_extractor
+        # propagate data_extractor to base loss
+        if isinstance(base_loss, BCEWeighted):
+            base_loss.data_extractor = self.data_extractor
         self.base_loss = base_loss
         self.implication_cache_file = f"implications_{self.data_extractor.name}.cache"
         self.label_names = _load_label_names(
@@ -106,7 +114,7 @@ class DisjointLoss(ImplicationLoss):
     def __init__(
         self,
         path_to_disjointness,
-        data_extractor: _ChEBIDataExtractor,
+        data_extractor: _ChEBIDataExtractor | LabeledUnlabeledMixed,
         base_loss: torch.nn.Module = None,
         disjoint_loss_weight=100,
         **kwargs,
diff --git a/chebai/models/base.py b/chebai/models/base.py
@@ -28,11 +28,9 @@ class ChebaiBaseNet(LightningModule):
 
     Attributes:
         NAME (str): The name of the model.
-        LOSS (torch.nn.Module): The loss function used by the model.
     """
 
     NAME = None
-    LOSS = torch.nn.BCEWithLogitsLoss
 
     def __init__(
         self,
diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
@@ -312,6 +312,10 @@ def setup_processed(self):
     def processed_file_names(self):
         raise NotImplementedError
 
+    @property
+    def raw_file_names(self):
+        raise NotImplementedError
+
     @property
     def processed_file_names_dict(self) -> dict:
         raise NotImplementedError
diff --git a/configs/loss/semantic_loss.yml b/configs/loss/semantic_loss.yml
@@ -1,11 +1,9 @@
 class_path: chebai.loss.semantic.DisjointLoss
 init_args:
   path_to_disjointness: data/disjoint.csv
-  data_extractor: &extractor ../data/chebi100.yml
   base_loss:
     class_path: chebai.loss.bce_weighted.BCEWeighted
     init_args:
       beta: 0.99
-      data_extractor: *extractor
   tnorm: product
   impl_loss_weight: 0.01

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,9 @@ def add_arguments_to_parser(self, parser: LightningArgumentParser):`
`19`	`19`	`parser.link_arguments(`
`20`	`20`	`"model.init_args.out_dim", "trainer.callbacks.init_args.num_labels"`
`21`	`21`	`)`
	`22`	`+ parser.link_arguments(`
	`23`	`+ "data", "model.init_args.criterion.init_args.data_extractor"`
	`24`	`+ )`
`22`	`25`
`23`	`26`	`@staticmethod`
`24`	`27`	`def subcommands() -> Dict[str, Set[str]]:`