
Commit 7579f80

add box embedding model
1 parent d52b422 commit 7579f80

File tree

  chebai/models/electra.py
  chebai/models/electra_box.py
  configs/model/box-electra.yml

3 files changed: +179 -57 lines changed

chebai/models/electra.py

Lines changed: 61 additions & 57 deletions
@@ -161,17 +161,18 @@ def filter_dict(d: Dict[str, Any], filter_key: str) -> Dict[str, Any]:
     }


-class Electra(ChebaiBaseNet):
-    """
-    Electra model implementation inherited from ChebaiBaseNet.
+class ElectraProcessingMixIn:
+    """Mixin class for processing batches and outputs for Electra models."""

-    Args:
-        config (Dict[str, Any], optional): Configuration parameters for the Electra model. Defaults to None.
-        pretrained_checkpoint (str, optional): Path to the pretrained checkpoint file. Defaults to None.
-        load_prefix (str, optional): Prefix to filter the state_dict keys from the pretrained checkpoint. Defaults to None.
-        **kwargs: Additional keyword arguments.
+    @property
+    def as_pretrained(self) -> ElectraModel:
+        """
+        Get the pretrained Electra model.

-    """
+        Returns:
+            ElectraModel: The pretrained Electra model.
+        """
+        return self.electra.electra

     def _process_batch(self, batch: Dict[str, Any], batch_idx: int) -> Dict[str, Any]:
         """
@@ -209,15 +210,61 @@ def _process_batch(self, batch: Dict[str, Any], batch_idx: int) -> Dict[str, Any]:
             idents=batch.additional_fields["idents"],
         )

-    @property
-    def as_pretrained(self) -> ElectraModel:
+    def _process_for_loss(
+        self,
+        model_output: Dict[str, Tensor],
+        labels: Tensor,
+        loss_kwargs: Dict[str, Any],
+    ) -> Tuple[Tensor, Tensor, Dict[str, Any]]:
         """
-        Get the pretrained Electra model.
+        Process the model output for calculating the loss.
+
+        Args:
+            model_output (Dict[str, Tensor]): The output of the model.
+            labels (Tensor): The target labels.
+            loss_kwargs (Dict[str, Any]): Additional loss arguments.

         Returns:
-            ElectraModel: The pretrained Electra model.
+            tuple: A tuple containing the processed model output, labels, and loss arguments.
         """
-        return self.electra.electra
+        kwargs_copy = dict(loss_kwargs)
+        if labels is not None:
+            labels = labels.float()
+        return model_output["logits"], labels, kwargs_copy
+
+    def _get_prediction_and_labels(
+        self, data: Dict[str, Any], labels: Tensor, model_output: Dict[str, Tensor]
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Get the predictions and labels from the model output. Applies a sigmoid to the model output.
+
+        Args:
+            data (Dict[str, Any]): The input data.
+            labels (Tensor): The target labels.
+            model_output (Dict[str, Tensor]): The output of the model.
+
+        Returns:
+            tuple: A tuple containing the predictions and labels.
+        """
+        d = model_output["logits"]
+        loss_kwargs = data.get("loss_kwargs", dict())
+        if "non_null_labels" in loss_kwargs:
+            n = loss_kwargs["non_null_labels"]
+            d = d[n]
+        return torch.sigmoid(d), labels.int() if labels is not None else None
+
+
+class Electra(ElectraProcessingMixIn, ChebaiBaseNet):
+    """
+    Electra model implementation inherited from ChebaiBaseNet.
+
+    Args:
+        config (Dict[str, Any], optional): Configuration parameters for the Electra model. Defaults to None.
+        pretrained_checkpoint (str, optional): Path to the pretrained checkpoint file. Defaults to None.
+        load_prefix (str, optional): Prefix to filter the state_dict keys from the pretrained checkpoint. Defaults to None.
+        **kwargs: Additional keyword arguments.
+
+    """

     def __init__(
         self,
@@ -262,49 +309,6 @@ def __init__(
         else:
             self.electra = ElectraModel(config=self.config)

-    def _process_for_loss(
-        self,
-        model_output: Dict[str, Tensor],
-        labels: Tensor,
-        loss_kwargs: Dict[str, Any],
-    ) -> Tuple[Tensor, Tensor, Dict[str, Any]]:
-        """
-        Process the model output for calculating the loss.
-
-        Args:
-            model_output (Dict[str, Tensor]): The output of the model.
-            labels (Tensor): The target labels.
-            loss_kwargs (Dict[str, Any]): Additional loss arguments.
-
-        Returns:
-            tuple: A tuple containing the processed model output, labels, and loss arguments.
-        """
-        kwargs_copy = dict(loss_kwargs)
-        if labels is not None:
-            labels = labels.float()
-        return model_output["logits"], labels, kwargs_copy
-
-    def _get_prediction_and_labels(
-        self, data: Dict[str, Any], labels: Tensor, model_output: Dict[str, Tensor]
-    ) -> Tuple[Tensor, Tensor]:
-        """
-        Get the predictions and labels from the model output. Applies a sigmoid to the model output.
-
-        Args:
-            data (Dict[str, Any]): The input data.
-            labels (Tensor): The target labels.
-            model_output (Dict[str, Tensor]): The output of the model.
-
-        Returns:
-            tuple: A tuple containing the predictions and labels.
-        """
-        d = model_output["logits"]
-        loss_kwargs = data.get("loss_kwargs", dict())
-        if "non_null_labels" in loss_kwargs:
-            n = loss_kwargs["non_null_labels"]
-            d = d[n]
-        return torch.sigmoid(d), labels.int() if labels is not None else None
-
     def forward(self, data: Dict[str, Tensor], **kwargs: Any) -> Dict[str, Any]:
         """
         Forward pass of the Electra model.

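Note (not part of the commit): neither _process_for_loss nor _get_prediction_and_labels reads any instance attributes, so the refactored mixin can be sanity-checked in isolation. A minimal sketch, assuming the chebai package from this repository is importable:

import torch

from chebai.models.electra import ElectraProcessingMixIn

mixin = ElectraProcessingMixIn()
fake_output = {"logits": torch.randn(4, 3)}   # pretend batch of 4 samples, 3 labels
labels = torch.randint(0, 2, (4, 3))

# _process_for_loss passes the logits through and casts labels to float.
logits, float_labels, loss_kwargs = mixin._process_for_loss(fake_output, labels, {})

# _get_prediction_and_labels applies a sigmoid and casts labels to int.
preds, int_labels = mixin._get_prediction_and_labels({"loss_kwargs": {}}, labels, fake_output)

print(float_labels.dtype, int_labels.dtype)        # torch.float32 torch.int32
print(preds.min().item() >= 0, preds.max().item() <= 1)   # True True: predictions are probabilities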
chebai/models/electra_box.py

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
+import torch
+import torch.nn as nn
+from transformers import ElectraConfig, ElectraModel
+
+from chebai.models.base import ChebaiBaseNet
+from chebai.models.electra import ElectraProcessingMixIn, filter_dict
+
+
+class ElectraBox(ElectraProcessingMixIn, ChebaiBaseNet):
+    NAME = "ElectraBox"
+
+    def __init__(
+        self, config=None, pretrained_checkpoint=None, load_prefix=None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        if config is None:
+            config = dict()
+        if "num_labels" not in config and self.out_dim is not None:
+            config["num_labels"] = self.out_dim
+        self.config = ElectraConfig(**config, output_attentions=True)
+        self.word_dropout = nn.Dropout(config.get("word_dropout", 0))
+
+        self.in_dim = self.config.hidden_size
+        self.hidden_dim = self.config.embeddings_to_points_hidden_size
+        self.out_dim = self.config.embeddings_dimensions
+        self.boxes = nn.Parameter(torch.rand((self.config.num_labels, self.out_dim, 2)))
+        self.embeddings_to_points = nn.Sequential(
+            nn.Linear(self.in_dim, self.hidden_dim),
+            nn.ReLU(),
+            nn.Linear(self.hidden_dim, self.hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            nn.Linear(self.hidden_dim, self.out_dim),
+        )
+
+        if pretrained_checkpoint:
+            with open(pretrained_checkpoint, "rb") as fin:
+                model_dict = torch.load(fin, map_location=self.device)
+                if load_prefix:
+                    state_dict = filter_dict(model_dict["state_dict"], load_prefix)
+                else:
+                    state_dict = model_dict["state_dict"]
+                self.electra = ElectraModel.from_pretrained(
+                    None, state_dict=state_dict, config=self.config
+                )
+        else:
+            self.electra = ElectraModel(config=self.config)
+
+    def forward(self, data, **kwargs):
+        self.batch_size = data["features"].shape[0]
+        inp = self.electra.embeddings.forward(data["features"])
+        inp = self.word_dropout(inp)
+        electra = self.electra(inputs_embeds=inp)
+        d = electra.last_hidden_state[:, 0, :]
+
+        points = self.embeddings_to_points(d)
+
+        b = self.boxes.expand(self.batch_size, -1, -1, -1)
+        raw_l = torch.min(b, dim=-1)[0]
+        raw_r = torch.max(b, dim=-1)[0]
+
+        left = raw_l + ((raw_r - raw_l) * 0.2)
+        right = raw_r - ((raw_r - raw_l) * 0.2)
+
+        p = points.expand(self.config.num_labels, -1, -1).transpose(1, 0)
+        max_distance_per_dim = torch.max(
+            torch.stack((nn.functional.relu(left - p), nn.functional.relu(p - right))),
+            dim=0,
+        )[0]
+
+        m = torch.sum(max_distance_per_dim, dim=-1)
+        s = 2 - (2 * (torch.sigmoid(m)))
+        logits = torch.logit((s * 0.99) + 0.001)
+
+        return dict(
+            boxes=b,
+            embedded_points=points,
+            logits=logits,
+            attentions=electra.attentions,
+            target_mask=data.get("target_mask"),
+        )
+
+
+if __name__ == "__main__":
+    model = ElectraBox(
+        config={
+            "vocab_size": 4400,
+            "max_position_embeddings": 1800,
+            "num_attention_heads": 8,
+            "num_hidden_layers": 6,
+            "type_vocab_size": 1,
+            "hidden_size": 256,
+            "embeddings_to_points_hidden_size": 1200,
+            "embeddings_dimensions": 16,
+        },
+        out_dim=120,
+        input_dim=1800,
+    )
+    import torch
+
+    print(
+        model._process_for_loss(
+            torch.randint(0, 4400, (2, 1800)), torch.randint(0, 2, (2, 120))
+        )
+    )

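The new forward pass treats each class as an axis-aligned box in a low-dimensional embedding space and each molecule as a point: a class is predicted when the point lies inside a slightly shrunken version of that class's box. The following standalone sketch, not part of the commit, reproduces just this scoring step with random tensors; the sizes match the config below but are otherwise illustrative:

import torch
import torch.nn as nn

batch_size, num_labels, dim = 4, 120, 16          # illustrative sizes

points = torch.randn(batch_size, dim)             # one embedded point per sample
boxes = torch.rand(num_labels, dim, 2)            # two corners per label box

b = boxes.expand(batch_size, -1, -1, -1)
raw_l = torch.min(b, dim=-1)[0]                   # lower corner per (label, dimension)
raw_r = torch.max(b, dim=-1)[0]                   # upper corner per (label, dimension)

# Shrink each box by 20% on both sides, as in the commit.
left = raw_l + (raw_r - raw_l) * 0.2
right = raw_r - (raw_r - raw_l) * 0.2

# Per-dimension violation: zero if the coordinate lies inside [left, right].
p = points.expand(num_labels, -1, -1).transpose(1, 0)
violation = torch.max(
    torch.stack((nn.functional.relu(left - p), nn.functional.relu(p - right))), dim=0
)[0]

m = violation.sum(dim=-1)                         # total violation per (sample, label)
s = 2 - 2 * torch.sigmoid(m)                      # 1 inside the box, approaches 0 far outside
logits = torch.logit(s * 0.99 + 0.001)            # keeps the argument strictly inside (0, 1)

print(logits.shape)                               # torch.Size([4, 120])

A point inside a box gives m = 0, hence s = 1 and a positive logit of roughly 4.7; a point far outside drives s toward 0 and the logit toward roughly -6.9. The sigmoid applied later in _get_prediction_and_labels then turns each logit back into a per-class membership probability.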
configs/model/box-electra.yml

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+class_path: chebai.models.electra_box.ElectraBox
+init_args:
+  optimizer_kwargs:
+    lr: 1e-3
+  config:
+    vocab_size: 4400
+    max_position_embeddings: 1800
+    num_attention_heads: 8
+    num_hidden_layers: 6
+    type_vocab_size: 1
+    hidden_size: 256
+    embeddings_to_points_hidden_size: 1200
+    embeddings_dimensions: 16

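The YAML above follows the class_path / init_args layout used by jsonargparse-based CLIs, so the training entry point presumably instantiates the model from it directly. Purely as an illustration of that layout (not how the repository's CLI is actually invoked), the file can also be resolved by hand; out_dim is not part of the config and is normally supplied elsewhere, so the value below is made up to match the __main__ block in electra_box.py:

import importlib

import yaml  # assumes PyYAML is installed

with open("configs/model/box-electra.yml") as f:
    cfg = yaml.safe_load(f)

module_name, class_name = cfg["class_path"].rsplit(".", 1)
model_cls = getattr(importlib.import_module(module_name), class_name)

# out_dim=120 is an illustrative value; optimizer_kwargs and config come from init_args.
model = model_cls(out_dim=120, **cfg["init_args"])
print(type(model).__name__)   # ElectraBox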