OpenTabular
diff --git a/‎README.md‎
Lines changed: 4 additions & 1 deletion b/‎README.md‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎mambular/__version__.py‎
Lines changed: 1 addition & 1 deletion b/‎mambular/__version__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎mambular/base_models/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎mambular/base_models/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎mambular/base_models/tangos.py‎
Lines changed: 228 additions & 0 deletions b/‎mambular/base_models/tangos.py‎
Lines changed: 228 additions & 0 deletions
diff --git a/‎mambular/base_models/utils/lightning_wrapper.py‎
Lines changed: 11 additions & 11 deletions b/‎mambular/base_models/utils/lightning_wrapper.py‎
Lines changed: 11 additions & 11 deletions
diff --git a/‎mambular/base_models/utils/pretraining.py‎
Lines changed: 6 additions & 6 deletions b/‎mambular/base_models/utils/pretraining.py‎
Lines changed: 6 additions & 6 deletions
@@ -23,6 +23,8 @@ Mambular is a Python library for tabular deep learning. It includes models that
 
 <h3>⚡ What's New ⚡</h3>
 <ul>
+  <li>New Models: <code>Tangos</code>, <code>AutoInt</code>, <code>Trompt</code></li>
+  <li>Optional pretraining for suitable models.</li>
   <li>Individual preprocessing: preprocess each feature differently, use pre-trained models for categorical encoding</li>
   <li>Extract latent representations of tables</li>
   <li>Use embeddings as inputs</li>
@@ -78,7 +80,8 @@ Mambular is a Python package that brings the power of advanced deep learning arc
 | `NDTF`           | A neural decision forest using soft decision trees. See [Kontschieder et al.](https://openaccess.thecvf.com/content_iccv_2015/html/Kontschieder_Deep_Neural_Decision_ICCV_2015_paper.html) for inspiration. |
 | `SAINT`          | Improve neural networks via Row Attention and Contrastive Pre-Training, introduced [here](https://arxiv.org/pdf/2106.01342).                        |
 | `AutoInt`        | Automatic Feature Interaction Learning via Self-Attentive Neural Networks introduced [here](https://arxiv.org/abs/1810.11921).                      |
-| `Trompt `        | Trompt: Towards a Better Deep Neural Network for Tabular Data introduced [here](https://arxiv.org/abs/2305.18446).                                  |
+| `Trompt`        | Trompt: Towards a Better Deep Neural Network for Tabular Data introduced [here](https://arxiv.org/abs/2305.18446).                                  |
+| `Tangos`        | Tangos: Regularizing Tabular Neural Networks through Gradient Orthogonalization and Specialization introduced [here](https://openreview.net/pdf?id=n6H86gW8u0d).                                  |
 
 
 
 
@@ -17,5 +17,5 @@
 
 # The following line *must* be the last in the module, exactly as formatted:
 
-__version__ = "1.3.0"
+__version__ = "1.3.1"
 
@@ -13,8 +13,10 @@
 from .autoint import AutoInt
 from .trompt import Trompt
 from .enode import ENODE
+from .tangos import Tangos
 
 __all__ = [
+    "Tangos",
     "ENODE",
     "Trompt",
     "AutoInt",
 
@@ -0,0 +1,228 @@
+import torch
+import torch.nn as nn
+import numpy as np
+from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer
+from ..configs.tangos_config import DefaultTangosConfig
+from ..utils.get_feature_dimensions import get_feature_dimensions
+from .utils.basemodel import BaseModel
+
+
+class Tangos(BaseModel):
+    """
+    A Multi-Layer Perceptron (MLP) model with optional GLU activation, batch normalization, layer normalization, and dropout. 
+    It includes a penalty term for specialization and orthogonality.
+
+    Parameters
+    ----------
+    feature_information : tuple
+        A tuple containing feature information for numerical and categorical features.
+    num_classes : int, optional (default=1)
+        The number of output classes.
+    config : DefaultTangosConfig, optional (default=DefaultTangosConfig())
+        Configuration object defining model hyperparameters.
+    **kwargs : dict
+        Additional arguments for the base model.
+
+    Attributes
+    ----------
+    returns_ensemble : bool
+        Whether the model returns an ensemble of predictions.
+    lamda1 : float
+        Regularization weight for the specialization loss.
+    lamda2 : float
+        Regularization weight for the orthogonality loss.
+    subsample : float
+        Proportion of neuron pairs to use for orthogonality loss calculation.
+    embedding_layer : EmbeddingLayer or None
+        Optional embedding layer for categorical features.
+    layers : nn.ModuleList
+        The main MLP layers including linear, normalization, and activation layers.
+    head : nn.Linear
+        The final output layer.
+    """
+    def __init__(
+        self,
+        feature_information: tuple,
+        num_classes=1,
+        config: DefaultTangosConfig = DefaultTangosConfig(),
+        **kwargs
+    ):
+        super().__init__(config=config, **kwargs)
+        self.save_hyperparameters(ignore=["feature_information"])
+        self.returns_ensemble = False
+
+        self.lamda1 = config.lamda1
+        self.lamda2 = config.lamda2
+        self.subsample = config.subsample
+
+        input_dim = get_feature_dimensions(*feature_information)
+
+        # Initialize layers
+        self.layers = nn.ModuleList()
+
+        # Input layer
+        self.layers.append(nn.Linear(input_dim, self.hparams.layer_sizes[0]))
+        if self.hparams.batch_norm:
+            self.layers.append(nn.BatchNorm1d(self.hparams.layer_sizes[0]))
+
+        if self.hparams.use_glu:
+            self.layers.append(nn.GLU())
+        else:
+            self.layers.append(self.hparams.activation)
+        if self.hparams.dropout > 0.0:
+            self.layers.append(nn.Dropout(self.hparams.dropout))
+
+        # Hidden layers
+        for i in range(1, len(self.hparams.layer_sizes)):
+            self.layers.append(
+                nn.Linear(self.hparams.layer_sizes[i - 1], self.hparams.layer_sizes[i])
+            )
+            if self.hparams.batch_norm:
+                self.layers.append(nn.BatchNorm1d(self.hparams.layer_sizes[i]))
+            if self.hparams.layer_norm:
+                self.layers.append(nn.LayerNorm(self.hparams.layer_sizes[i]))
+            if self.hparams.use_glu:
+                self.layers.append(nn.GLU())
+            else:
+                self.layers.append(self.hparams.activation)
+            if self.hparams.dropout > 0.0:
+                self.layers.append(nn.Dropout(self.hparams.dropout))
+
+        # Output layer
+        self.head = nn.Linear(self.hparams.layer_sizes[-1], num_classes)
+
+    def repr_forward(self, x) -> torch.Tensor:
+        """
+        Computes the forward pass for feature representations.
+
+        This method processes the input through the MLP layers, optionally using 
+        skip connections.
+
+        Parameters
+        ----------
+        x : torch.Tensor
+            Input tensor of shape (batch_size, feature_dim).
+
+        Returns
+        -------
+        torch.Tensor
+            Output tensor after passing through the representation layers.
+        """
+
+        x = x.unsqueeze(0)
+
+        for i in range(len(self.layers)):
+            if isinstance(self.layers[i], nn.Linear):
+                out = self.layers[i](x)
+                if self.hparams.skip_connections and x.shape == out.shape:
+                    x = x + out
+                else:
+                    x = out
+            else:
+                x = self.layers[i](x)
+
+        return x
+
+    def forward(self, *data) -> torch.Tensor:
+        """
+        Performs a forward pass of the MLP model.
+
+        This method concatenates all input tensors before applying MLP layers.
+
+        Parameters
+        ----------
+        data : tuple
+            A tuple containing lists of numerical, categorical, and embedded feature tensors.
+
+        Returns
+        -------
+        torch.Tensor
+            The output tensor of shape (batch_size, num_classes).
+        """
+
+        x = torch.cat([t for tensors in data for t in tensors], dim=1)
+
+        for i in range(len(self.layers)):
+            if isinstance(self.layers[i], nn.Linear):
+                out = self.layers[i](x)
+                if self.hparams.skip_connections and x.shape == out.shape:
+                    x = x + out
+                else:
+                    x = out
+            else:
+                x = self.layers[i](x)
+        x = self.head(x)
+        return x
+
+    def penalty_forward(self, *data):
+        """
+        Computes both the model predictions and a penalty term.
+
+        The penalty term includes:
+        - **Specialization loss**: Measures feature importance concentration.
+        - **Orthogonality loss**: Encourages diversity among learned features.
+
+        The method uses `jacrev` to compute the Jacobian of the representation function.
+
+        Parameters
+        ----------
+        data : tuple
+            A tuple containing lists of numerical, categorical, and embedded feature tensors.
+
+        Returns
+        -------
+        tuple
+            - predictions : torch.Tensor
+                Model predictions of shape (batch_size, num_classes).
+            - penalty : torch.Tensor
+                The computed penalty term for regularization.
+        """
+
+        x = torch.cat([t for tensors in data for t in tensors], dim=1)
+        batch_size = x.shape[0]
+        subsample = np.int32(self.subsample*batch_size)
+
+        # Flatten before passing to jacrev
+        flat_data = torch.cat([t for tensors in data for t in tensors], dim=1)      
+
+        # Compute Jacobian
+        jacobian = torch.func.vmap(torch.func.jacrev(self.repr_forward), randomness="different")(flat_data)
+        jacobian = jacobian.squeeze()
+
+        neuron_attr = jacobian.swapaxes(0, 1)
+        h_dim = neuron_attr.shape[0]
+        if len(neuron_attr.shape) > 3:
+            # h_dim x batch_size x features
+            neuron_attr = neuron_attr.flatten(start_dim=2)
+
+        # calculate specialization loss component
+        spec_loss = torch.norm(neuron_attr, p=1) / (batch_size * h_dim * neuron_attr.shape[2])
+        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
+        orth_loss = torch.tensor(0.0, requires_grad=True).to(x.device)
+        # apply subsampling routine for orthogonalization loss
+        if self.subsample > 0 and self.subsample < h_dim * (h_dim - 1) / 2:
+            tensor_pairs = [
+                list(np.random.choice(h_dim, size=(2), replace=False))
+                for i in range(subsample)
+            ]
+            for tensor_pair in tensor_pairs:
+                pairwise_corr = cos(
+                    neuron_attr[tensor_pair[0], :, :], neuron_attr[tensor_pair[1], :, :]
+                ).norm(p=1)
+                orth_loss = orth_loss + pairwise_corr
+
+            orth_loss = orth_loss / (batch_size * self.subsample)
+        else:
+            for neuron_i in range(1, h_dim):
+                for neuron_j in range(0, neuron_i):
+                    pairwise_corr = cos(
+                        neuron_attr[neuron_i, :, :], neuron_attr[neuron_j, :, :]
+                    ).norm(p=1)
+                    orth_loss = orth_loss + pairwise_corr
+            num_pairs = h_dim * (h_dim - 1) / 2
+            orth_loss = orth_loss / (batch_size * num_pairs)
+
+        penalty = self.lamda1 * spec_loss + self.lamda2 * orth_loss
+        predictions = self.forward(*data)
+
+        return predictions, penalty
@@ -89,7 +89,7 @@ def __init__(
         else:
             output_dim = num_classes
 
-        self.base_model = model_class(
+        self.estimator = model_class(
             config=config,
             feature_information=feature_information,
             num_classes=output_dim,
@@ -112,7 +112,7 @@ def forward(self, num_features, cat_features, embeddings):
             Model output.
         """
 
-        return self.base_model.forward(num_features, cat_features, embeddings)
+        return self.estimator.forward(num_features, cat_features, embeddings)
 
     def compute_loss(self, predictions, y_true):
         """Compute the loss for the given predictions and true labels.
@@ -130,7 +130,7 @@ def compute_loss(self, predictions, y_true):
             Computed loss.
         """
         if self.lss:
-            if getattr(self.base_model, "returns_ensemble", False):
+            if getattr(self.estimator, "returns_ensemble", False):
                 loss = 0.0
                 for ensemble_member in range(predictions.shape[1]):
                     loss += self.family.compute_loss(  # type: ignore
@@ -143,7 +143,7 @@ def compute_loss(self, predictions, y_true):
                     y_true.squeeze(-1),
                 )
 
-        if getattr(self.base_model, "returns_ensemble", False):  # Ensemble case
+        if getattr(self.estimator, "returns_ensemble", False):  # Ensemble case
             if (
                 self.loss_fct.__class__.__name__ == "CrossEntropyLoss"
                 and predictions.dim() == 3
@@ -191,8 +191,8 @@ def training_step(self, batch, batch_idx):  # type: ignore
         data, labels = batch
 
         # Check if the model has a `penalty_forward` method
-        if hasattr(self.base_model, "penalty_forward"):
-            preds, penalty = self.base_model.penalty_forward(*data)
+        if hasattr(self.estimator, "penalty_forward"):
+            preds, penalty = self.estimator.penalty_forward(*data)
             loss = self.compute_loss(preds, labels) + penalty
         else:
             preds = self(*data)
@@ -396,7 +396,7 @@ def configure_optimizers(self):  # type: ignore
 
         # Initialize the optimizer with the chosen class and parameters
         optimizer = optimizer_class(
-            self.base_model.parameters(),
+            self.estimator.parameters(),
             lr=self.lr,
             weight_decay=self.weight_decay,
             **self.optimizer_params,  # Pass any additional optimizer-specific parameters
@@ -443,9 +443,9 @@ def pretrain_embeddings(
             Path to save the pretrained embeddings.
         """
         print("🚀 Pretraining embeddings...")
-        self.base_model.train()
+        self.estimator.train()
 
-        optimizer = torch.optim.Adam(self.base_model.embedding_parameters(), lr=lr)
+        optimizer = torch.optim.Adam(self.estimator.embedding_parameters(), lr=lr)
 
         # 🔥 Single tqdm progress bar across all epochs and batches
         total_batches = pretrain_epochs * len(train_dataloader)
@@ -459,7 +459,7 @@ def pretrain_embeddings(
                 optimizer.zero_grad()
 
                 # Forward pass through embeddings only
-                embeddings = self.base_model.encode(data, grad=True)
+                embeddings = self.estimator.encode(data, grad=True)
 
                 # Compute nearest neighbors based on task type
                 knn_indices = self.get_knn(labels, k_neighbors, regression)
@@ -481,7 +481,7 @@ def pretrain_embeddings(
         progress_bar.close()
 
         # Save pretrained embeddings
-        torch.save(self.base_model.get_embedding_state_dict(), save_path)
+        torch.save(self.estimator.get_embedding_state_dict(), save_path)
         print(f"✅ Embeddings saved to {save_path}")
 
     def get_knn(self, labels, k_neighbors=5, regression=True, device=""):
 
@@ -20,8 +20,8 @@ def __init__(
         pool_sequence=True,
     ):
         super().__init__()
-        self.base_model = base_model
-        self.base_model.eval()
+        self.estimator = base_model
+        self.estimator.eval()
         self.k_neighbors = k_neighbors
         self.temperature = temperature
         self.lr = lr
@@ -33,9 +33,9 @@ def __init__(
         self.loss_fn = nn.CosineEmbeddingLoss(margin=margin, reduction="mean")
 
     def forward(self, x):
-        x = self.base_model.encode(x, grad=True)
+        x = self.estimator.encode(x, grad=True)
         if self.pool_sequence:
-            return self.base_model.pool_sequence(x)
+            return self.estimator.pool_sequence(x)
         return x  # Return unpooled sequence embeddings (N, S, D)
 
     def get_knn(self, labels):
@@ -140,7 +140,7 @@ def contrastive_loss(self, embeddings, knn_indices, neg_indices):
 
     def training_step(self, batch, batch_idx):
 
-        self.base_model.embedding_layer.train()
+        self.estimator.embedding_layer.train()
 
         data, labels = batch
         embeddings = self(data)
@@ -173,7 +173,7 @@ def validation_step(self, batch, batch_idx):
         return loss
 
     def configure_optimizers(self):
-        params = chain(self.base_model.parameters())
+        params = chain(self.estimator.parameters())
         return torch.optim.Adam(params, lr=self.lr)
Original file line number	Diff line number	Diff line change
`@@ -17,5 +17,5 @@`
`17`	`17`
`18`	`18`	`# The following line must be the last in the module, exactly as formatted:`
`19`	`19`
`20`		`-__version__ = "1.3.0"`
	`20`	`+__version__ = "1.3.1"`
`21`	`21`