
Commit c8c0141

Fix non-diversity of training samples during finetuning
1 parent 877e65b commit c8c0141
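
The core of the change: the (context, query) split and the finetuning DataLoader used to be built once before the epoch loop, so every epoch trained on identical splits; they are now rebuilt at the start of each epoch with an epoch-dependent random_state, and the DataLoader is shuffled. Below is a minimal sketch of that pattern, not code from the commit: make_epoch_dataloader and its parameters are illustrative names, while get_preprocessed_datasets and meta_dataset_collator are the existing TabPFN APIs that appear in the diff.

from functools import partial

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

from tabpfn.utils import meta_dataset_collator


def make_epoch_dataloader(clf, X_train, y_train, epoch, *, base_seed,
                          split_ratio, context_size, batch_size):
    # Illustrative helper (not part of the library): rebuild the per-epoch
    # data pipeline. Varying the seed by epoch yields different
    # (context, query) pairs in every epoch.
    splitter = partial(
        train_test_split,
        test_size=split_ratio,
        random_state=base_seed + epoch,
    )
    datasets = clf.get_preprocessed_datasets(
        X_train, y_train, splitter, context_size, equal_split_size=False,
    )
    # shuffle=True also draws the meta-batches in a different order each epoch.
    return DataLoader(
        datasets, batch_size=batch_size,
        collate_fn=meta_dataset_collator, shuffle=True,
    )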

2 files changed: 65 additions, 76 deletions

examples/finetune/finetune_example.py

Lines changed: 22 additions & 16 deletions
@@ -1,18 +1,23 @@
-from sklearn.datasets import fetch_covtype
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import log_loss, roc_auc_score
 import numpy as np
 import torch
-from tabpfn import TabPFNClassifier
+from sklearn.datasets import fetch_covtype
+from sklearn.metrics import log_loss, roc_auc_score
+from sklearn.model_selection import train_test_split
 
+from tabpfn import TabPFNClassifier
 from tabpfn_extensions.finetune.finetune_classifier import FinetunedTabPFNClassifier
 
 # 1. Load and prepare the data
 # We use a small subset for a quick demonstration.
 print("--- 1. Loading Data ---")
 X_all, y_all = fetch_covtype(return_X_y=True, shuffle=True)
-X, y = X_all[:10000], y_all[:10000]
+X, y = X_all[:11000], y_all[:11000]
+
+# df = pd.read_csv("/home/anurag_priorlabs_ai/tabpfn-extensions/PrudentialLifeInsuranceAssessment.csv")
 
+# print(df.columns)
+# X = df.drop(columns=["Id", "Response"])
+# y = df["Response"]
 # Create a final hold-out test set. This is NOT used during fine-tuning.
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, random_state=42, stratify=y
@@ -22,17 +27,16 @@
 # Calculate ROC AUC
 def calculate_roc_auc(y_true: np.ndarray, y_pred_proba: np.ndarray) -> float:
     if len(np.unique(y_true)) == 2:
-        return roc_auc_score(y_true, y_pred_proba[:, 1])
-    else:
-        return roc_auc_score(y_true, y_pred_proba, multi_class="ovr", average="weighted")
+        return roc_auc_score(y_true, y_pred_proba[:, 1])  # pyright: ignore[reportReturnType]
+    return roc_auc_score(y_true, y_pred_proba, multi_class="ovr", average="weighted")  # pyright: ignore[reportReturnType]
 
 # 2. Initial model evaluation on test set
 
-base_clf = TabPFNClassifier(device='cuda' if torch.cuda.is_available() else 'cpu', n_estimators=2)
+base_clf = TabPFNClassifier(device="cuda" if torch.cuda.is_available() else "cpu", n_estimators=2)
 base_clf.fit(X_train, y_train)
 
 base_pred_proba = base_clf.predict_proba(X_test)
-roc_auc = calculate_roc_auc(y_test, base_pred_proba)
+roc_auc = calculate_roc_auc(y_test, base_pred_proba)  # pyright: ignore[reportReturnType, reportArgumentType]
 log_loss_score = log_loss(y_test, base_pred_proba)
 
 print(f"📊 Initial Test ROC: {roc_auc:.4f}")
@@ -43,25 +47,27 @@ def calculate_roc_auc(y_true: np.ndarray, y_pred_proba: np.ndarray) -> float:
 
 # Instantiate the wrapper with your desired hyperparameters
 finetuned_clf = FinetunedTabPFNClassifier(
-    device='cuda' if torch.cuda.is_available() else 'cpu',
+    device="cuda" if torch.cuda.is_available() else "cpu",
     epochs=10,
-    learning_rate=1e-5,
+    learning_rate=1e-6,
     n_inference_context_samples=10_000,
     finetune_split_ratio=0.3,
     random_state=42,
     n_estimators=2,
-    patience=3
+    patience=3,
+    ignore_pretraining_limits=True,
+    grad_clip_value=1.0,
 )
 
 # 4. Call .fit() to start the fine-tuning process on the training data
-finetuned_clf.fit(X_train, y_train)
+finetuned_clf.fit(X_train, y_train)  # pyright: ignore[reportArgumentType]
 print("\n")
 
 # 5. Evaluate the fine-tuned model
 print("--- 3. Evaluating Model on Held-out Test Set ---\n")
-y_pred_proba = finetuned_clf.predict_proba(X_test)
+y_pred_proba = finetuned_clf.predict_proba(X_test)  # pyright: ignore[reportArgumentType]
 
-roc_auc = calculate_roc_auc(y_test, y_pred_proba)
+roc_auc = calculate_roc_auc(y_test, y_pred_proba)  # pyright: ignore[reportArgumentType]
 loss = log_loss(y_test, y_pred_proba)
 
 print(f"📊 Final Test ROC: {roc_auc:.4f}")

src/tabpfn_extensions/finetune/finetune_classifier.py

Lines changed: 43 additions & 60 deletions
@@ -3,10 +3,9 @@
 
 from __future__ import annotations
 
+import copy
 import logging
-import tempfile
 from functools import partial
-from pathlib import Path
 from typing import Any
 
 import numpy as np
@@ -24,7 +23,6 @@
 
 from tabpfn import TabPFNClassifier
 from tabpfn.finetune_utils import clone_model_for_evaluation
-from tabpfn.model_loading import load_fitted_tabpfn_model, save_fitted_tabpfn_model
 from tabpfn.utils import meta_dataset_collator
 
 # Configure logging to show INFO level messages (including validation metrics)
@@ -188,15 +186,15 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> FinetunedTabPFNClassifier:
         X_train, X_val, y_train, y_val = validation_splitter(X, y)
 
         # Calculate the context size used during finetuning
-        context_size = min(self.n_inference_context_samples, len(y_train))
-        print(f"Context size: {context_size}")
+        n_finetuning_fit_predict_context_samples = min(self.n_inference_context_samples, len(y_train))
 
+        # Unpack kwargs to allow any TabPFNClassifier hyperparameter to be specified,
+        # then override with required config values
         classifier_config = {
+            **self.kwargs,
             "ignore_pretraining_limits": True,
             "device": self.device,
-            "n_estimators": self.kwargs.get("n_estimators", 8),
             "random_state": self.random_state,
-            # inference_precision": torch.float32,
         }
 
         # Initialize the base TabPFNClassifier
@@ -207,38 +205,15 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> FinetunedTabPFNClassifier:
         )
         # Required to access model parameters for the optimizer
         self.finetuned_classifier_._initialize_model_variables()
+        self.finetuned_classifier_.softmax_temperature_ = self.finetuned_classifier_.softmax_temperature
 
         eval_config = {
             **classifier_config,
             "inference_config": {
-                "SUBSAMPLE_SAMPLES": context_size,  # Passing this to the dataloader causes an error, so we set eval config separately from the classifier config.
+                "SUBSAMPLE_SAMPLES": n_finetuning_fit_predict_context_samples,  # Passing this to the dataloader causes an error, so we set eval config separately from the classifier config.
             },
         }
 
-        # Prepare data for the fine-tuning loop
-        # This splitter function will be applied to the training data to create
-        # (context, query) pairs for each step of the loop.
-
-        training_splitter = partial(
-            train_test_split,
-            test_size=self.finetune_split_ratio,
-            random_state=self.random_state,
-        )
-
-        training_datasets = self.finetuned_classifier_.get_preprocessed_datasets(
-            X_train,
-            y_train,
-            training_splitter,
-            context_size,
-            equal_split_size=False,
-        )
-
-        finetuning_dataloader = DataLoader(
-            training_datasets,
-            batch_size=self.meta_batch_size,
-            collate_fn=meta_dataset_collator,
-        )
-
         # Setup optimizer and loss function
         optimizer = Adam(
             self.finetuned_classifier_.model_.parameters(),  # type: ignore
@@ -259,9 +234,34 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> FinetunedTabPFNClassifier:
         # Early stopping variables
        best_roc_auc = -np.inf
         patience_counter = 0
-        best_model_path = None
+        best_model = None
 
         for epoch in range(self.epochs):
+            # Regenerate datasets each epoch with a different random_state to ensure
+            # diversity in context/query pairs across epochs. This prevents the model
+            # from seeing the exact same splits in every epoch, which could reduce
+            # training signal diversity.
+            training_splitter = partial(
+                train_test_split,
+                test_size=self.finetune_split_ratio,
+                random_state=self.random_state + epoch,
+            )
+
+            training_datasets = self.finetuned_classifier_.get_preprocessed_datasets(
+                X_train,
+                y_train,
+                training_splitter,
+                n_finetuning_fit_predict_context_samples,
+                equal_split_size=False,
+            )
+
+            finetuning_dataloader = DataLoader(
+                training_datasets,
+                batch_size=self.meta_batch_size,
+                collate_fn=meta_dataset_collator,
+                shuffle=True,
+            )
+
             progress_bar = tqdm(
                 finetuning_dataloader,
                 desc=f"Finetuning Epoch {epoch + 1}/{self.epochs}",
@@ -274,7 +274,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> FinetunedTabPFNClassifier:
                 cat_ixs,
                 confs,
             ) in progress_bar:
-
+
                 ctx = set(np.unique(y_context_batch))
                 qry = set(np.unique(y_query_batch))
                 if not qry.issubset(ctx):
@@ -285,11 +285,11 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> FinetunedTabPFNClassifier:
 
                 if (
                     X_context_batch[0].shape[1] + X_query_batch[0].shape[1]
-                    != context_size
+                    != n_finetuning_fit_predict_context_samples
                 ):
                     actual_size = X_context_batch[0].shape[1] + X_query_batch[0].shape[1]
                     logging.warning(
-                        f"Skipping batch: total batch size {actual_size} does not match context size {context_size}"
+                        f"Skipping batch: total batch size {actual_size} does not match n_finetuning_fit_predict_context_samples {n_finetuning_fit_predict_context_samples}"
                     )
                     continue
 
@@ -361,7 +361,7 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> FinetunedTabPFNClassifier:
                 y_train,  # pyright: ignore[reportArgumentType]
                 X_val,  # pyright: ignore[reportArgumentType]
                 y_val,  # pyright: ignore[reportArgumentType]
-            )
+            )
 
             logging.info(
                 f"📊 Epoch {epoch + 1} Evaluation | Val ROC: {roc_auc:.4f}, Val Log Loss: {log_loss_score:.4f}\n",
@@ -375,16 +375,8 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> FinetunedTabPFNClassifier:
             if roc_auc > best_roc_auc + self.min_delta:
                 best_roc_auc = roc_auc
                 patience_counter = 0
-                # Save the best model using TabPFN's official save function
-                with tempfile.NamedTemporaryFile(
-                    suffix=".tabpfn_fit",
-                    delete=False,
-                ) as tmp_file:
-                    best_model_path = Path(tmp_file.name)
-                save_fitted_tabpfn_model(
-                    self.finetuned_classifier_,
-                    best_model_path,
-                )
+                # Save the best model
+                best_model = copy.deepcopy(self.finetuned_classifier_)
             else:
                 patience_counter += 1
                 logging.info(
@@ -394,28 +386,19 @@ def fit(self, X: np.ndarray, y: np.ndarray) -> FinetunedTabPFNClassifier:
             if patience_counter >= self.patience:
                 logging.info(
                     f"🛑 Early stopping triggered. Best ROC AUC: {best_roc_auc:.4f}",
-                )
-                # Restore the best model using TabPFN's official load function
-                if best_model_path is not None:
-                    self.finetuned_classifier_ = load_fitted_tabpfn_model(
-                        best_model_path,
-                        device=self.device,
                 )
-                # Clean up the temporary file
-                best_model_path.unlink(missing_ok=True)
+                # Restore the best model
+                if best_model is not None:
+                    self.finetuned_classifier_ = best_model
                 break
 
         logging.info("--- ✅ Fine-tuning Finished ---")
 
-        # Clean up temporary file if early stopping didn't trigger
-        if best_model_path is not None and best_model_path.exists():
-            best_model_path.unlink(missing_ok=True)
-
         finetuned_inference_classifier = clone_model_for_evaluation(
             self.finetuned_classifier_,  # type: ignore
             eval_config,
             TabPFNClassifier,
-        )
+        )
         self.finetuned_inference_classifier_ = finetuned_inference_classifier
         self.finetuned_inference_classifier_.fit_mode = "fit_preprocessors"  # type: ignore
         self.finetuned_inference_classifier_.fit(self.X_, self.y_)  # type: ignore