Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,21 @@
- **rf_pfn**: Combine TabPFN with decision trees and random forests
- **unsupervised**: Data generation and outlier detection
- **embedding**: Get TabPFN's internal dense sample embeddings
- **tabpfgen_datasynthesizer**: Synthetic tabular data generation with TabPFGen

Detailed documentation for each extension is available in the respective module directories.

## ⚙️ Installation

```bash
# Clone and install the repository
# Clone and install the repository (Python 3.9+ compatible)
pip install "tabpfn-extensions[all] @ git+https://github.com/PriorLabs/tabpfn-extensions.git"

# Add TabPFGen Data Synthesizer (requires Python 3.10+)
pip install "tabpfn-extensions[all, tabpfgen_datasynthesizer] @ git+https://github.com/PriorLabs/tabpfn-extensions.git"
```


### 🔄 Backend Options

TabPFN Extensions works with two TabPFN implementations:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Basic Classification Example with TabPFGen Data Synthesizer

This example demonstrates how to use TabPFGen for synthetic data generation
in classification tasks, leveraging the actual TabPFGen package features.
"""

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Import TabPFN Extensions
from tabpfn_extensions.tabpfgen_datasynthesizer import TabPFNDataSynthesizer
from tabpfn_extensions.tabpfgen_datasynthesizer.utils import analyze_class_distribution


def main():
    """Run the basic classification example.

    Loads the breast cancer dataset, generates synthetic samples with
    TabPFGen, augments the training data with them, and reports quality
    metrics comparing the synthetic data to the original.
    """
    # Utility imports consolidated here instead of being scattered through
    # the function body; kept function-local so the example stays lazy.
    from tabpfn_extensions.tabpfgen_datasynthesizer.utils import (
        calculate_synthetic_quality_metrics,
        combine_datasets,
    )

    print("=== TabPFGen Classification Example ===\n")

    # Load the dataset a single time and read both the arrays and the
    # feature names from the same Bunch (the original loaded it twice).
    print("Loading breast cancer dataset...")
    dataset = load_breast_cancer()
    X, y = dataset.data, dataset.target
    feature_names = dataset.feature_names

    # Stratified split so class proportions survive the split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print(f"Training data: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Test data: {X_test.shape[0]} samples")

    # Analyze original distribution
    analyze_class_distribution(y_train, "Original Training Data")

    # Initialize TabPFGen synthesizer
    print("\nInitializing TabPFGen synthesizer...")
    synthesizer = TabPFNDataSynthesizer(
        n_sgld_steps=300,  # Reduced for faster demo
        device="auto",
    )

    # Generate synthetic data using TabPFGen's built-in methods
    print("\nGenerating synthetic classification data...")
    n_synthetic = 200
    X_synth, y_synth = synthesizer.generate_classification(
        X_train,
        y_train,
        n_samples=n_synthetic,
        balance_classes=True,  # This balances only the synthetic samples
        visualize=True,  # Use TabPFGen's built-in visualization
        feature_names=list(feature_names),
    )

    print(f"\nGenerated {len(X_synth)} synthetic samples")
    analyze_class_distribution(y_synth, "Synthetic Data")

    # Combine original and synthetic data
    X_augmented, y_augmented = combine_datasets(
        X_train, y_train, X_synth, y_synth, strategy="append"
    )

    analyze_class_distribution(y_augmented, "Augmented Training Data")

    print("\n✅ Synthetic data generation completed successfully!")

    # Calculate quality metrics
    print("\n" + "=" * 60)
    print("SYNTHETIC DATA QUALITY METRICS")
    print("=" * 60)

    quality_metrics = calculate_synthetic_quality_metrics(
        X_train, X_synth, y_train, y_synth
    )

    for metric, value in quality_metrics.items():
        print(f"{metric}: {value:.4f}")


if __name__ == "__main__":
    main()
113 changes: 113 additions & 0 deletions examples/tabpfgen_datasynthesizer/basic_regression_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Basic Regression Example with TabPFGen Data Synthesizer

This example demonstrates how to use TabPFGen for synthetic data generation
in regression tasks, using TabPFGen's built-in features.
"""

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Import TabPFN Extensions
from tabpfn_extensions.tabpfgen_datasynthesizer import TabPFNDataSynthesizer
from tabpfn_extensions.tabpfgen_datasynthesizer.utils import (
calculate_synthetic_quality_metrics,
)


def main():
    """Run the basic regression example.

    Loads the diabetes dataset, generates synthetic regression samples with
    TabPFGen, combines them with the original training data, and reports
    quality metrics plus a feature/target correlation comparison.
    """
    # Consolidated here rather than imported mid-function.
    from tabpfn_extensions.tabpfgen_datasynthesizer.utils import combine_datasets

    print("=== TabPFGen Regression Example ===\n")

    # Load the dataset once and reuse the same Bunch for both the arrays
    # and the feature names (the original called load_diabetes twice).
    print("Loading diabetes dataset...")
    dataset = load_diabetes()
    X, y = dataset.data, dataset.target
    feature_names = dataset.feature_names

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    print(f"Training data: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Test data: {X_test.shape[0]} samples")
    print(f"Target range: [{y_train.min():.1f}, {y_train.max():.1f}]")

    # Initialize TabPFGen synthesizer
    print("\nInitializing TabPFGen synthesizer...")
    synthesizer = TabPFNDataSynthesizer(
        n_sgld_steps=300,  # Good balance for regression
        device="auto",
    )

    # Generate synthetic regression data
    print("\nGenerating synthetic regression data...")
    n_synthetic = 150
    X_synth, y_synth = synthesizer.generate_regression(
        X_train,
        y_train,
        n_samples=n_synthetic,
        use_quantiles=True,  # Important for regression quality
        visualize=True,  # Use TabPFGen's built-in visualization
        feature_names=list(feature_names),
    )

    print(f"\nGenerated {len(X_synth)} synthetic samples")
    print(f"Synthetic target range: [{y_synth.min():.1f}, {y_synth.max():.1f}]")

    # Combine original and synthetic data
    X_augmented, y_augmented = combine_datasets(
        X_train, y_train, X_synth, y_synth, strategy="append"
    )

    print(f"Combined dataset: {len(X_augmented)} samples")
    print(f"Combined target range: [{y_augmented.min():.1f}, {y_augmented.max():.1f}]")

    # Calculate quality metrics
    print("\n" + "=" * 60)
    print("SYNTHETIC DATA QUALITY METRICS")
    print("=" * 60)

    quality_metrics = calculate_synthetic_quality_metrics(
        X_train, X_synth, y_train, y_synth
    )

    print("\nFeature quality metrics:")
    for metric, value in quality_metrics.items():
        print(f"{metric}: {value:.4f}")

    # Statistical comparison
    print("\nStatistical comparison:")
    print(f"Original data - Mean: {np.mean(X_train):.3f}, Std: {np.std(X_train):.3f}")
    print(f"Synthetic data - Mean: {np.mean(X_synth):.3f}, Std: {np.std(X_synth):.3f}")
    print("Target correlation preservation:")

    # Per-feature Pearson correlation with the target; np.corrcoef returns
    # the 2x2 correlation matrix and [0, 1] is the off-diagonal coefficient.
    n_features = X_train.shape[1]
    orig_target_corr = [
        np.corrcoef(X_train[:, i], y_train)[0, 1] for i in range(n_features)
    ]
    synth_target_corr = [
        np.corrcoef(X_synth[:, i], y_synth)[0, 1] for i in range(n_features)
    ]

    print(
        f"Average target correlation - Original: {np.mean(np.abs(orig_target_corr)):.3f}"
    )
    print(
        f"Average target correlation - Synthetic: {np.mean(np.abs(synth_target_corr)):.3f}"
    )

    # 1.0 means the synthetic data reproduces the original feature-target
    # correlations exactly; lower values mean larger average deviation.
    correlation_preservation = 1 - np.mean(
        np.abs(np.array(orig_target_corr) - np.array(synth_target_corr))
    )
    print(f"Correlation preservation score: {correlation_preservation:.3f}")

    print("\n✅ Synthetic regression data generation completed successfully!")


if __name__ == "__main__":
    main()
147 changes: 147 additions & 0 deletions examples/tabpfgen_datasynthesizer/class_balancing_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""Dataset Balancing Demo with TabPFGen's balance_dataset Method

This example demonstrates the new balance_dataset method in TabPFGen v0.1.3+
for automatically balancing imbalanced classification datasets.
"""


from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Import TabPFN Extensions
from tabpfn_extensions.tabpfgen_datasynthesizer import TabPFNDataSynthesizer

# Calculate quality metrics for both approaches
from tabpfn_extensions.tabpfgen_datasynthesizer.utils import (
analyze_class_distribution,
calculate_synthetic_quality_metrics,
)


def create_imbalanced_dataset():
    """Build a synthetic 3-class dataset with a heavily skewed class mix."""
    # Majority class holds 70% of samples, the smallest only 10%.
    class_weights = [0.7, 0.2, 0.1]
    features, labels = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        n_classes=3,
        weights=class_weights,
        random_state=42,
    )
    return features, labels


def main():
    """Run the dataset balancing demonstration.

    Creates an imbalanced 3-class dataset, balances it with TabPFGen's
    ``balance_dataset`` method (automatic and custom per-class targets),
    then compares distributions, sizes, and synthetic-data quality.
    """
    print("=== TabPFGen Dataset Balancing Demo ===\n")

    # Create imbalanced dataset
    print("Creating highly imbalanced dataset...")
    X, y = create_imbalanced_dataset()

    # Stratified split so the imbalance is preserved in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print(f"Training data: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Test data: {X_test.shape[0]} samples")

    # Analyze original imbalanced distribution
    original_analysis = analyze_class_distribution(
        y_train, "Original Imbalanced Training Data"
    )

    # Initialize TabPFGen synthesizer
    print("\nInitializing TabPFGen synthesizer...")
    synthesizer = TabPFNDataSynthesizer(
        n_sgld_steps=400,  # Good balance of quality and speed
        device="auto",
    )

    # Generic feature names, built once and reused for both balancing runs
    # (the original rebuilt this list inline for each call).
    feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]

    print("\n" + "=" * 70)
    print("AUTOMATIC BALANCING (to majority class size)")
    print("=" * 70)

    # Use TabPFGen's balance_dataset method - automatic balancing
    X_synth_auto, y_synth_auto, X_balanced_auto, y_balanced_auto = (
        synthesizer.balance_dataset(
            X_train,
            y_train,
            visualize=True,  # Use TabPFGen's built-in visualization
            feature_names=feature_names,
        )
    )

    balanced_analysis_auto = analyze_class_distribution(
        y_balanced_auto, "Auto-Balanced Dataset"
    )

    print("\n" + "=" * 70)
    print("CUSTOM TARGET BALANCING (1000 samples per class)")
    print("=" * 70)

    # Use TabPFGen's balance_dataset method - custom target
    X_synth_custom, y_synth_custom, X_balanced_custom, y_balanced_custom = (
        synthesizer.balance_dataset(
            X_train,
            y_train,
            target_per_class=1000,  # Custom target
            visualize=True,
            feature_names=feature_names,
        )
    )

    # Analyze once (the original accidentally made this exact call twice
    # in a row; the duplicate has been removed).
    balanced_analysis_custom = analyze_class_distribution(
        y_balanced_custom, "Custom-Balanced Dataset (target=1000)"
    )

    # Quality analysis
    print("\n" + "=" * 70)
    print("BALANCING EFFECTIVENESS SUMMARY")
    print("=" * 70)

    print(
        f"\nOriginal dataset imbalance ratio: {original_analysis['imbalance_ratio']:.1f}:1"
    )
    print(
        f"Auto-balanced imbalance ratio: {balanced_analysis_auto['imbalance_ratio']:.1f}:1"
    )
    print(
        f"Custom-balanced imbalance ratio: {balanced_analysis_custom['imbalance_ratio']:.1f}:1"
    )

    print("\nData size summary:")
    print(f"Original training: {len(X_train)} samples")
    print(
        f"Auto-balanced: {len(X_balanced_auto)} samples (+{len(X_synth_auto)} synthetic)"
    )
    print(
        f"Custom-balanced: {len(X_balanced_custom)} samples (+{len(X_synth_custom)} synthetic)"
    )

    print("\nSynthetic data quality metrics:")
    print("Auto-balanced approach:")
    quality_auto = calculate_synthetic_quality_metrics(
        X_train, X_synth_auto, y_train, y_synth_auto
    )
    for metric, value in quality_auto.items():
        print(f" {metric}: {value:.4f}")

    print("\nCustom-balanced approach:")
    quality_custom = calculate_synthetic_quality_metrics(
        X_train, X_synth_custom, y_train, y_synth_custom
    )
    for metric, value in quality_custom.items():
        print(f" {metric}: {value:.4f}")

    print("\n✅ Dataset balancing demo completed successfully!")


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ interpretability = [
"shapiq>=0.4.0",
"seaborn>=0.12.2",
]
tabpfgen_datasynthesizer = [
"tabpfgen>=0.1.4",
]

post_hoc_ensembles = [
"kditransform>=0.2.0",
"llvmlite",
Expand Down
Loading
Loading