Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,21 @@
- **rf_pfn**: Combine TabPFN with decision trees and random forests
- **unsupervised**: Data generation and outlier detection
- **embedding**: Get TabPFN's internal dense sample embeddings
- **tabpfgen_datasynthesizer**: Synthetic tabular data generation with TabPFGen

Detailed documentation for each extension is available in the respective module directories.

## ⚙️ Installation

```bash
# Clone and install the repository
# Clone and install the repository (Python 3.9+ compatible)
pip install "tabpfn-extensions[all] @ git+https://github.com/PriorLabs/tabpfn-extensions.git"

# Add TabPFGen Data Synthesizer (requires Python 3.10+)
pip install "tabpfn-extensions[all, tabpfgen_datasynthesizer] @ git+https://github.com/PriorLabs/tabpfn-extensions.git"
```


### 🔄 Backend Options

TabPFN Extensions works with two TabPFN implementations:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Basic Classification Example with TabPFGen Data Synthesizer

This example demonstrates how to use TabPFGen for synthetic data generation
in classification tasks, leveraging the actual TabPFGen package features.
"""

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Import TabPFN Extensions
from tabpfn_extensions.tabpfgen_datasynthesizer import TabPFNDataSynthesizer
from tabpfn_extensions.tabpfgen_datasynthesizer.utils import analyze_class_distribution


def main():
    """Run the basic classification example.

    Loads the breast cancer dataset, generates synthetic samples with
    TabPFGen, augments the training data with them, and reports quality
    metrics comparing the synthetic data to the original.
    """
    # Utility imports consolidated here instead of being scattered through
    # the function body; kept function-local so the example stays lazy.
    from tabpfn_extensions.tabpfgen_datasynthesizer.utils import (
        calculate_synthetic_quality_metrics,
        combine_datasets,
    )

    print("=== TabPFGen Classification Example ===\n")

    # Load the dataset a single time and read both the arrays and the
    # feature names from the same Bunch (the original loaded it twice).
    print("Loading breast cancer dataset...")
    dataset = load_breast_cancer()
    X, y = dataset.data, dataset.target
    feature_names = dataset.feature_names

    # Stratified split so class proportions survive the split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print(f"Training data: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Test data: {X_test.shape[0]} samples")

    # Analyze original distribution
    analyze_class_distribution(y_train, "Original Training Data")

    # Initialize TabPFGen synthesizer
    print("\nInitializing TabPFGen synthesizer...")
    synthesizer = TabPFNDataSynthesizer(
        n_sgld_steps=300,  # Reduced for faster demo
        device="auto",
    )

    # Generate synthetic data using TabPFGen's built-in methods
    print("\nGenerating synthetic classification data...")
    n_synthetic = 200
    X_synth, y_synth = synthesizer.generate_classification(
        X_train,
        y_train,
        n_samples=n_synthetic,
        balance_classes=True,  # This balances only the synthetic samples
        visualize=True,  # Use TabPFGen's built-in visualization
        feature_names=list(feature_names),
    )

    print(f"\nGenerated {len(X_synth)} synthetic samples")
    analyze_class_distribution(y_synth, "Synthetic Data")

    # Combine original and synthetic data
    X_augmented, y_augmented = combine_datasets(
        X_train, y_train, X_synth, y_synth, strategy="append"
    )

    analyze_class_distribution(y_augmented, "Augmented Training Data")

    print("\n✅ Synthetic data generation completed successfully!")

    # Calculate quality metrics
    print("\n" + "=" * 60)
    print("SYNTHETIC DATA QUALITY METRICS")
    print("=" * 60)

    quality_metrics = calculate_synthetic_quality_metrics(
        X_train, X_synth, y_train, y_synth
    )

    for metric, value in quality_metrics.items():
        print(f"{metric}: {value:.4f}")


if __name__ == "__main__":
    main()
113 changes: 113 additions & 0 deletions examples/tabpfgen_datasynthesizer/basic_regression_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Basic Regression Example with TabPFGen Data Synthesizer

This example demonstrates how to use TabPFGen for synthetic data generation
in regression tasks, using TabPFGen's built-in features.
"""

import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Import TabPFN Extensions
from tabpfn_extensions.tabpfgen_datasynthesizer import TabPFNDataSynthesizer
from tabpfn_extensions.tabpfgen_datasynthesizer.utils import (
calculate_synthetic_quality_metrics,
)


def main():
    """Run the basic regression example.

    Loads the diabetes dataset, generates synthetic regression samples with
    TabPFGen, combines them with the original training data, and reports
    quality metrics plus a feature/target correlation comparison.
    """
    # Consolidated here rather than imported mid-function.
    from tabpfn_extensions.tabpfgen_datasynthesizer.utils import combine_datasets

    print("=== TabPFGen Regression Example ===\n")

    # Load the dataset once and reuse the same Bunch for both the arrays
    # and the feature names (the original called load_diabetes twice).
    print("Loading diabetes dataset...")
    dataset = load_diabetes()
    X, y = dataset.data, dataset.target
    feature_names = dataset.feature_names

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    print(f"Training data: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Test data: {X_test.shape[0]} samples")
    print(f"Target range: [{y_train.min():.1f}, {y_train.max():.1f}]")

    # Initialize TabPFGen synthesizer
    print("\nInitializing TabPFGen synthesizer...")
    synthesizer = TabPFNDataSynthesizer(
        n_sgld_steps=300,  # Good balance for regression
        device="auto",
    )

    # Generate synthetic regression data
    print("\nGenerating synthetic regression data...")
    n_synthetic = 150
    X_synth, y_synth = synthesizer.generate_regression(
        X_train,
        y_train,
        n_samples=n_synthetic,
        use_quantiles=True,  # Important for regression quality
        visualize=True,  # Use TabPFGen's built-in visualization
        feature_names=list(feature_names),
    )

    print(f"\nGenerated {len(X_synth)} synthetic samples")
    print(f"Synthetic target range: [{y_synth.min():.1f}, {y_synth.max():.1f}]")

    # Combine original and synthetic data
    X_augmented, y_augmented = combine_datasets(
        X_train, y_train, X_synth, y_synth, strategy="append"
    )

    print(f"Combined dataset: {len(X_augmented)} samples")
    print(f"Combined target range: [{y_augmented.min():.1f}, {y_augmented.max():.1f}]")

    # Calculate quality metrics
    print("\n" + "=" * 60)
    print("SYNTHETIC DATA QUALITY METRICS")
    print("=" * 60)

    quality_metrics = calculate_synthetic_quality_metrics(
        X_train, X_synth, y_train, y_synth
    )

    print("\nFeature quality metrics:")
    for metric, value in quality_metrics.items():
        print(f"{metric}: {value:.4f}")

    # Statistical comparison
    print("\nStatistical comparison:")
    print(f"Original data - Mean: {np.mean(X_train):.3f}, Std: {np.std(X_train):.3f}")
    print(f"Synthetic data - Mean: {np.mean(X_synth):.3f}, Std: {np.std(X_synth):.3f}")
    print("Target correlation preservation:")

    # Per-feature Pearson correlation with the target; np.corrcoef returns
    # the 2x2 correlation matrix and [0, 1] is the off-diagonal coefficient.
    n_features = X_train.shape[1]
    orig_target_corr = [
        np.corrcoef(X_train[:, i], y_train)[0, 1] for i in range(n_features)
    ]
    synth_target_corr = [
        np.corrcoef(X_synth[:, i], y_synth)[0, 1] for i in range(n_features)
    ]

    print(
        f"Average target correlation - Original: {np.mean(np.abs(orig_target_corr)):.3f}"
    )
    print(
        f"Average target correlation - Synthetic: {np.mean(np.abs(synth_target_corr)):.3f}"
    )

    # 1.0 means the synthetic data reproduces the original feature-target
    # correlations exactly; lower values mean larger average deviation.
    correlation_preservation = 1 - np.mean(
        np.abs(np.array(orig_target_corr) - np.array(synth_target_corr))
    )
    print(f"Correlation preservation score: {correlation_preservation:.3f}")

    print("\n✅ Synthetic regression data generation completed successfully!")


if __name__ == "__main__":
    main()
147 changes: 147 additions & 0 deletions examples/tabpfgen_datasynthesizer/class_balancing_demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""Dataset Balancing Demo with TabPFGen's balance_dataset Method

This example demonstrates the new balance_dataset method in TabPFGen v0.1.3+
for automatically balancing imbalanced classification datasets.
"""


from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Import TabPFN Extensions
from tabpfn_extensions.tabpfgen_datasynthesizer import TabPFNDataSynthesizer

# Calculate quality metrics for both approaches
from tabpfn_extensions.tabpfgen_datasynthesizer.utils import (
analyze_class_distribution,
calculate_synthetic_quality_metrics,
)


def create_imbalanced_dataset():
    """Build a synthetic 3-class dataset with a heavily skewed class mix."""
    # Majority class holds 70% of samples, the smallest only 10%.
    class_weights = [0.7, 0.2, 0.1]
    features, labels = make_classification(
        n_samples=1000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        n_classes=3,
        weights=class_weights,
        random_state=42,
    )
    return features, labels


def main():
    """Run the dataset balancing demonstration.

    Creates an imbalanced 3-class dataset, balances it with TabPFGen's
    ``balance_dataset`` method (automatic and custom per-class targets),
    then compares distributions, sizes, and synthetic-data quality.
    """
    print("=== TabPFGen Dataset Balancing Demo ===\n")

    # Create imbalanced dataset
    print("Creating highly imbalanced dataset...")
    X, y = create_imbalanced_dataset()

    # Stratified split so the imbalance is preserved in both splits
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print(f"Training data: {X_train.shape[0]} samples, {X_train.shape[1]} features")
    print(f"Test data: {X_test.shape[0]} samples")

    # Analyze original imbalanced distribution
    original_analysis = analyze_class_distribution(
        y_train, "Original Imbalanced Training Data"
    )

    # Initialize TabPFGen synthesizer
    print("\nInitializing TabPFGen synthesizer...")
    synthesizer = TabPFNDataSynthesizer(
        n_sgld_steps=400,  # Good balance of quality and speed
        device="auto",
    )

    # Generic feature names, built once and reused for both balancing runs
    # (the original rebuilt this list inline for each call).
    feature_names = [f"feature_{i}" for i in range(X_train.shape[1])]

    print("\n" + "=" * 70)
    print("AUTOMATIC BALANCING (to majority class size)")
    print("=" * 70)

    # Use TabPFGen's balance_dataset method - automatic balancing
    X_synth_auto, y_synth_auto, X_balanced_auto, y_balanced_auto = (
        synthesizer.balance_dataset(
            X_train,
            y_train,
            visualize=True,  # Use TabPFGen's built-in visualization
            feature_names=feature_names,
        )
    )

    balanced_analysis_auto = analyze_class_distribution(
        y_balanced_auto, "Auto-Balanced Dataset"
    )

    print("\n" + "=" * 70)
    print("CUSTOM TARGET BALANCING (1000 samples per class)")
    print("=" * 70)

    # Use TabPFGen's balance_dataset method - custom target
    X_synth_custom, y_synth_custom, X_balanced_custom, y_balanced_custom = (
        synthesizer.balance_dataset(
            X_train,
            y_train,
            target_per_class=1000,  # Custom target
            visualize=True,
            feature_names=feature_names,
        )
    )

    # Analyze once (the original accidentally made this exact call twice
    # in a row; the duplicate has been removed).
    balanced_analysis_custom = analyze_class_distribution(
        y_balanced_custom, "Custom-Balanced Dataset (target=1000)"
    )

    # Quality analysis
    print("\n" + "=" * 70)
    print("BALANCING EFFECTIVENESS SUMMARY")
    print("=" * 70)

    print(
        f"\nOriginal dataset imbalance ratio: {original_analysis['imbalance_ratio']:.1f}:1"
    )
    print(
        f"Auto-balanced imbalance ratio: {balanced_analysis_auto['imbalance_ratio']:.1f}:1"
    )
    print(
        f"Custom-balanced imbalance ratio: {balanced_analysis_custom['imbalance_ratio']:.1f}:1"
    )

    print("\nData size summary:")
    print(f"Original training: {len(X_train)} samples")
    print(
        f"Auto-balanced: {len(X_balanced_auto)} samples (+{len(X_synth_auto)} synthetic)"
    )
    print(
        f"Custom-balanced: {len(X_balanced_custom)} samples (+{len(X_synth_custom)} synthetic)"
    )

    print("\nSynthetic data quality metrics:")
    print("Auto-balanced approach:")
    quality_auto = calculate_synthetic_quality_metrics(
        X_train, X_synth_auto, y_train, y_synth_auto
    )
    for metric, value in quality_auto.items():
        print(f" {metric}: {value:.4f}")

    print("\nCustom-balanced approach:")
    quality_custom = calculate_synthetic_quality_metrics(
        X_train, X_synth_custom, y_train, y_synth_custom
    )
    for metric, value in quality_custom.items():
        print(f" {metric}: {value:.4f}")

    print("\n✅ Dataset balancing demo completed successfully!")


if __name__ == "__main__":
    main()
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ interpretability = [
"shapiq>=0.4.0",
"seaborn>=0.12.2",
]
tabpfgen_datasynthesizer = [
"tabpfgen>=0.1.4",
]

post_hoc_ensembles = [
"kditransform>=0.2.0",
"llvmlite",
Expand Down
Loading
Loading