
🎯 Model Training

Complete Guide to Training Your Logistic Regression Model

Training Optimization


🗺️ Training Pipeline Overview

graph LR
    A[📥 Load Data] --> B[🔍 Explore Data]
    B --> C[🧹 Preprocess]
    C --> D[✂️ Split Data]
    D --> E[📏 Scale Features]
    E --> F[🎯 Train Model]
    F --> G[📊 Validate]
    G --> H{Good Performance?}
    H -->|No| I[⚙️ Tune Hyperparameters]
    I --> F
    H -->|Yes| J[🧪 Test Model]
    J --> K[🎉 Deploy]
    
    style A fill:#e1f5ff
    style C fill:#ffe1e1
    style F fill:#e1ffe1
    style H fill:#fff4e1
    style K fill:#ffd700

1️⃣ Data Preparation

Load Dataset

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load data
data = load_breast_cancer()
X = data.data
y = data.target

print(f"📊 Dataset Shape: {X. shape}")
print(f"📈 Positive samples: {np.sum(y == 1)}")
print(f"📉 Negative samples: {np.sum(y == 0)}")

Exploratory Data Analysis (EDA)

import matplotlib.pyplot as plt
import seaborn as sns

def explore_data(X, y, feature_names):
    """
    Perform basic EDA
    
    Parameters:
    -----------
    X : features array
    y : labels array
    feature_names : list of feature names
    """
    # Create DataFrame
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y
    
    print("=" * 50)
    print("📊 DATASET OVERVIEW")
    print("=" * 50)
    print(f"\nShape: {df.shape}")
    print(f"\nClass Distribution:")
    print(df['target'].value_counts())
    print(f"\nClass Balance:  {df['target'].value_counts(normalize=True)}")
    
    print("\n" + "=" * 50)
    print("📈 STATISTICAL SUMMARY")
    print("=" * 50)
    print(df.describe())
    
    print("\n" + "=" * 50)
    print("❓ MISSING VALUES")
    print("=" * 50)
    print(df.isnull().sum())
    
    # Visualize class distribution
    plt.figure(figsize=(10, 5))
    
    plt.subplot(1, 2, 1)
    df['target'].value_counts().plot(kind='bar', color=['red', 'blue'])
    plt.title('Class Distribution', fontweight='bold')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    
    plt.subplot(1, 2, 2)
    df['target'].value_counts().plot(kind='pie', autopct='%1.1f%%', 
                                      colors=['red', 'blue'])
    plt.title('Class Proportion', fontweight='bold')
    plt.ylabel('')
    
    plt.tight_layout()
    plt.show()

# Usage
explore_data(X, y, data.feature_names)

2️⃣ Data Splitting

Train-Test Split

from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,      # 80% train, 20% test
    random_state=42,     # Reproducibility
    stratify=y           # Maintain class balance
)

print(f"✅ Training set: {X_train.shape}")
print(f"✅ Test set: {X_test. shape}")
print(f"✅ Train labels: {y_train.shape}")
print(f"✅ Test labels: {y_test. shape}")

Train-Validation-Test Split

from sklearn.model_selection import train_test_split

# First split: separate test set
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second split: separate train and validation
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)  # 0.25 * 0.8 = 0.2 (20% validation)

print("📊 Data Split Summary:")
print(f"   Training:    {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"   Validation:  {X_val.shape[0]} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f"   Test:       {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")

Visualization

Original Dataset (100%)
├── Training Set (60%)
├── Validation Set (20%)
└── Test Set (20%)

3️⃣ Feature Scaling

Why Scale?

| Feature | Without Scaling | With Scaling |
|:--------|:-----------------|:-------------|
| Age | 20 - 80 | 0 - 1 |
| Income | 20,000 - 200,000 | 0 - 1 |
| Problem | Income dominates! | Equal importance |
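
To see this disparity on real data, a quick check of the raw feature ranges makes the point (a small sketch reusing the breast-cancer `X` loaded earlier):

# Compare raw per-feature ranges before any scaling
ranges = X.max(axis=0) - X.min(axis=0)
print(f"Smallest feature range: {ranges.min():.4f}")
print(f"Largest feature range:  {ranges.max():.1f}")
print(f"Ratio: {ranges.max() / ranges.min():,.0f}x")

Without scaling, gradient descent steps are dominated by the widest-range features, which slows or destabilizes convergence.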

Standardization (Z-score Normalization)

from sklearn.preprocessing import StandardScaler

# Create scaler
scaler = StandardScaler()

# Fit on training data only! 
scaler.fit(X_train)

# Transform both train and test
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Features scaled using StandardScaler")
print(f"   Mean: {X_train_scaled.mean():.4f}")
print(f"   Std:   {X_train_scaled. std():.4f}")

Formula:

$$X_{scaled} = \frac{X - \mu}{\sigma}$$
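
As a sanity check, the formula can be verified against StandardScaler directly (a minimal sketch reusing X_train and X_train_scaled from above; StandardScaler uses the population standard deviation, which matches NumPy's default):

# Verify StandardScaler output against the z-score formula
manual = (X_train - X_train.mean(axis=0)) / X_train.std(axis=0)
print(np.allclose(manual, X_train_scaled))  # True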

Min-Max Normalization

from sklearn.preprocessing import MinMaxScaler

# Create scaler
scaler = MinMaxScaler()

# Fit and transform
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✅ Features scaled to [0, 1] range")

Formula:

$$X_{scaled} = \frac{X - X_{min}}{X_{max} - X_{min}}$$
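
The same kind of check works here (a sketch reusing X_train and the MinMaxScaler output from just above):

# Verify MinMaxScaler output against the min-max formula
col_min = X_train.min(axis=0)
col_max = X_train.max(axis=0)
manual = (X_train - col_min) / (col_max - col_min)
print(np.allclose(manual, X_train_scaled))  # True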

⚠️ Important Rules

✅ Do

  • Fit scaler on training data only (see the Pipeline sketch after these lists)
  • Apply same transformation to test data
  • Scale before training
  • Use consistent scaling method

❌ Don't

  • Fit scaler on test data
  • Forget to scale test data
  • Mix scaling methods
  • Scale target variable (for classification)
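
One way to make these rules hard to break is scikit-learn's Pipeline, which fits the scaler only during fit and reuses the stored training statistics at predict time. A minimal sketch (the step names are arbitrary):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as SklearnLR

# The scaler inside the pipeline is fit on training data only
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', SklearnLR(max_iter=1000))
])
pipe.fit(X_train, y_train)          # fits scaler and model on the training set
print(pipe.score(X_test, y_test))   # test data is transformed, never fit

This also prevents leakage inside cross-validation, since each fold refits the scaler on that fold's training portion.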

4️⃣ Training the Model

Basic Training

from logistic_regression import LogisticRegression

# Create model
model = LogisticRegression(
    learning_rate=0.01,
    n_iterations=1000,
    verbose=True
)

# Train model
print("🚀 Starting training...")
model.fit(X_train_scaled, y_train)
print("✅ Training complete!")

# Plot learning curve
model.plot_cost_history()

Training with Early Stopping

class LogisticRegressionWithEarlyStopping(LogisticRegression):
    """Enhanced version with early stopping"""
    
    def fit(self, X, y, X_val=None, y_val=None, patience=10):
        """
        Train with early stopping
        
        Parameters: 
        -----------
        X, y : training data
        X_val, y_val : validation data
        patience : int, stop if no improvement for this many iterations
        """
        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0
        self.cost_history = []  # reset history on each call to fit
        
        best_val_loss = float('inf')
        patience_counter = 0
        # Initialize the "best" snapshot so restoring it is always safe
        best_weights = self.weights.copy()
        best_bias = self.bias
        
        for i in range(self.n_iterations):
            # Forward pass
            z = np.dot(X, self.weights) + self.bias
            predictions = self._sigmoid(z)
            
            # Compute training cost
            train_cost = self._compute_cost(y, predictions)
            self.cost_history.append(train_cost)
            
            # Compute validation cost
            if X_val is not None: 
                val_predictions = self._sigmoid(np.dot(X_val, self.weights) + self.bias)
                val_cost = self._compute_cost(y_val, val_predictions)
                
                # Early stopping check
                if val_cost < best_val_loss:
                    best_val_loss = val_cost
                    patience_counter = 0
                    # Save best weights
                    best_weights = self.weights.copy()
                    best_bias = self.bias
                else:
                    patience_counter += 1
                
                if patience_counter >= patience:
                    print(f"⏹️ Early stopping at iteration {i}")
                    self.weights = best_weights
                    self.bias = best_bias
                    break
            
            # Backward pass
            dz = predictions - y
            dw = (1/m) * np.dot(X.T, dz)
            db = (1/m) * np.sum(dz)
            
            # Update
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            
            if self.verbose and i % 100 == 0:
                msg = f"Iteration {i}: Train Cost = {train_cost:.4f}"
                if X_val is not None:
                    msg += f", Val Cost = {val_cost:.4f}"
                print(msg)
        
        return self

# Usage (scale the validation set with the scaler fitted on the training data)
X_val_scaled = scaler.transform(X_val)
model = LogisticRegressionWithEarlyStopping(learning_rate=0.01, n_iterations=5000, verbose=True)
model.fit(X_train_scaled, y_train, X_val_scaled, y_val, patience=50)

5️⃣ Hyperparameter Tuning

Grid Search

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression as SklearnLR

# Define parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'max_iter': [100, 500, 1000, 2000],
    'solver': ['liblinear', 'lbfgs']
}

# Create grid search
grid_search = GridSearchCV(
    SklearnLR(),
    param_grid,
    cv=5,                    # 5-fold cross-validation
    scoring='accuracy',
    verbose=1,
    n_jobs=-1                # Use all CPU cores
)

# Fit grid search
print("🔍 Starting grid search...")
grid_search.fit(X_train_scaled, y_train)

# Best parameters
print("\n🏆 Best Parameters:")
print(grid_search.best_params_)
print(f"\n📊 Best CV Score: {grid_search.best_score_:.4f}")

Manual Hyperparameter Tuning

def tune_hyperparameters(X_train, y_train, X_val, y_val):
    """
    Manual hyperparameter tuning
    
    Returns best learning rate and iterations
    """
    learning_rates = [0.001, 0.01, 0.1, 0.5]
    iterations_list = [500, 1000, 2000, 5000]
    
    best_score = 0
    best_params = {}
    results = []
    
    for lr in learning_rates:
        for iters in iterations_list:
            # Train model
            model = LogisticRegression(learning_rate=lr, n_iterations=iters)
            model.fit(X_train, y_train)
            
            # Evaluate on validation set
            val_score = model.score(X_val, y_val)
            
            results.append({
                'learning_rate': lr,
                'iterations': iters,
                'val_accuracy': val_score
            })
            
            if val_score > best_score:
                best_score = val_score
                best_params = {'learning_rate': lr, 'iterations': iters}
            
            print(f"LR={lr}, Iters={iters}: Val Accuracy={val_score:.4f}")
    
    print(f"\n🏆 Best Parameters: {best_params}")
    print(f"📊 Best Validation Accuracy: {best_score:.4f}")
    
    # Convert to DataFrame for visualization
    results_df = pd.DataFrame(results)
    
    # Pivot for heatmap
    pivot = results_df.pivot(index='learning_rate', 
                             columns='iterations', 
                             values='val_accuracy')
    
    # Plot heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlGnBu', 
                cbar_kws={'label': 'Validation Accuracy'})
    plt.title('🔍 Hyperparameter Tuning Results', fontweight='bold', pad=20)
    plt.xlabel('Number of Iterations', fontweight='bold')
    plt.ylabel('Learning Rate', fontweight='bold')
    plt.tight_layout()
    plt.show()
    
    return best_params, results_df

# Usage
best_params, results = tune_hyperparameters(X_train_scaled, y_train, X_val_scaled, y_val)

6️⃣ Cross-Validation

K-Fold Cross-Validation

from sklearn.model_selection import cross_val_score, KFold

def perform_cross_validation(X, y, k=5):
    """
    Perform k-fold cross-validation
    
    Parameters: 
    -----------
    X : features
    y : labels
    k : number of folds
    """
    # Create KFold object
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)
    
    scores = []
    fold_num = 1
    
    for train_idx, val_idx in kfold.split(X):
        # Split data
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]
        
        # Scale data
        scaler = StandardScaler()
        X_train_fold = scaler.fit_transform(X_train_fold)
        X_val_fold = scaler.transform(X_val_fold)
        
        # Train model
        model = LogisticRegression(learning_rate=0.01, n_iterations=1000)
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate
        score = model.score(X_val_fold, y_val_fold)
        scores.append(score)
        
        print(f"Fold {fold_num}:  Accuracy = {score:.4f}")
        fold_num += 1
    
    print("\n" + "=" * 50)
    print(f"📊 Cross-Validation Results ({k}-Fold)")
    print("=" * 50)
    print(f"Mean Accuracy: {np.mean(scores):.4f}{np.std(scores):.4f})")
    print(f"Min Accuracy:   {np.min(scores):.4f}")
    print(f"Max Accuracy:  {np.max(scores):.4f}")
    
    # Visualize
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, k+1), scores, color='skyblue', edgecolor='black')
    plt.axhline(y=np.mean(scores), color='red', linestyle='--', 
                linewidth=2, label=f'Mean: {np.mean(scores):.4f}')
    plt.xlabel('Fold Number', fontweight='bold')
    plt.ylabel('Accuracy', fontweight='bold')
    plt.title(f'📊 {k}-Fold Cross-Validation Results', fontweight='bold', pad=20)
    plt.legend()
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()
    
    return scores

# Usage
cv_scores = perform_cross_validation(X, y, k=5)

Stratified K-Fold

from sklearn.model_selection import StratifiedKFold

# Better for imbalanced datasets
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Ensures each fold has same class distribution
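
The snippet above only creates the splitter; one way to use it is to pass it straight to cross_val_score (a sketch using the scikit-learn model inside a Pipeline so scaling happens within each fold):

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression as SklearnLR

pipe = Pipeline([('scaler', StandardScaler()), ('clf', SklearnLR(max_iter=1000))])
scores = cross_val_score(pipe, X, y, cv=skfold, scoring='accuracy')
print(f"Stratified CV accuracy: {scores.mean():.4f} (± {scores.std():.4f})")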

7️⃣ Training Monitoring

Real-Time Training Visualization

import matplotlib.pyplot as plt
from IPython.display import clear_output

class LogisticRegressionWithLiveplot(LogisticRegression):
    """Model with live training visualization"""
    
    def fit(self, X, y, plot_interval=50):
        """
        Train with live plotting
        
        Parameters: 
        -----------
        X, y : training data
        plot_interval : update plot every N iterations
        """
        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0
        self.cost_history = []  # reset history on each call to fit
        
        plt.ion()  # Interactive mode
        fig, ax = plt.subplots(figsize=(10, 6))
        
        for i in range(self.n_iterations):
            # Training step
            z = np.dot(X, self.weights) + self.bias
            predictions = self._sigmoid(z)
            cost = self._compute_cost(y, predictions)
            self.cost_history.append(cost)
            
            # Gradients and update
            dz = predictions - y
            dw = (1/m) * np.dot(X.T, dz)
            db = (1/m) * np.sum(dz)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            
            # Update plot
            if i % plot_interval == 0:
                clear_output(wait=True)
                ax.clear()
                ax.plot(self.cost_history, linewidth=2, color='blue')
                ax.set_xlabel('Iterations', fontweight='bold')
                ax.set_ylabel('Cost', fontweight='bold')
                ax.set_title(f'🔄 Training Progress (Iteration {i}/{self.n_iterations})',
                             fontweight='bold')
                ax.grid(True, alpha=0.3)
                plt.pause(0.01)
        
        plt.ioff()
        plt.show()
        
        return self

# Usage
model = LogisticRegressionWithLiveplot(learning_rate=0.01, n_iterations=1000)
model.fit(X_train_scaled, y_train, plot_interval=50)

8️⃣ Model Evaluation

Complete Evaluation Function

from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, classification_report, confusion_matrix)

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Comprehensive model evaluation
    
    Parameters: 
    -----------
    model : trained model
    X_train, y_train : training data
    X_test, y_test : test data
    """
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    print("=" * 60)
    print("📊 MODEL EVALUATION REPORT")
    print("=" * 60)
    
    # Training metrics
    print("\n🎓 TRAINING SET PERFORMANCE:")
    print(f"   Accuracy:   {accuracy_score(y_train, y_train_pred):.4f}")
    print(f"   Precision: {precision_score(y_train, y_train_pred):.4f}")
    print(f"   Recall:    {recall_score(y_train, y_train_pred):.4f}")
    print(f"   F1-Score:  {f1_score(y_train, y_train_pred):.4f}")
    
    # Test metrics
    print("\n🧪 TEST SET PERFORMANCE:")
    print(f"   Accuracy:  {accuracy_score(y_test, y_test_pred):.4f}")
    print(f"   Precision:  {precision_score(y_test, y_test_pred):.4f}")
    print(f"   Recall:    {recall_score(y_test, y_test_pred):.4f}")
    print(f"   F1-Score:  {f1_score(y_test, y_test_pred):.4f}")
    
    # Overfitting check
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    gap = train_acc - test_acc
    
    print("\n🔍 OVERFITTING ANALYSIS:")
    print(f"   Train Accuracy: {train_acc:. 4f}")
    print(f"   Test Accuracy:  {test_acc:.4f}")
    print(f"   Gap:             {gap:.4f}")
    
    if gap < 0.05:
        print("   Status: ✅ Good generalization")
    elif gap < 0.10:
        print("   Status:  ⚠️ Slight overfitting")
    else:
        print("   Status: ❌ Significant overfitting")
    
    # Detailed classification report
    print("\n📋 DETAILED CLASSIFICATION REPORT:")
    print(classification_report(y_test, y_test_pred, 
                               target_names=['Class 0', 'Class 1']))
    
    # Confusion matrix
    print("\n🎯 CONFUSION MATRIX:")
    cm = confusion_matrix(y_test, y_test_pred)
    print(cm)
    
    return {
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'precision': precision_score(y_test, y_test_pred),
        'recall': recall_score(y_test, y_test_pred),
        'f1': f1_score(y_test, y_test_pred)
    }

# Usage
metrics = evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test)

9️⃣ Saving and Loading Models

Save Model

import pickle

def save_model(model, scaler, filename='logistic_model.pkl'):
    """
    Save trained model and scaler
    
    Parameters: 
    -----------
    model : trained model
    scaler : fitted scaler
    filename : str, filename to save
    """
    model_data = {
        'model': model,
        'scaler': scaler,
        'weights': model.weights,
        'bias': model.bias,
        'cost_history': model.cost_history
    }
    
    with open(filename, 'wb') as f:
        pickle.dump(model_data, f)
    
    print(f"✅ Model saved to {filename}")

# Usage
save_model(model, scaler, 'my_logistic_model.pkl')

Load Model

def load_model(filename='logistic_model.pkl'):
    """
    Load saved model
    
    Parameters:
    -----------
    filename : str, filename to load
    
    Returns: 
    --------
    model, scaler : loaded objects
    """
    with open(filename, 'rb') as f:
        model_data = pickle.load(f)
    
    print(f"✅ Model loaded from {filename}")
    
    return model_data['model'], model_data['scaler']

# Usage
loaded_model, loaded_scaler = load_model('my_logistic_model.pkl')

# Make predictions (X_new_data stands in for new samples with the same feature columns)
X_new = loaded_scaler.transform(X_new_data)
predictions = loaded_model.predict(X_new)

🎯 Training Checklist

  • ✅ Load and explore data
  • ✅ Handle missing values
  • ✅ Split data (train/val/test)
  • ✅ Scale features
  • ✅ Train initial model
  • ✅ Monitor training (cost curve)
  • ✅ Tune hyperparameters
  • ✅ Perform cross-validation
  • ✅ Evaluate on test set
  • ✅ Check for overfitting
  • ✅ Save best model
  • ✅ Document results

💡 Best Practices

✅ Do

  • Always scale features
  • Use stratified splits for imbalanced data
  • Monitor both train and validation metrics
  • Save multiple model checkpoints
  • Use cross-validation
  • Document hyperparameters
  • Check for data leakage

❌ Don't

  • Fit scaler on test data
  • Ignore class imbalance
  • Only look at training accuracy
  • Overtrain without validation
  • Skip feature scaling
  • Forget to set random seeds
  • Test on training data
