-
Notifications
You must be signed in to change notification settings - Fork 0
Model Training
SRIJA DE CHOWDHURY edited this page Jan 4, 2026
·
1 revision
graph LR
A[📥 Load Data] --> B[🔍 Explore Data]
B --> C[🧹 Preprocess]
C --> D[✂️ Split Data]
D --> E[📏 Scale Features]
E --> F[🎯 Train Model]
F --> G[📊 Validate]
G --> H{Good Performance?}
H -->|No| I[⚙️ Tune Hyperparameters]
I --> F
H -->|Yes| J[🧪 Test Model]
J --> K[🎉 Deploy]
style A fill:#e1f5ff
style C fill:#ffe1e1
style F fill:#e1ffe1
style H fill:#fff4e1
style K fill:#ffd700
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset (binary classification: 1 = benign, 0 = malignant)
data = load_breast_cancer()
X = data.data
y = data.target
print(f"📊 Dataset Shape: {X.shape}")
print(f"📈 Positive samples: {np.sum(y == 1)}")
print(f"📉 Negative samples: {np.sum(y == 0)}")
import matplotlib.pyplot as plt
import seaborn as sns


def explore_data(X, y, feature_names):
    """
    Perform basic EDA.

    Prints a dataset overview (shape, class balance), summary statistics
    and missing-value counts, then plots the class distribution as a bar
    chart and a pie chart.

    Parameters:
    -----------
    X : features array
    y : labels array
    feature_names : list of feature names
    """
    # Assemble one DataFrame so pandas summaries cover features + target
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y
    print("=" * 50)
    print("📊 DATASET OVERVIEW")
    print("=" * 50)
    print(f"\nShape: {df.shape}")
    print(f"\nClass Distribution:")
    print(df['target'].value_counts())
    print(f"\nClass Balance: {df['target'].value_counts(normalize=True)}")
    print("\n" + "=" * 50)
    print("📈 STATISTICAL SUMMARY")
    print("=" * 50)
    print(df.describe())
    print("\n" + "=" * 50)
    print("❓ MISSING VALUES")
    print("=" * 50)
    print(df.isnull().sum())
    # Visualize class distribution: counts (bar) and proportions (pie)
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    df['target'].value_counts().plot(kind='bar', color=['red', 'blue'])
    plt.title('Class Distribution', fontweight='bold')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.xticks(rotation=0)
    plt.subplot(1, 2, 2)
    df['target'].value_counts().plot(kind='pie', autopct='%1.1f%%',
                                     colors=['red', 'blue'])
    plt.title('Class Proportion', fontweight='bold')
    plt.ylabel('')
    plt.tight_layout()
    plt.show()


# Usage
explore_data(X, y, data.feature_names)
from sklearn.model_selection import train_test_split
# Split data
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2, # 80% train, 20% test
random_state=42, # Reproducibility
stratify=y # Maintain class balance
)
print(f"✅ Training set: {X_train.shape}")
print(f"✅ Test set: {X_test. shape}")
print(f"✅ Train labels: {y_train.shape}")
print(f"✅ Test labels: {y_test. shape}")from sklearn.model_selection import train_test_split
# First split: separate test set (20% of the full dataset, stratified)
X_temp, X_test, y_temp, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
# Second split: separate train and validation from the remaining 80%
X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
) # 0.25 * 0.8 = 0.2 (20% validation)
# Report each partition as an absolute count and a percentage of the whole
print("📊 Data Split Summary:")
print(f" Training: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f" Validation: {X_val.shape[0]} samples ({X_val.shape[0]/len(X)*100:.1f}%)")
print(f" Test: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)")Original Dataset (100%)
├── Training Set (60%)
├── Validation Set (20%)
└── Test Set (20%)
| Feature | Without Scaling | With Scaling |
|:--------|:----------------|:-------------|
| Age | 20 - 80 | 0 - 1 |
| Income | 20,000 - 200,000 | 0 - 1 |
| Problem | Income dominates! | Equal importance |
from sklearn.preprocessing import StandardScaler
# Create scaler (zero mean, unit variance per feature)
scaler = StandardScaler()
# Fit on training data only! (avoids leaking test-set statistics)
scaler.fit(X_train)
# Transform both train and test using the training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Features scaled using StandardScaler")
print(f" Mean: {X_train_scaled.mean():.4f}")
print(f" Std: {X_train_scaled. std():.4f}")Formula:
from sklearn.preprocessing import MinMaxScaler
# Create scaler (rescales each feature into the [0, 1] interval)
scaler = MinMaxScaler()
# Fit and transform on train; reuse the fitted ranges on test (no leakage)
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("✅ Features scaled to [0, 1] range")Formula:
|
|
from logistic_regression import LogisticRegression

# Instantiate the custom model with the chosen hyperparameters
model = LogisticRegression(learning_rate=0.01, n_iterations=1000, verbose=True)

# Fit on the scaled training data
print("🚀 Starting training...")
model.fit(X_train_scaled, y_train)
print("✅ Training complete!")
model.plot_cost_history()


class LogisticRegressionWithEarlyStopping(LogisticRegression):
    """Enhanced version with early stopping on validation loss."""

    def fit(self, X, y, X_val=None, y_val=None, patience=10):
        """
        Train with optional early stopping.

        Parameters:
        -----------
        X, y : training data
        X_val, y_val : validation data (early stopping is disabled when None)
        patience : int, stop if no improvement for this many iterations
        """
        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0
        best_val_loss = float('inf')
        patience_counter = 0
        # Best-so-far snapshot; only populated when validation data is given
        best_weights = None
        best_bias = None
        val_cost = None
        for i in range(self.n_iterations):
            # Forward pass
            z = np.dot(X, self.weights) + self.bias
            predictions = self._sigmoid(z)
            # Compute training cost
            train_cost = self._compute_cost(y, predictions)
            self.cost_history.append(train_cost)
            # Compute validation cost and run the early-stopping check
            if X_val is not None:
                val_predictions = self._sigmoid(np.dot(X_val, self.weights) + self.bias)
                val_cost = self._compute_cost(y_val, val_predictions)
                if val_cost < best_val_loss:
                    best_val_loss = val_cost
                    patience_counter = 0
                    # Save best weights so we can restore them on stop
                    best_weights = self.weights.copy()
                    best_bias = self.bias
                else:
                    patience_counter += 1
                    if patience_counter >= patience:
                        print(f"⏹️ Early stopping at iteration {i}")
                        self.weights = best_weights
                        self.bias = best_bias
                        break
            # Backward pass (gradient of binary cross-entropy)
            dz = predictions - y
            dw = (1/m) * np.dot(X.T, dz)
            db = (1/m) * np.sum(dz)
            # Update
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            if self.verbose and i % 100 == 0:
                # Bug fix: original referenced val_cost unconditionally,
                # raising NameError when no validation data was supplied.
                if val_cost is not None:
                    print(f"Iteration {i}: Train Cost = {train_cost:.4f}, Val Cost = {val_cost:.4f}")
                else:
                    print(f"Iteration {i}: Train Cost = {train_cost:.4f}")
        return self
# Usage
model = LogisticRegressionWithEarlyStopping(learning_rate=0.01, n_iterations=5000, verbose=True)
model.fit(X_train_scaled, y_train, X_val_scaled, y_val, patience=50)
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression as SklearnLR

# Define parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength
    'max_iter': [100, 500, 1000, 2000],
    'solver': ['liblinear', 'lbfgs']
}
# Create grid search
grid_search = GridSearchCV(
    SklearnLR(),
    param_grid,
    cv=5,                # 5-fold cross-validation
    scoring='accuracy',
    verbose=1,
    n_jobs=-1            # Use all CPU cores
)
# Fit grid search
print("🔍 Starting grid search...")
grid_search.fit(X_train_scaled, y_train)
# Best parameters
print("\n🏆 Best Parameters:")
print(grid_search.best_params_)
print(f"\n📊 Best CV Score: {grid_search.best_score_:.4f}")


def tune_hyperparameters(X_train, y_train, X_val, y_val):
    """
    Manual hyperparameter tuning over learning rate and iteration count.

    Trains one model per (learning_rate, n_iterations) pair, scores it on
    the validation set, prints each result, and plots a heatmap of
    validation accuracy.

    Returns:
    --------
    best_params : dict with 'learning_rate' and 'iterations'
    results_df : DataFrame with one row per tried combination
    """
    learning_rates = [0.001, 0.01, 0.1, 0.5]
    iterations_list = [500, 1000, 2000, 5000]
    best_score = 0
    best_params = {}
    results = []
    for lr in learning_rates:
        for iters in iterations_list:
            # Train model
            model = LogisticRegression(learning_rate=lr, n_iterations=iters)
            model.fit(X_train, y_train)
            # Evaluate on validation set
            val_score = model.score(X_val, y_val)
            results.append({
                'learning_rate': lr,
                'iterations': iters,
                'val_accuracy': val_score
            })
            if val_score > best_score:
                best_score = val_score
                best_params = {'learning_rate': lr, 'iterations': iters}
            print(f"LR={lr}, Iters={iters}: Val Accuracy={val_score:.4f}")
    print(f"\n🏆 Best Parameters: {best_params}")
    print(f"📊 Best Validation Accuracy: {best_score:.4f}")
    # Convert to DataFrame for visualization
    results_df = pd.DataFrame(results)
    # Pivot for heatmap: rows = learning rate, columns = iterations
    pivot = results_df.pivot(index='learning_rate',
                             columns='iterations',
                             values='val_accuracy')
    # Plot heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot, annot=True, fmt='.3f', cmap='YlGnBu',
                cbar_kws={'label': 'Validation Accuracy'})
    plt.title('🔍 Hyperparameter Tuning Results', fontweight='bold', pad=20)
    plt.xlabel('Number of Iterations', fontweight='bold')
    plt.ylabel('Learning Rate', fontweight='bold')
    plt.tight_layout()
    plt.show()
    return best_params, results_df


# Usage
best_params, results = tune_hyperparameters(X_train_scaled, y_train, X_val_scaled, y_val)
from sklearn.model_selection import cross_val_score, KFold
def perform_cross_validation(X, y, k=5):
    """
    Perform k-fold cross-validation with per-fold scaling.

    Parameters:
    -----------
    X : features
    y : labels
    k : number of folds

    Returns:
    --------
    scores : list of per-fold validation accuracies
    """
    # Create KFold object
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []
    for fold_num, (train_idx, val_idx) in enumerate(kfold.split(X), start=1):
        # Split data
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]
        # Scale inside the fold so validation statistics never leak into training
        scaler = StandardScaler()
        X_train_fold = scaler.fit_transform(X_train_fold)
        X_val_fold = scaler.transform(X_val_fold)
        # Train model (the custom LogisticRegression, not sklearn's)
        model = LogisticRegression(learning_rate=0.01, n_iterations=1000)
        model.fit(X_train_fold, y_train_fold)
        # Evaluate
        score = model.score(X_val_fold, y_val_fold)
        scores.append(score)
        print(f"Fold {fold_num}: Accuracy = {score:.4f}")
    print("\n" + "=" * 50)
    print(f"📊 Cross-Validation Results ({k}-Fold)")
    print("=" * 50)
    print(f"Mean Accuracy: {np.mean(scores):.4f} (± {np.std(scores):.4f})")
    print(f"Min Accuracy: {np.min(scores):.4f}")
    print(f"Max Accuracy: {np.max(scores):.4f}")
    # Visualize per-fold accuracy against the mean
    plt.figure(figsize=(10, 6))
    plt.bar(range(1, k+1), scores, color='skyblue', edgecolor='black')
    plt.axhline(y=np.mean(scores), color='red', linestyle='--',
                linewidth=2, label=f'Mean: {np.mean(scores):.4f}')
    plt.xlabel('Fold Number', fontweight='bold')
    plt.ylabel('Accuracy', fontweight='bold')
    plt.title(f'📊 {k}-Fold Cross-Validation Results', fontweight='bold', pad=20)
    plt.legend()
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()
    return scores


# Usage
cv_scores = perform_cross_validation(X, y, k=5)
from sklearn.model_selection import StratifiedKFold
# Better for imbalanced datasets
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Ensures each fold has same class distribution
import matplotlib.pyplot as plt
from IPython.display import clear_output
class LogisticRegressionWithLiveplot(LogisticRegression):
    """Model with live training visualization"""

    def fit(self, X, y, plot_interval=50):
        """
        Train while updating a live cost-curve plot.

        Parameters:
        -----------
        X, y : training data
        plot_interval : int, redraw the plot every N iterations
        """
        m, n = X.shape
        self.weights = np.zeros(n)
        self.bias = 0
        plt.ion()  # Interactive mode so the figure refreshes in place
        fig, ax = plt.subplots(figsize=(10, 6))
        for i in range(self.n_iterations):
            # Training step: forward pass and cost tracking
            z = np.dot(X, self.weights) + self.bias
            predictions = self._sigmoid(z)
            cost = self._compute_cost(y, predictions)
            self.cost_history.append(cost)
            # Gradients and update
            dz = predictions - y
            dw = (1/m) * np.dot(X.T, dz)
            db = (1/m) * np.sum(dz)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
            # Update plot every plot_interval iterations
            if i % plot_interval == 0:
                clear_output(wait=True)
                ax.clear()
                ax.plot(self.cost_history, linewidth=2, color='blue')
                ax.set_xlabel('Iterations', fontweight='bold')
                ax.set_ylabel('Cost', fontweight='bold')
                ax.set_title(f'🔄 Training Progress (Iteration {i}/{self.n_iterations})',
                             fontweight='bold')
                ax.grid(True, alpha=0.3)
                plt.pause(0.01)  # Yield to the GUI event loop so the plot renders
        plt.ioff()
        plt.show()
        return self
# Usage
model = LogisticRegressionWithLiveplot(learning_rate=0.01, n_iterations=1000)
model.fit(X_train_scaled, y_train, plot_interval=50)
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, classification_report, confusion_matrix)
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Comprehensive model evaluation.

    Prints train/test accuracy, precision, recall and F1, an overfitting
    diagnosis based on the train-test accuracy gap, a detailed
    classification report, and the confusion matrix.

    Parameters:
    -----------
    model : trained model exposing .predict
    X_train, y_train : training data
    X_test, y_test : test data

    Returns:
    --------
    dict with train/test accuracy and test precision, recall, f1
    """
    # Predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # Compute each metric once; reuse for printing and the return value
    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    print("=" * 60)
    print("📊 MODEL EVALUATION REPORT")
    print("=" * 60)
    # Training metrics
    print("\n🎓 TRAINING SET PERFORMANCE:")
    print(f" Accuracy: {train_acc:.4f}")
    print(f" Precision: {precision_score(y_train, y_train_pred):.4f}")
    print(f" Recall: {recall_score(y_train, y_train_pred):.4f}")
    print(f" F1-Score: {f1_score(y_train, y_train_pred):.4f}")
    # Test metrics
    print("\n🧪 TEST SET PERFORMANCE:")
    print(f" Accuracy: {test_acc:.4f}")
    print(f" Precision: {test_precision:.4f}")
    print(f" Recall: {test_recall:.4f}")
    print(f" F1-Score: {test_f1:.4f}")
    # Overfitting check: a large train-test gap signals memorization
    gap = train_acc - test_acc
    print("\n🔍 OVERFITTING ANALYSIS:")
    # Bug fix: original used the format spec ':. 4f' (with a space),
    # which raises ValueError at runtime.
    print(f" Train Accuracy: {train_acc:.4f}")
    print(f" Test Accuracy: {test_acc:.4f}")
    print(f" Gap: {gap:.4f}")
    if gap < 0.05:
        print(" Status: ✅ Good generalization")
    elif gap < 0.10:
        print(" Status: ⚠️ Slight overfitting")
    else:
        print(" Status: ❌ Significant overfitting")
    # Detailed classification report
    print("\n📋 DETAILED CLASSIFICATION REPORT:")
    print(classification_report(y_test, y_test_pred,
                                target_names=['Class 0', 'Class 1']))
    # Confusion matrix
    print("\n🎯 CONFUSION MATRIX:")
    cm = confusion_matrix(y_test, y_test_pred)
    print(cm)
    return {
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'precision': test_precision,
        'recall': test_recall,
        'f1': test_f1
    }


# Usage
metrics = evaluate_model(model, X_train_scaled, y_train, X_test_scaled, y_test)
import pickle
def save_model(model, scaler, filename='logistic_model.pkl'):
    """
    Save the trained model and fitted scaler to disk with pickle.

    Parameters:
    -----------
    model : trained model (must expose .weights, .bias, .cost_history)
    scaler : fitted scaler
    filename : str, destination path

    Side effects: writes `filename` and prints a confirmation message.
    """
    model_data = {
        'model': model,
        'scaler': scaler,
        'weights': model.weights,
        'bias': model.bias,
        'cost_history': model.cost_history
    }
    with open(filename, 'wb') as f:
        pickle.dump(model_data, f)
    # Bug fix: the confirmation message lost its {filename} placeholder
    # to extraction ("(unknown)"); report the actual destination path.
    print(f"✅ Model saved to {filename}")


# Usage
save_model(model, scaler, 'my_logistic_model.pkl')


def load_model(filename='logistic_model.pkl'):
    """
    Load a model/scaler bundle saved by save_model.

    Parameters:
    -----------
    filename : str, path to the pickle file

    Returns:
    --------
    model, scaler : loaded objects

    NOTE(security): pickle.load executes arbitrary code from the file —
    only load files you created yourself.
    """
    with open(filename, 'rb') as f:
        model_data = pickle.load(f)
    # Bug fix: the confirmation message lost its {filename} placeholder
    # to extraction ("(unknown)"); report the actual source path.
    print(f"✅ Model loaded from {filename}")
    return model_data['model'], model_data['scaler']


# Usage
loaded_model, loaded_scaler = load_model('my_logistic_model.pkl')
# Make predictions
X_new = loaded_scaler.transform(X_new_data)
predictions = loaded_model.predict(X_new)- ✅ Load and explore data
- ✅ Handle missing values
- ✅ Split data (train/val/test)
- ✅ Scale features
- ✅ Train initial model
- ✅ Monitor training (cost curve)
- ✅ Tune hyperparameters
- ✅ Perform cross-validation
- ✅ Evaluate on test set
- ✅ Check for overfitting
- ✅ Save best model
- ✅ Document results
|
|