|
1 | 1 | import pandas as pd |
2 | | -from sklearn.model_selection import train_test_split |
3 | | -from sklearn.ensemble import RandomForestClassifier |
4 | | -from sklearn.metrics import accuracy_score, classification_report |
5 | | -import os |
| 2 | +import numpy as np |
6 | 3 | import joblib |
7 | 4 | import matplotlib.pyplot as plt |
| 5 | +import seaborn as sns |
| 6 | +from sklearn.model_selection import train_test_split, cross_val_score |
| 7 | +from sklearn.preprocessing import StandardScaler |
| 8 | +from sklearn.ensemble import RandomForestClassifier |
| 9 | +from sklearn.metrics import classification_report, confusion_matrix |
| 10 | +from imblearn.over_sampling import SMOTE |
| 11 | +from xgboost import XGBClassifier |
| 12 | + |
| 13 | +# Load dataset |
| 14 | +data = pd.read_csv("data/merged_data.csv") |
8 | 15 |
|
9 | | -# Load Processed Data |
10 | | -df = pd.read_csv("data/processed_metrics.csv") |
| 16 | +# Check if target column exists |
| 17 | +if "target" not in data.columns: |
| 18 | + raise KeyError("❌ 'target' column not found in the dataset!") |
11 | 19 |
|
12 | | -# Drop unnecessary columns |
13 | | -df = df.drop(columns=["timestamp"], errors="ignore") |
| 20 | +# Drop the timestamp column (other non-numeric columns are not filtered) and separate features/target |
| 21 | +X = data.drop(columns=["timestamp", "target"], errors="ignore") |
| 22 | +y = data["target"] |
14 | 23 |
|
15 | | -# Ensure "failure" column exists |
16 | | -if "failure" not in df.columns: |
17 | | - raise ValueError("Error: 'failure' column not found in processed_metrics.csv!") |
| 24 | +# Standardize features |
| 25 | +scaler = StandardScaler() |
| 26 | +X_scaled = scaler.fit_transform(X) |
18 | 27 |
|
19 | | -# Define Features (X) and Target (y) |
20 | | -X = df.drop(columns=["failure"]) |
21 | | -y = df["failure"] |
| 28 | +# Handle class imbalance using SMOTE |
| 29 | +smote = SMOTE(random_state=42) |
| 30 | +X_resampled, y_resampled = smote.fit_resample(X_scaled, y) |
22 | 31 |
|
23 | | -# Train-Test Split |
24 | | -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) |
| 32 | +# Split into train & test sets (note: the split follows SMOTE, so the test set contains synthetic samples) |
| 33 | +X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42) |
| 34 | + |
| 35 | +# Train XGBoost model (scale_pos_weight=1 adds no class weighting; imbalance is handled by SMOTE above) |
| 36 | + |
| 37 | +model = XGBClassifier( |
| 38 | + n_estimators=500, |
| 39 | + max_depth=10, |
| 40 | + learning_rate=0.01, |
| 41 | + subsample=0.8, |
| 42 | + colsample_bytree=0.8, |
| 43 | + scale_pos_weight=1, |
| 44 | + use_label_encoder=False, |
| 45 | + eval_metric="logloss" |
| 46 | +) |
25 | 47 |
|
26 | | -# Train a Random Forest Model |
27 | | -model = RandomForestClassifier(n_estimators=100, random_state=42) |
28 | 48 | model.fit(X_train, y_train) |
29 | 49 |
|
30 | 50 | # Predictions |
31 | 51 | y_pred = model.predict(X_test) |
32 | 52 |
|
33 | | -# Model Evaluation |
34 | | -accuracy = accuracy_score(y_test, y_pred) |
35 | | -print(f"✅ Model Accuracy: {accuracy:.4f}") |
36 | | -print("📊 Classification Report:\n", classification_report(y_test, y_pred)) |
| 53 | +# Evaluate model |
| 54 | +train_acc = model.score(X_train, y_train) |
| 55 | +test_acc = model.score(X_test, y_test) |
| 56 | +cv_acc = np.mean(cross_val_score(model, X_resampled, y_resampled, cv=5)) |
37 | 57 |
|
38 | | -# Ensure models directory exists |
39 | | -os.makedirs("models", exist_ok=True) |
| 58 | +# Confusion matrix |
| 59 | +cm = confusion_matrix(y_test, y_pred) |
40 | 60 |
|
41 | | -# Save the Model |
42 | | -joblib.dump(model, "models/failure_predictor.pkl") |
43 | | -print("✅ Model saved as models/failure_predictor.pkl") |
| 61 | +# Feature importance |
| 62 | +feature_importances = model.feature_importances_ |
| 63 | +sorted_indices = np.argsort(feature_importances)[::-1] |
| 64 | +top_features = X.columns[sorted_indices] |
44 | 65 |
|
| 66 | +# Print results |
| 67 | +print("\n📊 MODEL PERFORMANCE METRICS") |
| 68 | +print("────────────────────────────────") |
| 69 | +print(f"🏋️ Training Accuracy: {train_acc:.4f}") |
| 70 | +print(f"🛠️ Test Accuracy: {test_acc:.4f}") |
| 71 | +print(f"🎯 Cross-Validation Accuracy: {cv_acc:.4f}") |
45 | 72 |
|
46 | | -# Get feature importance |
47 | | -importances = model.feature_importances_ |
48 | | -features = X.columns |
| 73 | +# Print classification report |
| 74 | +print("\n📜 Classification Report:\n", classification_report(y_test, y_pred)) |
49 | 75 |
|
50 | | -# Plot |
51 | | -plt.figure(figsize=(10,5)) |
52 | | -plt.barh(features, importances, color="skyblue") |
53 | | -plt.xlabel("Importance Score") |
54 | | -plt.ylabel("Feature Name") |
55 | | -plt.title("Feature Importance in Failure Prediction Model") |
56 | | -plt.show() |
| 76 | +# Print confusion matrix |
| 77 | +print("\n🖼️ Confusion Matrix:") |
| 78 | +print(cm) |
| 79 | + |
| 80 | +# Show top features |
| 81 | +print("\n🔍 Top 5 Most Important Features:") |
| 82 | +for i in range(min(5, len(top_features))): |
| 83 | + print(f" {i+1}. {top_features[i]} ({feature_importances[sorted_indices[i]]:.4f})") |
57 | 84 |
|
58 | | -# Check training accuracy |
59 | | -train_pred = model.predict(X_train) |
60 | | -train_acc = accuracy_score(y_train, train_pred) |
| 85 | +# Save trained model (the models/ directory must already exist; the fitted scaler is not saved) |
| 86 | +joblib.dump(model, "models/failure_predictor.pkl") |
| 87 | +print("\n✅ Model saved successfully!") |
| 88 | + |
| 89 | +# Plot confusion matrix |
| 90 | +plt.figure(figsize=(6, 5)) |
| 91 | +sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Failure", "Failure"], yticklabels=["No Failure", "Failure"]) |
| 92 | +plt.xlabel("Predicted") |
| 93 | +plt.ylabel("Actual") |
| 94 | +plt.title("Confusion Matrix") |
| 95 | +plt.show() |
61 | 96 |
|
62 | | -print(f"🏋️ Training Accuracy: {train_acc:.2f}") |
63 | | -print(f"🛠️ Test Accuracy: {accuracy_score(y_test, y_pred):.2f}") |
|
0 commit comments