+import xgboost as xgb
 import pandas as pd
 import numpy as np
-import joblib
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.model_selection import train_test_split, cross_val_score
-from sklearn.preprocessing import StandardScaler
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import classification_report, confusion_matrix
+import joblib
+from sklearn.model_selection import train_test_split, RandomizedSearchCV
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.impute import SimpleImputer
 from imblearn.over_sampling import SMOTE
-from xgboost import XGBClassifier
 
 # Load dataset
-data = pd.read_csv("data/merged_data.csv")
+data = pd.read_csv("/home/pavithra/k8s-failure-prediction/data/merged_data.csv")
+
+# Convert datetime columns to numeric timestamps (seconds since the Unix epoch)
+for col in data.select_dtypes(include=['object', 'datetime']):
+    try:
+        data[col] = pd.to_datetime(data[col]).astype('int64') / 10**9
+    except Exception:  # leave columns that are not datetimes unchanged
+        pass
+
+# Label-encode the remaining categorical (object) columns
+categorical_cols = data.select_dtypes(include=['object']).columns
+data[categorical_cols] = data[categorical_cols].apply(LabelEncoder().fit_transform)
 
-# Check if target column exists
-if "target" not in data.columns:
-    raise KeyError("❌ 'target' column not found in the dataset!")
+# Handle missing values
+imputer = SimpleImputer(strategy='mean')
+data.iloc[:, :] = imputer.fit_transform(data)
 
-# Remove non-numeric columns and separate features/target
-X = data.drop(columns=["timestamp", "target"], errors="ignore")
+# Split into features and target
+X = data.drop(columns=["target"])
 y = data["target"]
 
-# Standardize features
+# Handle Class Imbalance with SMOTE
+smote = SMOTE(sampling_strategy=0.6, random_state=42)
+X_resampled, y_resampled = smote.fit_resample(X, y)
+
+# Feature Scaling
 scaler = StandardScaler()
-X_scaled = scaler.fit_transform(X)
+X_scaled = scaler.fit_transform(X_resampled)
+
+# Train-Test Split
+X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)
 
-# Handle class imbalance using SMOTE
-smote = SMOTE(random_state=42)
-X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
+# Hyperparameter Tuning
 
-# Split into train & test sets
-X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
+param_grid = {
+    'n_estimators': [400, 500, 600],    # More trees to learn better
+    'max_depth': [10, 12, 15],          # Allow deeper trees
+    'learning_rate': [0.1, 0.2, 0.3],   # Increase learning rate
+    'min_child_weight': [1, 2],         # Reduce constraints
+    'subsample': [0.9, 1.0],            # Use more data per tree
+    'colsample_bytree': [0.9, 1.0],     # Use more features per tree
+    'gamma': [0, 0.1],                  # Reduce penalty on splits
+    'reg_lambda': [0, 1],               # Reduce L2 regularization
+    'reg_alpha': [0, 1],                # Reduce L1 regularization
+    'scale_pos_weight': [1]             # Balance class weights normally
+}
 
-# Train model with class weighting
 
-model = XGBClassifier(
-    n_estimators=500,
-    max_depth=10,
-    learning_rate=0.01,
-    subsample=0.8,
-    colsample_bytree=0.8,
-    scale_pos_weight=1,
-    use_label_encoder=False,
-    eval_metric="logloss"
-)
+xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')
 
-model.fit(X_train, y_train)
+search = RandomizedSearchCV(xgb_model, param_distributions=param_grid, n_iter=30, scoring='accuracy', cv=5, verbose=1, n_jobs=-1, random_state=42)
+search.fit(X_train, y_train)
+
+best_model = search.best_estimator_
 
 # Predictions
-y_pred = model.predict(X_test)
-
-# Evaluate model
-train_acc = model.score(X_train, y_train)
-test_acc = model.score(X_test, y_test)
-cv_acc = np.mean(cross_val_score(model, X_resampled, y_resampled, cv=5))
-
-# Confusion matrix
-cm = confusion_matrix(y_test, y_pred)
-
-# Feature importance
-feature_importances = model.feature_importances_
-sorted_indices = np.argsort(feature_importances)[::-1]
-top_features = X.columns[sorted_indices]
-
-# Print results
-print("\n📊 MODEL PERFORMANCE METRICS")
-print("────────────────────────────────")
-print(f"🏋️ Training Accuracy: {train_acc:.4f}")
-print(f"🛠️ Test Accuracy: {test_acc:.4f}")
-print(f"🎯 Cross-Validation Accuracy: {cv_acc:.4f}")
-
-# Print classification report
-print("\n📜 Classification Report:\n", classification_report(y_test, y_pred))
-
-# Print confusion matrix
-print("\n🖼️ Confusion Matrix:")
-print(cm)
-
-# Show top features
-print("\n🔍 Top 5 Most Important Features:")
-for i in range(min(5, len(top_features))):
-    print(f"   {i+1}. {top_features[i]} ({feature_importances[sorted_indices[i]]:.4f})")
-
-# Save trained model
-joblib.dump(model, "models/failure_predictor.pkl")
-print("\n✅ Model saved successfully!")
-
-# Plot confusion matrix
-plt.figure(figsize=(6, 5))
-sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Failure", "Failure"], yticklabels=["No Failure", "Failure"])
+y_train_pred = best_model.predict(X_train)
+y_test_pred = best_model.predict(X_test)
+
+# Accuracy Scores
+train_accuracy = accuracy_score(y_train, y_train_pred) * 100
+test_accuracy = accuracy_score(y_test, y_test_pred) * 100
+
+print(f"\n🔥 Train Accuracy: {train_accuracy:.2f}%")
+print(f"🔥 Test Accuracy: {test_accuracy:.2f}%")
+
+# Classification Report
+print("\n📊 Classification Report:")
+print(classification_report(y_test, y_test_pred))
+
+joblib.dump(best_model, "k8s_failure_model.pkl")
+print("\nMODEL SAVED\n")
+
+# Confusion Matrix
+conf_matrix = confusion_matrix(y_test, y_test_pred)
+plt.figure(figsize=(6, 4))
+sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
 plt.xlabel("Predicted")
 plt.ylabel("Actual")
 plt.title("Confusion Matrix")
 plt.show()
 
+# Feature Importance Graph
+feature_importances = best_model.feature_importances_
+features = data.drop(columns=["target"]).columns
+
+# Sort feature importances
+sorted_idx = np.argsort(feature_importances)[::-1]
+
+plt.figure(figsize=(10, 5))
+sns.barplot(x=feature_importances[sorted_idx][:10], y=[features[i] for i in sorted_idx[:10]], palette="coolwarm")
+plt.xlabel("Feature Importance Score")
+plt.ylabel("Top 10 Features")
+plt.title("Feature Importance (Top 10)")
+plt.show()
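For reference, a minimal sketch of loading the saved model for inference; this is not part of the commit above. It assumes the k8s_failure_model.pkl file written by the script and uses a placeholder all-zeros input, since the fitted StandardScaler and label encoders are not persisted, real inputs would need the same datetime conversion, encoding, imputation, and scaling applied at training time.

import joblib
import numpy as np

# Load the model persisted by the training script above.
model = joblib.load("k8s_failure_model.pkl")

# Placeholder input: one all-zeros row with the number of features the model expects.
# Real inputs must be preprocessed exactly as during training.
X_new = np.zeros((1, model.n_features_in_))

print(model.predict(X_new))        # predicted class (0 = no failure, 1 = failure, per the original plot labels)
print(model.predict_proba(X_new))  # class probabilities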