
Commit d254548

Changed for generalisation and better accuracy score
1 parent 68e09f7 commit d254548

File tree

3 files changed: +221 -81 lines

scripts/fetch_metrics.py

Lines changed: 37 additions & 9 deletions
@@ -1,11 +1,10 @@
 import requests
 import pandas as pd
 import os
-from datetime import datetime
+from datetime import datetime, timezone

 PROMETHEUS_URL = "http://localhost:9090/api/v1/query"

-# Define metrics to fetch
 METRICS = {
     "cpu_usage": "container_cpu_usage_seconds_total",
     "memory_usage": "container_memory_usage_bytes",
@@ -24,16 +23,45 @@ def fetch_metric(metric_name):

     results = []
     for item in data.get("data", {}).get("result", []):
-        timestamp = datetime.utcfromtimestamp(float(item["value"][0])).strftime("%Y-%m-%d %H:%M:%S")
-        value = float(item["value"][1])
-        results.append({"timestamp": timestamp, "value": value})
+        try:
+            timestamp = datetime.fromtimestamp(float(item["value"][0]), tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
+            value = float(item["value"][1])
+            results.append({"timestamp": timestamp, metric_name: value})
+        except Exception as e:
+            print(f"❌ Error processing {metric_name}: {e}")

-    return pd.DataFrame(results)
+    df = pd.DataFrame(results)
+
+    if not df.empty:
+        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
+
+    return df

 # Fetch all metrics
+all_data = None
+
 for metric_key, query in METRICS.items():
     df = fetch_metric(query)
-    save_path = os.path.join(SAVE_DIR, f"{metric_key}.csv")
-    df.to_csv(save_path, index=False)
-    print(f"✅ {metric_key} data saved to {save_path}")
+
+    if df.empty:
+        print(f"⚠️ Warning: No data for {metric_key}, skipping merge.")
+        continue
+
+    if all_data is None:
+        all_data = df
+    else:
+        print(f"Merging {metric_key}...")
+        print("Before merge, all_data columns:", list(all_data.columns))
+        print("Before merge, df columns:", list(df.columns))
+
+        all_data = pd.merge(all_data, df, on="timestamp", how="outer")
+
+# Save collected data
+if all_data is not None and not all_data.empty:
+    save_path = os.path.join(SAVE_DIR, "merged_data.csv")
+    all_data.to_csv(save_path, index=False)
+    print(f"✅ Merged data saved to {save_path}")
+    print(all_data.head())  # Preview first few rows
+else:
+    print("⚠️ No data was fetched, skipping save.")
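
Note: both hunks lean on code this commit does not touch, namely the HTTP query inside fetch_metric() and the SAVE_DIR constant used by the save step. A minimal sketch of what that surrounding code plausibly looks like, inferred from PROMETHEUS_URL and the data.get("data", {}).get("result", []) access pattern; the SAVE_DIR value here is a placeholder assumption:

import os
import requests

PROMETHEUS_URL = "http://localhost:9090/api/v1/query"
SAVE_DIR = "data"  # assumption: the real value is defined outside the shown hunks
os.makedirs(SAVE_DIR, exist_ok=True)

def fetch_metric(metric_name):
    # Prometheus instant-query endpoint: GET /api/v1/query?query=<PromQL expression>
    response = requests.get(PROMETHEUS_URL, params={"query": metric_name}, timeout=10)
    data = response.json()  # shaped like {"status": ..., "data": {"result": [...]}}
    # ...the result-processing loop shown in the diff above follows here...
    return data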

scripts/train_model.py

Lines changed: 85 additions & 72 deletions
@@ -1,96 +1,109 @@
+import xgboost as xgb
 import pandas as pd
 import numpy as np
-import joblib
 import matplotlib.pyplot as plt
 import seaborn as sns
-from sklearn.model_selection import train_test_split, cross_val_score
-from sklearn.preprocessing import StandardScaler
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import classification_report, confusion_matrix
+import joblib
+from sklearn.model_selection import train_test_split, RandomizedSearchCV
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.impute import SimpleImputer
 from imblearn.over_sampling import SMOTE
-from xgboost import XGBClassifier

 # Load dataset
-data = pd.read_csv("data/merged_data.csv")
+data = pd.read_csv("/home/pavithra/k8s-failure-prediction/data/merged_data.csv")
+
+# Convert datetime columns to numeric timestamps
+for col in data.select_dtypes(include=['object', 'datetime']):
+    try:
+        data[col] = pd.to_datetime(data[col]).astype(int) / 10**9
+    except:
+        pass
+
+# Handle categorical features
+categorical_cols = data.select_dtypes(include=['object']).columns
+data[categorical_cols] = data[categorical_cols].apply(LabelEncoder().fit_transform)

-# Check if target column exists
-if "target" not in data.columns:
-    raise KeyError("❌ 'target' column not found in the dataset!")
+# Handle missing values
+imputer = SimpleImputer(strategy='mean')
+data.iloc[:, :] = imputer.fit_transform(data)

-# Remove non-numeric columns and separate features/target
-X = data.drop(columns=["timestamp", "target"], errors="ignore")
+# Split into features and target
+X = data.drop(columns=["target"])
 y = data["target"]

-# Standardize features
+# Handle Class Imbalance with SMOTE
+smote = SMOTE(sampling_strategy=0.6, random_state=42)
+X_resampled, y_resampled = smote.fit_resample(X, y)
+
+# Feature Scaling
 scaler = StandardScaler()
-X_scaled = scaler.fit_transform(X)
+X_scaled = scaler.fit_transform(X_resampled)
+
+# Train-Test Split
+X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

-# Handle class imbalance using SMOTE
-smote = SMOTE(random_state=42)
-X_resampled, y_resampled = smote.fit_resample(X_scaled, y)
+# Hyperparameter Tuning

-# Split into train & test sets
-X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
+param_grid = {
+    'n_estimators': [400, 500, 600],   # More trees to learn better
+    'max_depth': [10, 12, 15],         # Allow deeper trees
+    'learning_rate': [0.1, 0.2, 0.3],  # Increase learning rate
+    'min_child_weight': [1, 2],        # Reduce constraints
+    'subsample': [0.9, 1.0],           # Use more data per tree
+    'colsample_bytree': [0.9, 1.0],    # Use more features per tree
+    'gamma': [0, 0.1],                 # Reduce penalty on splits
+    'reg_lambda': [0, 1],              # Reduce L2 regularization
+    'reg_alpha': [0, 1],               # Reduce L1 regularization
+    'scale_pos_weight': [1]            # Balance class weights normally
+}

-# Train model with class weighting

-model = XGBClassifier(
-    n_estimators=500,
-    max_depth=10,
-    learning_rate=0.01,
-    subsample=0.8,
-    colsample_bytree=0.8,
-    scale_pos_weight=1,
-    use_label_encoder=False,
-    eval_metric="logloss"
-)
+xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

-model.fit(X_train, y_train)
+search = RandomizedSearchCV(xgb_model, param_distributions=param_grid, n_iter=30, scoring='accuracy', cv=5, verbose=1, n_jobs=-1, random_state=42)
+search.fit(X_train, y_train)
+
+best_model = search.best_estimator_

 # Predictions
-y_pred = model.predict(X_test)
-
-# Evaluate model
-train_acc = model.score(X_train, y_train)
-test_acc = model.score(X_test, y_test)
-cv_acc = np.mean(cross_val_score(model, X_resampled, y_resampled, cv=5))
-
-# Confusion matrix
-cm = confusion_matrix(y_test, y_pred)
-
-# Feature importance
-feature_importances = model.feature_importances_
-sorted_indices = np.argsort(feature_importances)[::-1]
-top_features = X.columns[sorted_indices]
-
-# Print results
-print("\n📊 MODEL PERFORMANCE METRICS")
-print("────────────────────────────────")
-print(f"🏋️ Training Accuracy: {train_acc:.4f}")
-print(f"🛠️ Test Accuracy: {test_acc:.4f}")
-print(f"🎯 Cross-Validation Accuracy: {cv_acc:.4f}")
-
-# Print classification report
-print("\n📜 Classification Report:\n", classification_report(y_test, y_pred))
-
-# Print confusion matrix
-print("\n🖼️ Confusion Matrix:")
-print(cm)
-
-# Show top features
-print("\n🔍 Top 5 Most Important Features:")
-for i in range(min(5, len(top_features))):
-    print(f"  {i+1}. {top_features[i]} ({feature_importances[sorted_indices[i]]:.4f})")
-
-# Save trained model
-joblib.dump(model, "models/failure_predictor.pkl")
-print("\n✅ Model saved successfully!")
-
-# Plot confusion matrix
-plt.figure(figsize=(6, 5))
-sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["No Failure", "Failure"], yticklabels=["No Failure", "Failure"])
+y_train_pred = best_model.predict(X_train)
+y_test_pred = best_model.predict(X_test)
+
+# Accuracy Scores
+train_accuracy = accuracy_score(y_train, y_train_pred) * 100
+test_accuracy = accuracy_score(y_test, y_test_pred) * 100
+
+print(f"\n🔥 Train Accuracy: {train_accuracy:.2f}%")
+print(f"🔥 Test Accuracy: {test_accuracy:.2f}%")
+
+# Classification Report
+print("\n📊 Classification Report:")
+print(classification_report(y_test, y_test_pred))
+
+joblib.dump(best_model, "k8s_failure_model.pkl")
+print("\nMODEL SAVED\n")
+
+# Confusion Matrix
+conf_matrix = confusion_matrix(y_test, y_test_pred)
+plt.figure(figsize=(6, 4))
+sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Class 0', 'Class 1'], yticklabels=['Class 0', 'Class 1'])
 plt.xlabel("Predicted")
 plt.ylabel("Actual")
 plt.title("Confusion Matrix")
 plt.show()

+# Feature Importance Graph
+feature_importances = best_model.feature_importances_
+features = data.drop(columns=["target"]).columns
+
+# Sort feature importances
+sorted_idx = np.argsort(feature_importances)[::-1]
+
+plt.figure(figsize=(10, 5))
+sns.barplot(x=feature_importances[sorted_idx][:10], y=[features[i] for i in sorted_idx[:10]], palette="coolwarm")
+plt.xlabel("Feature Importance Score")
+plt.ylabel("Top 10 Features")
+plt.title("Feature Importance (Top 10)")
+plt.show()
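
Two things worth noting about this training flow. The mean-imputation step absorbs the NaNs that fetch_metrics.py's outer merge produces wherever metric timestamps do not line up. And the fitted StandardScaler is never persisted, so any consumer of k8s_failure_model.pkl has to reproduce the same scaling itself. A hypothetical loading sketch (the zero-filled row is only a shape placeholder, not meaningful input; n_features_in_ being available on the XGBoost sklearn wrapper is an assumption about the installed version):

import joblib
import numpy as np

model = joblib.load("k8s_failure_model.pkl")

# n_features_in_ is set at fit time on scikit-learn-compatible estimators.
# Real inputs must be standardized exactly as during training.
sample = np.zeros((1, model.n_features_in_))
print(model.predict(sample))
print(model.predict_proba(sample))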

scripts/train_model1.py

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+import pandas as pd
+import numpy as np
+import os
+import joblib
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from imblearn.over_sampling import BorderlineSMOTE
+from xgboost import XGBClassifier
+
+# ✅ Load Dataset
+CSV_PATH = "/home/pavithra/k8s-failure-prediction/data/merged_data.csv"
+df = pd.read_csv(CSV_PATH)
+
+# ✅ Preprocessing
+df.columns = df.columns.str.strip().str.replace(r'\s+', '_', regex=True).str.lower()
+df["timestamp"] = pd.to_datetime(df["timestamp"])
+df.set_index("timestamp", inplace=True)
+
+# ✅ Feature Engineering: 5-sample rolling average for every metric column
+for col in df.columns:
+    df[f"{col}_avg"] = df[col].rolling(window=5, min_periods=1).mean()
+
+# ✅ Target Variable: a failure is a restart-count jump greater than 1
+df["target"] = (df["container_restart_count"].diff().fillna(0) > 1).astype(int)
+df.drop(columns=["container_restart_count"], inplace=True)
+
+# ✅ Prepare Data
+X = df.drop(columns=["target"])
+y = df["target"]
+
+# ✅ Handle Class Imbalance
+if y.value_counts().min() >= 5:
+    smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
+    X_resampled, y_resampled = smote.fit_resample(X, y)
+else:
+    X_resampled, y_resampled = X, y
+
+# ✅ Train-Test Split
+X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
+
+# ✅ Reduce Overfitting (Final Fix)
+rf = RandomForestClassifier(
+    n_estimators=300,       # More trees
+    max_depth=10,           # Reduce tree depth
+    min_samples_split=20,   # More samples needed per split
+    min_samples_leaf=10,    # Prevent small branches
+    bootstrap=True,
+    random_state=42
+)
+
+# ✅ Ensemble Model (Random Forest + XGBoost)
+xgb = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=7, subsample=0.8, colsample_bytree=0.8, random_state=42)
+rf.fit(X_train, y_train)
+xgb.fit(X_train, y_train)
+
+# ✅ Predictions
+y_pred_rf = rf.predict(X_test)
+y_pred_xgb = xgb.predict(X_test)
+
+# ✅ Combine Predictions (unanimous vote: flags a failure only when both models do)
+y_pred_ensemble = (y_pred_rf + y_pred_xgb) // 2
+
+# ✅ Evaluate Model
+train_acc = rf.score(X_train, y_train) * 100
+test_acc = accuracy_score(y_test, y_pred_ensemble) * 100
+print(f"\n🎯 Train Accuracy: {train_acc:.2f} %")
+print(f"🎯 Test Accuracy: {test_acc:.2f} %")
+print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred_ensemble))
+
+# ✅ Save Model
+MODEL_PATH = "../models/k8s_failure_model.pkl"
+joblib.dump(rf, MODEL_PATH)
+model = joblib.load(MODEL_PATH)
+print("The features in the model are:\n")
+print(model.feature_names_in_)
+print(f"\n✅ Model saved at {MODEL_PATH}")
+
+# 🔥 Confusion Matrix Plot
+cm = confusion_matrix(y_test, y_pred_ensemble)
+plt.figure(figsize=(6, 4))
+sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=["No Failure", "Failure"], yticklabels=["No Failure", "Failure"])
+plt.title("Confusion Matrix")
+plt.xlabel("Predicted")
+plt.ylabel("Actual")
+plt.show()
+
+# 🔥 Feature Importance Plot
+feature_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
+feature_importances = feature_importances.sort_values(by='Importance', ascending=False).head(15)
+
+plt.figure(figsize=(10, 6))
+sns.barplot(x='Importance', y='Feature', data=feature_importances, palette="viridis")
+plt.title("Top 15 Important Features")
+plt.show()
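
On the combination step: with two 0/1 prediction arrays, (y_pred_rf + y_pred_xgb) // 2 yields 1 only when both models predict 1, a unanimous hard vote rather than soft voting. If probability-averaged soft voting is actually wanted, a minimal sketch assuming rf and xgb are fitted as in the script:

import numpy as np

# Average each model's positive-class probability, then threshold at 0.5.
avg_proba = (rf.predict_proba(X_test)[:, 1] + xgb.predict_proba(X_test)[:, 1]) / 2
y_pred_soft = (avg_proba >= 0.5).astype(int)

scikit-learn's VotingClassifier(estimators=[("rf", rf), ("xgb", xgb)], voting="soft") packages the same idea.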
