from imblearn.over_sampling import BorderlineSMOTE
from xgboost import XGBClassifier

# Load dataset
CSV_PATH = "/home/pavithra/k8s-failure-prediction/data/merged_data.csv"
df = pd.read_csv(CSV_PATH)

# Preprocessing: normalize column names and index by timestamp
df.columns = df.columns.str.strip().str.replace(r'\s+', '_', regex=True).str.lower()
df["timestamp"] = pd.to_datetime(df["timestamp"])
df.set_index("timestamp", inplace=True)

# Feature engineering: 5-sample rolling means
# (iterate over a snapshot of the columns so the new *_avg features are not re-processed)
for col in df.columns.tolist():
    df[f"{col}_avg"] = df[col].rolling(window=5, min_periods=1).mean()

# Target: flag a failure when the restart count jumps by more than 1 between samples
df["target"] = (df["container_restart_count"].diff().fillna(0) > 1).astype(int)
df.drop(columns=["container_restart_count"], inplace=True)
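# Note: the rolling feature container_restart_count_avg created above survives
# this drop; drop it here as well if target leakage is a concern for your data.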

# Prepare feature matrix and labels
X = df.drop(columns=["target"])
y = df["target"]

# Handle class imbalance with BorderlineSMOTE when the minority class is large enough
if y.value_counts().min() > 5:
    smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X, y)
else:
    X_resampled, y_resampled = X, y
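# Note on the guard: BorderlineSMOTE interpolates between each minority sample
# and its k nearest minority neighbours (k_neighbors defaults to 5 in
# imbalanced-learn), so fit_resample needs strictly more than 5 minority
# samples; below that threshold the raw, unresampled data is used.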

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)
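# Suggested tweak (not in the original): passing stratify=y_resampled here
# keeps the class ratio identical in train and test, which matters most when
# SMOTE was skipped and the target is still imbalanced.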

# Random forest, regularized to reduce overfitting
rf = RandomForestClassifier(
    n_estimators=300,       # more trees
    max_depth=10,           # cap tree depth
    min_samples_split=20,   # more samples needed per split
    min_samples_leaf=10,    # prevent small branches
    bootstrap=True,
    random_state=42
)

# Ensemble (random forest + XGBoost)
xgb = XGBClassifier(n_estimators=200, learning_rate=0.05, max_depth=7,
                    subsample=0.8, colsample_bytree=0.8, random_state=42)
rf.fit(X_train, y_train)
xgb.fit(X_train, y_train)

# Per-model predictions
y_pred_rf = rf.predict(X_test)
y_pred_xgb = xgb.predict(X_test)

# Combine by unanimous vote: with 0/1 labels, integer division by 2 yields 1
# only when both models predict failure (hard AND-voting, not soft voting)
y_pred_ensemble = (y_pred_rf + y_pred_xgb) // 2
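# True soft voting, for comparison (a sketch, not part of the original code):
# average the two models' failure probabilities and threshold at 0.5.
#   proba = (rf.predict_proba(X_test)[:, 1] + xgb.predict_proba(X_test)[:, 1]) / 2
#   y_pred_ensemble = (proba >= 0.5).astype(int)
# sklearn's VotingClassifier(estimators=[("rf", rf), ("xgb", xgb)], voting="soft")
# wraps the same idea behind a single fit/predict interface.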

# Evaluation: the train score is RF-only, while the test score uses the ensemble
train_acc = rf.score(X_train, y_train) * 100
test_acc = accuracy_score(y_test, y_pred_ensemble) * 100
print(f"\n🎯 Train Accuracy: {train_acc:.2f} %")
print(f"🎯 Test Accuracy: {test_acc:.2f} %")
print("\n🔹 Classification Report:\n", classification_report(y_test, y_pred_ensemble))

# Save the trained random forest, then reload it from the same path as a sanity check
MODEL_PATH = "../models/k8s_failure_model.pkl"
joblib.dump(rf, MODEL_PATH)
model = joblib.load(MODEL_PATH)
print("The features in the model are\n")
print(model.feature_names_in_)
print(f"\n✅ Model saved at {MODEL_PATH}")
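# Only rf is persisted; to serve the AND-vote ensemble as-is, save both
# estimators (e.g. joblib.dump({"rf": rf, "xgb": xgb}, MODEL_PATH)) and
# recombine their predictions at inference time (a suggestion, not original code).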

# Confusion matrix plot
cm = confusion_matrix(y_test, y_pred_ensemble)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=["No Failure", "Failure"], yticklabels=["No Failure", "Failure"])
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Feature importance plot: top 15 features by random forest importance
feature_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False).head(15)

plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importances, palette="viridis")
plt.title("Top 15 Important Features")
plt.show()