Skip to content

Commit e6f8bc9

Browse files
committed
better
1 parent ab8e7af commit e6f8bc9

File tree

1 file changed

+85
-141
lines changed

1 file changed

+85
-141
lines changed

app.py

Lines changed: 85 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,14 @@
2121
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
2222
import math
2323

24-
# --- Sidebar configuration with hover tooltips ---
25-
st.sidebar.title("Configuration")
26-
# --- Sidebar configuration with hover tooltips ---
24+
# --- Sidebar configuration ---
2725
st.sidebar.title("Configuration")
26+
# Fullscreen toggle for plots
27+
fullscreen = st.sidebar.checkbox(
28+
"Fullscreen plots",
29+
False,
30+
help="When enabled, plots will expand to fill the container width."
31+
)
2832
# Dataset selection
2933
dataset_name = st.sidebar.selectbox(
3034
"Select synthetic dataset:",
@@ -44,52 +48,37 @@
4448
)
4549
# Samples and features
4650
n_samples = st.sidebar.slider(
47-
"Number of samples:", 100, 2000, 500, step=100,
48-
help="Choose how many data points (rows) to generate in the dataset."
51+
"Number of samples:", 100, 2000, 500, step=100
4952
)
5053
n_features = st.sidebar.slider(
51-
"Number of features:", 2, 20, 10,
52-
help="Select the dimensionality (number of features) for the generated data."
54+
"Number of features:", 2, 20, 10
5355
)
5456
# Feature selection
5557
fs_method = st.sidebar.selectbox(
5658
"Feature selection method:",
57-
("None", "VarianceThreshold", "SelectKBest - ANOVA F-test", "SelectKBest - Mutual Information", "Tree-based importance"),
58-
help="Choose a technique to remove or select the most relevant features before training."
59+
("None", "VarianceThreshold", "SelectKBest - ANOVA F-test", "SelectKBest - Mutual Information", "Tree-based importance")
5960
)
6061
fs_k = None
6162
if fs_method.startswith("SelectKBest") or fs_method == "Tree-based importance":
62-
fs_k = st.sidebar.slider(
63-
"Number of features to select (k):", 1, n_features, min(2, n_features),
64-
help="When selecting features, choose the exact number of top features to keep."
65-
)
63+
fs_k = st.sidebar.slider("Number of features to select (k):", 1, n_features, min(2, n_features))
6664
# Feature reduction
6765
fr_method = st.sidebar.selectbox(
6866
"Feature reduction method:",
69-
("None", "PCA", "KernelPCA (RBF)", "UMAP"),
70-
help="Choose a dimensionality reduction method to project features into 2D space for visualization."
67+
("None", "PCA", "KernelPCA (RBF)", "UMAP")
7168
)
72-
fr_components = None
73-
if fr_method in ("PCA", "KernelPCA (RBF)", "UMAP"):
74-
fr_components = 2 # always reduce to 2D for plotting
69+
fr_components = 2 if fr_method in ("PCA", "KernelPCA (RBF)", "UMAP") else None
7570
# Scaling
7671
scaler_name = st.sidebar.selectbox(
7772
"Scaling method:",
78-
("None", "StandardScaler", "MinMaxScaler", "RobustScaler"),
79-
help="Apply normalization or scaling to features to improve model performance."
73+
("None", "StandardScaler", "MinMaxScaler", "RobustScaler")
8074
)
8175

82-
# --- Generate synthetic data ---
76+
# --- Generate & preprocess data ---
8377
def get_data(name):
8478
if name == "make_classification":
85-
return datasets.make_classification(
86-
n_samples=n_samples,
87-
n_features=n_features,
88-
n_informative=int(n_features/2),
89-
n_redundant=int(n_features/4),
90-
n_clusters_per_class=1,
91-
random_state=42
92-
)
79+
return datasets.make_classification(n_samples=n_samples, n_features=n_features,
80+
n_informative=n_features//2, n_redundant=n_features//4,
81+
n_clusters_per_class=1, random_state=42)
9382
elif name == "make_moons":
9483
return datasets.make_moons(n_samples=n_samples, noise=0.2, random_state=42)
9584
elif name == "make_circles":
@@ -102,150 +91,105 @@ def get_data(name):
10291
raise ValueError("Unknown dataset")
10392

10493
X, y = get_data(dataset_name)
105-
# --- Split data ---
10694
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
107-
# --- Feature Selection ---
95+
# Feature selection
10896
if fs_method == "VarianceThreshold":
109-
sel = VarianceThreshold(threshold=0.1)
110-
X_train_sel, X_test_sel = sel.fit_transform(X_train), sel.transform(X_test)
97+
sel = VarianceThreshold(0.1); X_train, X_test = sel.fit_transform(X_train), sel.transform(X_test)
11198
elif fs_method == "SelectKBest - ANOVA F-test":
112-
sel = SelectKBest(score_func=f_classif, k=fs_k)
113-
X_train_sel, X_test_sel = sel.fit_transform(X_train, y_train), sel.transform(X_test)
99+
sel = SelectKBest(f_classif, k=fs_k); X_train, X_test = sel.fit_transform(X_train,y_train), sel.transform(X_test)
114100
elif fs_method == "SelectKBest - Mutual Information":
115-
sel = SelectKBest(score_func=mutual_info_classif, k=fs_k)
116-
X_train_sel, X_test_sel = sel.fit_transform(X_train, y_train), sel.transform(X_test)
101+
sel = SelectKBest(mutual_info_classif, k=fs_k); X_train, X_test = sel.fit_transform(X_train,y_train), sel.transform(X_test)
117102
elif fs_method == "Tree-based importance":
118-
model_fs = RandomForestClassifier(random_state=42).fit(X_train, y_train)
103+
model_fs = RandomForestClassifier(random_state=42).fit(X_train,y_train)
119104
idxs = np.argsort(model_fs.feature_importances_)[-fs_k:]
120-
X_train_sel, X_test_sel = X_train[:, idxs], X_test[:, idxs]
121-
else:
122-
X_train_sel, X_test_sel = X_train, X_test
123-
# --- Feature Reduction ---
105+
X_train, X_test = X_train[:,idxs], X_test[:,idxs]
106+
# Reduction
124107
if fr_method == "PCA":
125-
reducer = PCA(n_components=fr_components)
126-
X_train_red, X_test_red = reducer.fit_transform(X_train_sel), reducer.transform(X_test_sel)
108+
reducer = PCA(n_components=2); X_train, X_test = reducer.fit_transform(X_train), reducer.transform(X_test)
127109
elif fr_method == "KernelPCA (RBF)":
128-
reducer = KernelPCA(n_components=fr_components, kernel="rbf", gamma=0.1)
129-
X_train_red, X_test_red = reducer.fit_transform(X_train_sel), reducer.transform(X_test_sel)
110+
reducer = KernelPCA(n_components=2, kernel='rbf', gamma=0.1); X_train, X_test = reducer.fit_transform(X_train), reducer.transform(X_test)
130111
elif fr_method == "UMAP":
131-
reducer = umap.UMAP(n_components=2, random_state=42)
132-
X_train_red, X_test_red = reducer.fit_transform(X_train_sel), reducer.transform(X_test_sel)
133-
else:
134-
X_train_red, X_test_red = X_train_sel, X_test_sel
135-
# --- Scaling ---
112+
reducer = umap.UMAP(n_components=2, random_state=42); X_train, X_test = reducer.fit_transform(X_train), reducer.transform(X_test)
113+
# Scaling
136114
if scaler_name == "StandardScaler":
137-
scaler = StandardScaler()
138-
X_train_pre, X_test_pre = scaler.fit_transform(X_train_red), scaler.transform(X_test_red)
115+
scaler = StandardScaler(); X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)
139116
elif scaler_name == "MinMaxScaler":
140-
scaler = MinMaxScaler()
141-
X_train_pre, X_test_pre = scaler.fit_transform(X_train_red), scaler.transform(X_test_red)
117+
scaler = MinMaxScaler(); X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)
142118
elif scaler_name == "RobustScaler":
143-
scaler = RobustScaler()
144-
X_train_pre, X_test_pre = scaler.fit_transform(X_train_red), scaler.transform(X_test_red)
145-
else:
146-
X_train_pre, X_test_pre = X_train_red, X_test_red
147-
# Ensure 2D for plotting
148-
if X_train_pre.shape[1] < 2:
149-
st.error("Need at least 2 dimensions after preprocessing for plotting.")
150-
st.stop()
151-
# --- Setup models ---
119+
scaler = RobustScaler(); X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)
120+
121+
# Ensure 2D
122+
if X_train.shape[1] != 2:
123+
st.error("2D data required for boundary plots."); st.stop()
124+
125+
# --- Models & evaluation ---
152126
models = {
153127
"Logistic Regression": LogisticRegression(),
154128
"Linear SVM": LinearSVC(max_iter=5000),
155-
"Kernel SVM (RBF)": SVC(),
156-
"K-Nearest Neighbors": KNeighborsClassifier(),
157-
"Decision Tree": DecisionTreeClassifier(),
129+
"Kernel SVM": SVC(),
130+
"KNN": KNeighborsClassifier(),
131+
"Tree": DecisionTreeClassifier(),
158132
"Random Forest": RandomForestClassifier(),
159133
"Extra Trees": ExtraTreesClassifier(),
160134
"AdaBoost": AdaBoostClassifier(),
161-
"Gradient Boosting": GradientBoostingClassifier(),
135+
"GradBoost": GradientBoostingClassifier(),
162136
"Bagging": BaggingClassifier(),
163-
"Gaussian NB": GaussianNB(),
137+
"GaussNB": GaussianNB(),
164138
"QDA": QuadraticDiscriminantAnalysis(),
165139
"MLP": MLPClassifier(max_iter=1000),
166140
"SGD": SGDClassifier(max_iter=1000),
167-
"Passive Aggressive": SGDClassifier(max_iter=1000, loss="hinge"),
168-
# Anomaly detection
169-
"Isolation Forest": IsolationForest(random_state=42),
170-
"One-Class SVM": OneClassSVM(gamma='auto'),
171-
"Local Outlier Factor": LocalOutlierFactor(novelty=True)
141+
"Passive Aggressive": SGDClassifier(max_iter=1000, loss='hinge'),
142+
# Anomaly
143+
"IsoForest": IsolationForest(random_state=42),
144+
"OneClassSVM": OneClassSVM(gamma='auto'),
145+
"LOF": LocalOutlierFactor(novelty=True)
172146
}
173-
# --- Evaluate models ---
174147
results = []
175-
for name, model in models.items():
176-
est = clone(model)
177-
# Fit
178-
if name == "Local Outlier Factor":
179-
est.fit(X_train_pre)
180-
y_pred_raw = est.predict(X_test_pre)
181-
else:
182-
est.fit(X_train_pre, y_train if name not in ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"] else None)
183-
y_pred_raw = est.predict(X_test_pre)
184-
# Map anomaly outputs to 0/1
185-
if name in ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"]:
186-
y_pred = (y_pred_raw > 0).astype(int)
148+
for name, clf in models.items():
149+
est = clone(clf)
150+
if name == "LOF":
151+
est.fit(X_train); y_pred_raw = est.predict(X_test)
187152
else:
188-
y_pred = y_pred_raw
189-
# Metrics
153+
fit_args = (X_train,y_train) if name not in ["IsoForest","OneClassSVM","LOF"] else (X_train,)
154+
est.fit(*fit_args); y_pred_raw = est.predict(X_test)
155+
# map anomalies
156+
y_pred = (y_pred_raw>0).astype(int) if name in ["IsoForest","OneClassSVM","LOF"] else y_pred_raw
190157
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
191-
tpr = tp/(tp+fn) if (tp+fn)>0 else 0
192-
tnr = tn/(tn+fp) if (tn+fp)>0 else 0
193-
fpr = fp/(fp+tn) if (fp+tn)>0 else 0
194-
fnr = fn/(fn+tp) if (fn+tp)>0 else 0
195-
precision = precision_score(y_test, y_pred)
196-
recall = tpr
197-
f1 = f1_score(y_test, y_pred)
198-
accuracy = accuracy_score(y_test, y_pred)
158+
tpr, tnr = tp/(tp+fn), tn/(tn+fp)
159+
fpr, fnr = fp/(fp+tn), fn/(fn+tp)
160+
precision = precision_score(y_test,y_pred)
161+
recall, f1 = tpr, f1_score(y_test,y_pred)
162+
acc = accuracy_score(y_test,y_pred)
199163
gmean = math.sqrt(tpr*tnr)
200-
results.append({
201-
"Model": name,
202-
"TP": tp, "TN": tn, "FP": fp, "FN": fn,
203-
"TPR": tpr, "TNR": tnr, "FPR": fpr, "FNR": fnr,
204-
"Accuracy": accuracy, "Precision": precision,
205-
"Recall": recall, "F1-Score": f1, "G-Mean": gmean
206-
})
207-
# Display metrics
208-
metrics_df = pd.DataFrame(results)
209-
st.subheader("Performance Metrics on Test Set")
210-
st.dataframe(metrics_df, use_container_width=True)
164+
results.append({"Model":name,"TP":tp,"TN":tn,"FP":fp,"FN":fn,
165+
"TPR":tpr,"TNR":tnr,"FPR":fpr,"FNR":fnr,
166+
"Accuracy":acc,"Precision":precision,
167+
"Recall":recall,"F1":f1,"G-Mean":gmean})
168+
# Show table
169+
st.subheader("Performance Metrics")
170+
st.dataframe(pd.DataFrame(results), use_container_width=True)
171+
211172
# Plot decision boundaries
212-
x_vis = X_train_pre[:, :2]
213-
# Create mesh grid once based on x_vis
214-
x_min, x_max = x_vis[:,0].min() - 1, x_vis[:,0].max() + 1
215-
y_min, y_max = x_vis[:,1].min() - 1, x_vis[:,1].max() + 1
216-
xx, yy = np.meshgrid(
217-
np.linspace(x_min, x_max, 200),
218-
np.linspace(y_min, y_max, 200)
219-
)
220-
for _, row in metrics_df.iterrows():
221-
name = row["Model"]
173+
x_min,x_max = X_train[:,0].min()-1, X_train[:,0].max()+1
174+
y_min,y_max = X_train[:,1].min()-1, X_train[:,1].max()+1
175+
xx,yy = np.meshgrid(np.linspace(x_min,x_max,200),np.linspace(y_min,y_max,200))
176+
for name in models:
222177
exp = st.expander(f"Decision Boundary: {name}")
223178
with exp:
224-
# use columns to restrict plot width
225-
col1, _ = st.columns([1, 2])
226-
# zoom toggle
227-
zoom = col1.checkbox("Enlarge plot", key=f"zoom_{name}")
228-
fig_w, fig_h = (6, 4) if zoom else (3, 2)
229-
# train on 2D for visualization
230-
model_vis = clone(models[name])
231-
if name == "Local Outlier Factor":
232-
model_vis.fit(x_vis)
233-
else:
234-
fit_args = (x_vis, y_train) if name not in ["Isolation Forest", "One-Class SVM"] else (x_vis, None)
235-
model_vis.fit(*fit_args)
236-
# predict on grid
237-
Z_pred = model_vis.predict(np.c_[xx.ravel(), yy.ravel()])
238-
# map anomalies
239-
if name in ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"]:
240-
Z = (Z_pred > 0).astype(int).reshape(xx.shape)
179+
est = clone(models[name])
180+
if name == "LOF": est.fit(X_train)
241181
else:
242-
Z = Z_pred.reshape(xx.shape)
243-
# plot
244-
plt.figure(figsize=(fig_w, fig_h))
245-
plt.contourf(xx, yy, Z, alpha=0.3)
246-
plt.scatter(x_vis[:,0], x_vis[:,1], c=y_train, edgecolor='k', s=20)
182+
fit_args = (X_train,y_train) if name not in ["IsoForest","OneClassSVM","LOF"] else (X_train,)
183+
est.fit(*fit_args)
184+
Z = est.predict(np.c_[xx.ravel(),yy.ravel()])
185+
if name in ["IsoForest","OneClassSVM","LOF"]: Z = (Z>0).astype(int)
186+
Z = Z.reshape(xx.shape)
187+
fig_w,fig_h = (12,8) if fullscreen else (6,4)
188+
plt.figure(figsize=(fig_w,fig_h))
189+
plt.contourf(xx,yy,Z,alpha=0.3)
190+
plt.scatter(X_train[:,0],X_train[:,1],c=y_train,edgecolor='k',s=20)
247191
plt.title(name)
248192
plt.xlabel("Component 1")
249193
plt.ylabel("Component 2")
250-
col1.pyplot(plt)
194+
st.pyplot(plt, use_container_width=fullscreen)
251195

0 commit comments

Comments (0)