Skip to content

Commit e6f8bc9

Browse files
committed
better
1 parent ab8e7af commit e6f8bc9

File tree

1 file changed

+85
-141
lines changed

1 file changed

+85
-141
lines changed

app.py

Lines changed: 85 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,14 @@
2121
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
2222
import math
2323

24-
# --- Sidebar configuration with hover tooltips ---
25-
st.sidebar.title("Configuration")
26-
# --- Sidebar configuration with hover tooltips ---
24+
# --- Sidebar configuration ---
2725
st.sidebar.title("Configuration")
26+
# Fullscreen toggle for plots
27+
fullscreen = st.sidebar.checkbox(
28+
"Fullscreen plots",
29+
False,
30+
help="When enabled, plots will expand to fill the container width."
31+
)
2832
# Dataset selection
2933
dataset_name = st.sidebar.selectbox(
3034
"Select synthetic dataset:",
@@ -44,52 +48,37 @@
4448
)
4549
# Samples and features
4650
n_samples = st.sidebar.slider(
47-
"Number of samples:", 100, 2000, 500, step=100,
48-
help="Choose how many data points (rows) to generate in the dataset."
51+
"Number of samples:", 100, 2000, 500, step=100
4952
)
5053
n_features = st.sidebar.slider(
51-
"Number of features:", 2, 20, 10,
52-
help="Select the dimensionality (number of features) for the generated data."
54+
"Number of features:", 2, 20, 10
5355
)
5456
# Feature selection
5557
fs_method = st.sidebar.selectbox(
5658
"Feature selection method:",
57-
("None", "VarianceThreshold", "SelectKBest - ANOVA F-test", "SelectKBest - Mutual Information", "Tree-based importance"),
58-
help="Choose a technique to remove or select the most relevant features before training."
59+
("None", "VarianceThreshold", "SelectKBest - ANOVA F-test", "SelectKBest - Mutual Information", "Tree-based importance")
5960
)
6061
fs_k = None
6162
if fs_method.startswith("SelectKBest") or fs_method == "Tree-based importance":
62-
fs_k = st.sidebar.slider(
63-
"Number of features to select (k):", 1, n_features, min(2, n_features),
64-
help="When selecting features, choose the exact number of top features to keep."
65-
)
63+
fs_k = st.sidebar.slider("Number of features to select (k):", 1, n_features, min(2, n_features))
6664
# Feature reduction
6765
fr_method = st.sidebar.selectbox(
6866
"Feature reduction method:",
69-
("None", "PCA", "KernelPCA (RBF)", "UMAP"),
70-
help="Choose a dimensionality reduction method to project features into 2D space for visualization."
67+
("None", "PCA", "KernelPCA (RBF)", "UMAP")
7168
)
72-
fr_components = None
73-
if fr_method in ("PCA", "KernelPCA (RBF)", "UMAP"):
74-
fr_components = 2 # always reduce to 2D for plotting
69+
fr_components = 2 if fr_method in ("PCA", "KernelPCA (RBF)", "UMAP") else None
7570
# Scaling
7671
scaler_name = st.sidebar.selectbox(
7772
"Scaling method:",
78-
("None", "StandardScaler", "MinMaxScaler", "RobustScaler"),
79-
help="Apply normalization or scaling to features to improve model performance."
73+
("None", "StandardScaler", "MinMaxScaler", "RobustScaler")
8074
)
8175

82-
# --- Generate synthetic data ---
76+
# --- Generate & preprocess data ---
8377
def get_data(name):
8478
if name == "make_classification":
85-
return datasets.make_classification(
86-
n_samples=n_samples,
87-
n_features=n_features,
88-
n_informative=int(n_features/2),
89-
n_redundant=int(n_features/4),
90-
n_clusters_per_class=1,
91-
random_state=42
92-
)
79+
return datasets.make_classification(n_samples=n_samples, n_features=n_features,
80+
n_informative=n_features//2, n_redundant=n_features//4,
81+
n_clusters_per_class=1, random_state=42)
9382
elif name == "make_moons":
9483
return datasets.make_moons(n_samples=n_samples, noise=0.2, random_state=42)
9584
elif name == "make_circles":
@@ -102,150 +91,105 @@ def get_data(name):
10291
raise ValueError("Unknown dataset")
10392

10493
X, y = get_data(dataset_name)
105-
# --- Split data ---
10694
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
107-
# --- Feature Selection ---
95+
# Feature selection
10896
if fs_method == "VarianceThreshold":
109-
sel = VarianceThreshold(threshold=0.1)
110-
X_train_sel, X_test_sel = sel.fit_transform(X_train), sel.transform(X_test)
97+
sel = VarianceThreshold(0.1); X_train, X_test = sel.fit_transform(X_train), sel.transform(X_test)
11198
elif fs_method == "SelectKBest - ANOVA F-test":
112-
sel = SelectKBest(score_func=f_classif, k=fs_k)
113-
X_train_sel, X_test_sel = sel.fit_transform(X_train, y_train), sel.transform(X_test)
99+
sel = SelectKBest(f_classif, k=fs_k); X_train, X_test = sel.fit_transform(X_train,y_train), sel.transform(X_test)
114100
elif fs_method == "SelectKBest - Mutual Information":
115-
sel = SelectKBest(score_func=mutual_info_classif, k=fs_k)
116-
X_train_sel, X_test_sel = sel.fit_transform(X_train, y_train), sel.transform(X_test)
101+
sel = SelectKBest(mutual_info_classif, k=fs_k); X_train, X_test = sel.fit_transform(X_train,y_train), sel.transform(X_test)
117102
elif fs_method == "Tree-based importance":
118-
model_fs = RandomForestClassifier(random_state=42).fit(X_train, y_train)
103+
model_fs = RandomForestClassifier(random_state=42).fit(X_train,y_train)
119104
idxs = np.argsort(model_fs.feature_importances_)[-fs_k:]
120-
X_train_sel, X_test_sel = X_train[:, idxs], X_test[:, idxs]
121-
else:
122-
X_train_sel, X_test_sel = X_train, X_test
123-
# --- Feature Reduction ---
105+
X_train, X_test = X_train[:,idxs], X_test[:,idxs]
106+
# Reduction
124107
if fr_method == "PCA":
125-
reducer = PCA(n_components=fr_components)
126-
X_train_red, X_test_red = reducer.fit_transform(X_train_sel), reducer.transform(X_test_sel)
108+
reducer = PCA(n_components=2); X_train, X_test = reducer.fit_transform(X_train), reducer.transform(X_test)
127109
elif fr_method == "KernelPCA (RBF)":
128-
reducer = KernelPCA(n_components=fr_components, kernel="rbf", gamma=0.1)
129-
X_train_red, X_test_red = reducer.fit_transform(X_train_sel), reducer.transform(X_test_sel)
110+
reducer = KernelPCA(n_components=2, kernel='rbf', gamma=0.1); X_train, X_test = reducer.fit_transform(X_train), reducer.transform(X_test)
130111
elif fr_method == "UMAP":
131-
reducer = umap.UMAP(n_components=2, random_state=42)
132-
X_train_red, X_test_red = reducer.fit_transform(X_train_sel), reducer.transform(X_test_sel)
133-
else:
134-
X_train_red, X_test_red = X_train_sel, X_test_sel
135-
# --- Scaling ---
112+
reducer = umap.UMAP(n_components=2, random_state=42); X_train, X_test = reducer.fit_transform(X_train), reducer.transform(X_test)
113+
# Scaling
136114
if scaler_name == "StandardScaler":
137-
scaler = StandardScaler()
138-
X_train_pre, X_test_pre = scaler.fit_transform(X_train_red), scaler.transform(X_test_red)
115+
scaler = StandardScaler(); X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)
139116
elif scaler_name == "MinMaxScaler":
140-
scaler = MinMaxScaler()
141-
X_train_pre, X_test_pre = scaler.fit_transform(X_train_red), scaler.transform(X_test_red)
117+
scaler = MinMaxScaler(); X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)
142118
elif scaler_name == "RobustScaler":
143-
scaler = RobustScaler()
144-
X_train_pre, X_test_pre = scaler.fit_transform(X_train_red), scaler.transform(X_test_red)
145-
else:
146-
X_train_pre, X_test_pre = X_train_red, X_test_red
147-
# Ensure 2D for plotting
148-
if X_train_pre.shape[1] < 2:
149-
st.error("Need at least 2 dimensions after preprocessing for plotting.")
150-
st.stop()
151-
# --- Setup models ---
119+
scaler = RobustScaler(); X_train, X_test = scaler.fit_transform(X_train), scaler.transform(X_test)
120+
121+
# Ensure 2D
122+
if X_train.shape[1] != 2:
123+
st.error("2D data required for boundary plots."); st.stop()
124+
125+
# --- Models & evaluation ---
152126
models = {
153127
"Logistic Regression": LogisticRegression(),
154128
"Linear SVM": LinearSVC(max_iter=5000),
155-
"Kernel SVM (RBF)": SVC(),
156-
"K-Nearest Neighbors": KNeighborsClassifier(),
157-
"Decision Tree": DecisionTreeClassifier(),
129+
"Kernel SVM": SVC(),
130+
"KNN": KNeighborsClassifier(),
131+
"Tree": DecisionTreeClassifier(),
158132
"Random Forest": RandomForestClassifier(),
159133
"Extra Trees": ExtraTreesClassifier(),
160134
"AdaBoost": AdaBoostClassifier(),
161-
"Gradient Boosting": GradientBoostingClassifier(),
135+
"GradBoost": GradientBoostingClassifier(),
162136
"Bagging": BaggingClassifier(),
163-
"Gaussian NB": GaussianNB(),
137+
"GaussNB": GaussianNB(),
164138
"QDA": QuadraticDiscriminantAnalysis(),
165139
"MLP": MLPClassifier(max_iter=1000),
166140
"SGD": SGDClassifier(max_iter=1000),
167-
"Passive Aggressive": SGDClassifier(max_iter=1000, loss="hinge"),
168-
# Anomaly detection
169-
"Isolation Forest": IsolationForest(random_state=42),
170-
"One-Class SVM": OneClassSVM(gamma='auto'),
171-
"Local Outlier Factor": LocalOutlierFactor(novelty=True)
141+
"Passive Aggressive": SGDClassifier(max_iter=1000, loss='hinge'),
142+
# Anomaly
143+
"IsoForest": IsolationForest(random_state=42),
144+
"OneClassSVM": OneClassSVM(gamma='auto'),
145+
"LOF": LocalOutlierFactor(novelty=True)
172146
}
173-
# --- Evaluate models ---
174147
results = []
175-
for name, model in models.items():
176-
est = clone(model)
177-
# Fit
178-
if name == "Local Outlier Factor":
179-
est.fit(X_train_pre)
180-
y_pred_raw = est.predict(X_test_pre)
181-
else:
182-
est.fit(X_train_pre, y_train if name not in ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"] else None)
183-
y_pred_raw = est.predict(X_test_pre)
184-
# Map anomaly outputs to 0/1
185-
if name in ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"]:
186-
y_pred = (y_pred_raw > 0).astype(int)
148+
for name, clf in models.items():
149+
est = clone(clf)
150+
if name == "LOF":
151+
est.fit(X_train); y_pred_raw = est.predict(X_test)
187152
else:
188-
y_pred = y_pred_raw
189-
# Metrics
153+
fit_args = (X_train,y_train) if name not in ["IsoForest","OneClassSVM","LOF"] else (X_train,)
154+
est.fit(*fit_args); y_pred_raw = est.predict(X_test)
155+
# map anomalies
156+
y_pred = (y_pred_raw>0).astype(int) if name in ["IsoForest","OneClassSVM","LOF"] else y_pred_raw
190157
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
191-
tpr = tp/(tp+fn) if (tp+fn)>0 else 0
192-
tnr = tn/(tn+fp) if (tn+fp)>0 else 0
193-
fpr = fp/(fp+tn) if (fp+tn)>0 else 0
194-
fnr = fn/(fn+tp) if (fn+tp)>0 else 0
195-
precision = precision_score(y_test, y_pred)
196-
recall = tpr
197-
f1 = f1_score(y_test, y_pred)
198-
accuracy = accuracy_score(y_test, y_pred)
158+
tpr, tnr = tp/(tp+fn), tn/(tn+fp)
159+
fpr, fnr = fp/(fp+tn), fn/(fn+tp)
160+
precision = precision_score(y_test,y_pred)
161+
recall, f1 = tpr, f1_score(y_test,y_pred)
162+
acc = accuracy_score(y_test,y_pred)
199163
gmean = math.sqrt(tpr*tnr)
200-
results.append({
201-
"Model": name,
202-
"TP": tp, "TN": tn, "FP": fp, "FN": fn,
203-
"TPR": tpr, "TNR": tnr, "FPR": fpr, "FNR": fnr,
204-
"Accuracy": accuracy, "Precision": precision,
205-
"Recall": recall, "F1-Score": f1, "G-Mean": gmean
206-
})
207-
# Display metrics
208-
metrics_df = pd.DataFrame(results)
209-
st.subheader("Performance Metrics on Test Set")
210-
st.dataframe(metrics_df, use_container_width=True)
164+
results.append({"Model":name,"TP":tp,"TN":tn,"FP":fp,"FN":fn,
165+
"TPR":tpr,"TNR":tnr,"FPR":fpr,"FNR":fnr,
166+
"Accuracy":acc,"Precision":precision,
167+
"Recall":recall,"F1":f1,"G-Mean":gmean})
168+
# Show table
169+
st.subheader("Performance Metrics")
170+
st.dataframe(pd.DataFrame(results), use_container_width=True)
171+
211172
# Plot decision boundaries
212-
x_vis = X_train_pre[:, :2]
213-
# Create mesh grid once based on x_vis
214-
x_min, x_max = x_vis[:,0].min() - 1, x_vis[:,0].max() + 1
215-
y_min, y_max = x_vis[:,1].min() - 1, x_vis[:,1].max() + 1
216-
xx, yy = np.meshgrid(
217-
np.linspace(x_min, x_max, 200),
218-
np.linspace(y_min, y_max, 200)
219-
)
220-
for _, row in metrics_df.iterrows():
221-
name = row["Model"]
173+
x_min,x_max = X_train[:,0].min()-1, X_train[:,0].max()+1
174+
y_min,y_max = X_train[:,1].min()-1, X_train[:,1].max()+1
175+
xx,yy = np.meshgrid(np.linspace(x_min,x_max,200),np.linspace(y_min,y_max,200))
176+
for name in models:
222177
exp = st.expander(f"Decision Boundary: {name}")
223178
with exp:
224-
# use columns to restrict plot width
225-
col1, _ = st.columns([1, 2])
226-
# zoom toggle
227-
zoom = col1.checkbox("Enlarge plot", key=f"zoom_{name}")
228-
fig_w, fig_h = (6, 4) if zoom else (3, 2)
229-
# train on 2D for visualization
230-
model_vis = clone(models[name])
231-
if name == "Local Outlier Factor":
232-
model_vis.fit(x_vis)
233-
else:
234-
fit_args = (x_vis, y_train) if name not in ["Isolation Forest", "One-Class SVM"] else (x_vis, None)
235-
model_vis.fit(*fit_args)
236-
# predict on grid
237-
Z_pred = model_vis.predict(np.c_[xx.ravel(), yy.ravel()])
238-
# map anomalies
239-
if name in ["Isolation Forest", "One-Class SVM", "Local Outlier Factor"]:
240-
Z = (Z_pred > 0).astype(int).reshape(xx.shape)
179+
est = clone(models[name])
180+
if name == "LOF": est.fit(X_train)
241181
else:
242-
Z = Z_pred.reshape(xx.shape)
243-
# plot
244-
plt.figure(figsize=(fig_w, fig_h))
245-
plt.contourf(xx, yy, Z, alpha=0.3)
246-
plt.scatter(x_vis[:,0], x_vis[:,1], c=y_train, edgecolor='k', s=20)
182+
fit_args = (X_train,y_train) if name not in ["IsoForest","OneClassSVM","LOF"] else (X_train,)
183+
est.fit(*fit_args)
184+
Z = est.predict(np.c_[xx.ravel(),yy.ravel()])
185+
if name in ["IsoForest","OneClassSVM","LOF"]: Z = (Z>0).astype(int)
186+
Z = Z.reshape(xx.shape)
187+
fig_w,fig_h = (12,8) if fullscreen else (6,4)
188+
plt.figure(figsize=(fig_w,fig_h))
189+
plt.contourf(xx,yy,Z,alpha=0.3)
190+
plt.scatter(X_train[:,0],X_train[:,1],c=y_train,edgecolor='k',s=20)
247191
plt.title(name)
248192
plt.xlabel("Component 1")
249193
plt.ylabel("Component 2")
250-
col1.pyplot(plt)
194+
st.pyplot(plt, use_container_width=fullscreen)
251195

0 commit comments

Comments (0)