2121from sklearn .metrics import confusion_matrix , precision_score , recall_score , f1_score , accuracy_score
2222import math
2323
# --- Sidebar configuration ---
st.sidebar.title("Configuration")

# Fullscreen toggle: controls figure size and container-width behaviour of
# every decision-boundary plot rendered further down.
fullscreen = st.sidebar.checkbox(
    "Fullscreen plots",
    value=False,
    help="When enabled, plots will expand to fill the container width.",
)
2832# Dataset selection
2933dataset_name = st .sidebar .selectbox (
3034 "Select synthetic dataset:" ,
4448 )
# Samples and features
n_samples = st.sidebar.slider("Number of samples:", 100, 2000, 500, step=100)
n_features = st.sidebar.slider("Number of features:", 2, 20, 10)

# Feature selection
fs_method = st.sidebar.selectbox(
    "Feature selection method:",
    (
        "None",
        "VarianceThreshold",
        "SelectKBest - ANOVA F-test",
        "SelectKBest - Mutual Information",
        "Tree-based importance",
    ),
)

# k only applies to the methods that keep a fixed number of top features.
fs_k = None
if fs_method.startswith("SelectKBest") or fs_method == "Tree-based importance":
    fs_k = st.sidebar.slider(
        "Number of features to select (k):", 1, n_features, min(2, n_features)
    )

# Feature reduction — every non-"None" choice projects to 2D for plotting.
fr_method = st.sidebar.selectbox(
    "Feature reduction method:",
    ("None", "PCA", "KernelPCA (RBF)", "UMAP"),
)
fr_components = 2 if fr_method != "None" else None

# Scaling
scaler_name = st.sidebar.selectbox(
    "Scaling method:",
    ("None", "StandardScaler", "MinMaxScaler", "RobustScaler"),
)
8175
82- # --- Generate synthetic data ---
76+ # --- Generate & preprocess data ---
8377def get_data (name ):
8478 if name == "make_classification" :
85- return datasets .make_classification (
86- n_samples = n_samples ,
87- n_features = n_features ,
88- n_informative = int (n_features / 2 ),
89- n_redundant = int (n_features / 4 ),
90- n_clusters_per_class = 1 ,
91- random_state = 42
92- )
79+ return datasets .make_classification (n_samples = n_samples , n_features = n_features ,
80+ n_informative = n_features // 2 , n_redundant = n_features // 4 ,
81+ n_clusters_per_class = 1 , random_state = 42 )
9382 elif name == "make_moons" :
9483 return datasets .make_moons (n_samples = n_samples , noise = 0.2 , random_state = 42 )
9584 elif name == "make_circles" :
@@ -102,150 +91,105 @@ def get_data(name):
10291 raise ValueError ("Unknown dataset" )
10392
# --- Split & preprocess data ---
X, y = get_data(dataset_name)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Feature selection: fit on the training split only, then apply to test.
if fs_method == "VarianceThreshold":
    sel = VarianceThreshold(0.1)
    X_train = sel.fit_transform(X_train)
    X_test = sel.transform(X_test)
elif fs_method == "SelectKBest - ANOVA F-test":
    sel = SelectKBest(f_classif, k=fs_k)
    X_train = sel.fit_transform(X_train, y_train)
    X_test = sel.transform(X_test)
elif fs_method == "SelectKBest - Mutual Information":
    sel = SelectKBest(mutual_info_classif, k=fs_k)
    X_train = sel.fit_transform(X_train, y_train)
    X_test = sel.transform(X_test)
elif fs_method == "Tree-based importance":
    # Rank features by forest importance and keep the top fs_k columns.
    model_fs = RandomForestClassifier(random_state=42).fit(X_train, y_train)
    idxs = np.argsort(model_fs.feature_importances_)[-fs_k:]
    X_train = X_train[:, idxs]
    X_test = X_test[:, idxs]

# Reduction to 2D for visualization.
if fr_method == "PCA":
    reducer = PCA(n_components=2)
elif fr_method == "KernelPCA (RBF)":
    reducer = KernelPCA(n_components=2, kernel="rbf", gamma=0.1)
elif fr_method == "UMAP":
    reducer = umap.UMAP(n_components=2, random_state=42)
else:
    reducer = None
if reducer is not None:
    X_train = reducer.fit_transform(X_train)
    X_test = reducer.transform(X_test)

# Scaling (fit on the training split only, to avoid test-set leakage).
if scaler_name == "StandardScaler":
    scaler = StandardScaler()
elif scaler_name == "MinMaxScaler":
    scaler = MinMaxScaler()
elif scaler_name == "RobustScaler":
    scaler = RobustScaler()
else:
    scaler = None
if scaler is not None:
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

# The decision-boundary plots below require exactly two dimensions.
if X_train.shape[1] != 2:
    st.error("2D data required for boundary plots.")
    st.stop()
# --- Models & evaluation ---
# NOTE: the keys must exactly match the anomaly-detector name checks used
# below ("IsoForest", "OneClassSVM", "LOF"); keys with stray trailing
# whitespace would silently fail those checks, so none carry any.
models = {
    "Logistic Regression": LogisticRegression(),
    "Linear SVM": LinearSVC(max_iter=5000),
    "Kernel SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "GradBoost": GradientBoostingClassifier(),
    "Bagging": BaggingClassifier(),
    "GaussNB": GaussianNB(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "MLP": MLPClassifier(max_iter=1000),
    "SGD": SGDClassifier(max_iter=1000),
    # "hinge" (no leading space) is the valid sklearn loss identifier;
    # ' hinge' raises an InvalidParameterError at fit time.
    "Passive Aggressive": SGDClassifier(max_iter=1000, loss="hinge"),
    # Anomaly detectors (unsupervised; their {-1,+1} output is mapped
    # to {0,1} class predictions by the evaluation loop below).
    "IsoForest": IsolationForest(random_state=42),
    "OneClassSVM": OneClassSVM(gamma="auto"),
    "LOF": LocalOutlierFactor(novelty=True),
}
# --- Evaluate every model on the held-out test set ---
results = []
_ANOMALY = ("IsoForest", "OneClassSVM", "LOF")
for name, clf in models.items():
    est = clone(clf)
    # Anomaly detectors are unsupervised: fit without labels.
    if name in _ANOMALY:
        est.fit(X_train)
    else:
        est.fit(X_train, y_train)
    y_pred_raw = est.predict(X_test)
    # Anomaly detectors return {-1, +1}; map inliers (+1) to class 1,
    # outliers (-1) to class 0 so the binary metrics below apply.
    y_pred = (y_pred_raw > 0).astype(int) if name in _ANOMALY else y_pred_raw
    # labels=[0, 1] guarantees a 2x2 matrix even when predictions (or the
    # test split) contain only one class, so the 4-way unpack cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    # Guard every rate against division by zero (empty class).
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0.0
    # zero_division=0 keeps undefined precision/F1 at 0 without warnings.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = tpr
    f1 = f1_score(y_test, y_pred, zero_division=0)
    acc = accuracy_score(y_test, y_pred)
    gmean = math.sqrt(tpr * tnr)  # geometric mean of class-wise accuracies
    results.append({
        "Model": name, "TP": tp, "TN": tn, "FP": fp, "FN": fn,
        "TPR": tpr, "TNR": tnr, "FPR": fpr, "FNR": fnr,
        "Accuracy": acc, "Precision": precision,
        "Recall": recall, "F1": f1, "G-Mean": gmean,
    })

# Show table
st.subheader("Performance Metrics")
st.dataframe(pd.DataFrame(results), use_container_width=True)
# --- Plot decision boundaries ---
# One shared 200x200 mesh over the 2D training data, padded by 1 unit.
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))
grid = np.c_[xx.ravel(), yy.ravel()]
anomaly_names = ("IsoForest", "OneClassSVM", "LOF")
for name in models:
    with st.expander(f"Decision Boundary: {name}"):
        est = clone(models[name])
        if name in anomaly_names:
            est.fit(X_train)  # unsupervised fit, no labels
        else:
            est.fit(X_train, y_train)
        Z = est.predict(grid)
        if name in anomaly_names:
            # Map {-1, +1} anomaly output to {0, 1} classes.
            Z = (Z > 0).astype(int)
        Z = Z.reshape(xx.shape)
        fig_w, fig_h = (12, 8) if fullscreen else (6, 4)
        # Use an explicit Figure and close it after rendering: creating
        # one pyplot global figure per model and never closing it leaks
        # matplotlib state on every Streamlit rerun, and st.pyplot(plt)
        # (passing the module) is deprecated in favour of passing a figure.
        fig, ax = plt.subplots(figsize=(fig_w, fig_h))
        ax.contourf(xx, yy, Z, alpha=0.3)
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, edgecolor="k", s=20)
        ax.set_title(name)
        ax.set_xlabel("Component 1")
        ax.set_ylabel("Component 2")
        st.pyplot(fig, use_container_width=fullscreen)
        plt.close(fig)
0 commit comments