
Commit 642342b

Add interactive live visualization comparing sklearn.svm with PLSSVM using bokeh.
1 parent cf60875 commit 642342b

18 files changed: +1054 −0 lines

.figures/plssvm_bokeh.gif (binary, 2.61 MB)

README.md

Lines changed: 2 additions & 0 deletions

@@ -84,6 +84,8 @@ The main highlights of our SVM implementations are:
 7. Multi-GPU support for **all** kernel functions and GPU backends for `fit` as well as `predict/score` (**note**: no multi-GPU support for the stdpar backend even if run on a GPU!).
 8. Python bindings as drop-in replacement for `sklearn.SVC` and `sklearn.SVR` (some features currently not implemented).
 
+To see the full power of Support Vector Machines, have a look at our live visualization examples in
+[examples/python/interactive](examples/python/interactive/README.md).
 
 ## Getting Started

Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
# Interactive live comparison between `sklearn.svm` and PLSSVM

This directory contains a bokeh application that can be used to compare `sklearn.svm`'s and PLSSVM's classification and
regression implementations directly beside each other.
All available hyperparameters can be changed, e.g., the kernel function, the decision function shape, or the
respective kernel function parameters. Additionally, the number of classes and datapoints as well as the used dataset
can be changed on the fly.

![Example of our bokeh application comparing sklearn.svm and PLSSVM.](https://github.com/SC-SGS/PLSSVM/raw/regression/.figures/plssvm_bokeh.gif)

# Requirements

To run our interactive comparison, the following packages must be installed:

```bash
pip install numpy pandas bokeh scikit-learn plssvm
```

# Running

To start the bokeh server locally, it is sufficient to call (from this directory):

```bash
bokeh serve svm.py
```

This will output something like:

```bash
2025-02-14 17:47:49,341 Starting Bokeh server version 3.6.3 (running on Tornado 6.4.2)
2025-02-14 17:47:49,343 User authentication hooks NOT provided (default user enabled)
2025-02-14 17:47:49,346 Bokeh app running at: http://localhost:5006/svm
2025-02-14 17:47:49,346 Starting Bokeh server with process id: 184614
```

Then open the printed URL (in this example `http://localhost:5006/svm`) in a browser and enjoy our
live comparison between `sklearn.svm` and PLSSVM!
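
If the default port 5006 is already in use, or you want the app to open in a browser automatically, the standard `bokeh serve` options `--port` and `--show` can be used (check `bokeh serve --help` for your version):

```bash
bokeh serve svm.py --port 5007 --show
```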
Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
# init package functions
from .svc import create_svc_layout
Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
import pandas as pd
import sklearn.metrics

from bokeh.models import ColumnDataSource, DataTable, TableColumn, HTMLTemplateFormatter


def classification_report_as_dataframe(y_true, y_pred):
    """Compute the classification report using y_true and y_pred and convert it to a Pandas DataFrame usable in bokeh."""
    # calculate the classification report
    report_dict = sklearn.metrics.classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    # convert to DataFrame
    df = pd.DataFrame(report_dict).transpose()

    # convert numeric values and round to 3 decimal places
    numeric_cols = ["precision", "recall", "f1-score"]
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce").round(3)
    df["support"] = df["support"].astype(int)
    df.at["accuracy", "support"] = len(y_true)

    # format text for display; the accuracy row only carries a single, bold value
    df["precision"] = df["precision"].astype(str)
    df["recall"] = df["recall"].astype(str)
    df["f1-score"] = df["f1-score"].astype(str)
    df.at["accuracy", "precision"] = ""
    df.at["accuracy", "recall"] = ""
    df.at["accuracy", "f1-score"] = f"<b>{df.at['accuracy', 'f1-score']}</b>"

    df = df.reset_index().rename(columns={"index": ""})

    return df


def update_classification_report_plot(source, y_true, y_pred):
    """Update the already existing classification report table using y_true and y_pred."""
    source.data = classification_report_as_dataframe(y_true, y_pred)


def create_classification_report_plot(y_true, y_pred):
    """Create a new classification report table using y_true and y_pred."""
    # create the Pandas DataFrame representing a classification report
    df = classification_report_as_dataframe(y_true, y_pred)

    # convert the DataFrame to a ColumnDataSource
    source = ColumnDataSource(df)

    # use an HTML formatter for all columns so the bold accuracy value renders
    name_formatter = HTMLTemplateFormatter(template='<div><%= value %></div>')
    # create table columns (the class-name column's header is hidden via its empty title)
    total_width = 300
    main_column_width = 96  # based on the largest string in the column
    minor_column_width = (total_width - main_column_width) // 4
    columns = [TableColumn(field=col, title=col, width=main_column_width if col == "" else minor_column_width, formatter=name_formatter) for col in df.columns]

    # create the DataTable
    classification_table = DataTable(source=source, columns=columns, index_position=None, sizing_mode='fixed', width=total_width, height=200)

    return classification_table, source
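
A minimal usage sketch for the helpers above (hypothetical: the file's import path is not shown in this diff excerpt, so `create_classification_report_plot` and `update_classification_report_plot` are assumed to be in scope):

```python
import numpy as np
from bokeh.plotting import show

# synthetic labels for four classes with roughly 80% matching predictions
rng = np.random.default_rng(0)
y_true = rng.integers(0, 4, size=200)
y_pred = np.where(rng.random(200) < 0.8, y_true, rng.integers(0, 4, size=200))

# build the table once ...
table, source = create_classification_report_plot(y_true, y_pred)
show(table)  # renders the DataTable in a standalone HTML page

# ... and refresh only its ColumnDataSource, e.g., from inside a bokeh server callback
update_classification_report_plot(source, y_true, y_pred)
```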
Lines changed: 86 additions & 0 deletions

@@ -0,0 +1,86 @@
import numpy as np
import pandas as pd
import sklearn.metrics

from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LinearColorMapper, LinearAxis
from bokeh.palettes import Viridis256


def confusion_matrix_as_dataframe(y_true, y_pred):
    """Compute the confusion matrix using y_true and y_pred and convert it to a Pandas DataFrame usable in bokeh."""
    # calculate the confusion matrix
    confusion_matrix = sklearn.metrics.confusion_matrix(y_true, y_pred)

    # get the unique class names
    unique_classes = [str(i) for i in np.unique(np.vstack((y_true, y_pred)))]
    y_true_range = sorted(unique_classes, reverse=True)
    y_pred_range = sorted(unique_classes)

    # convert to DataFrame for easy handling
    df = pd.DataFrame(confusion_matrix, index=unique_classes, columns=unique_classes)
    # reshape to long format for bokeh: one row per matrix cell
    df = df.stack().reset_index(name='value')

    # define a threshold for the text color change
    threshold = confusion_matrix.max() * 0.5

    # white text on dark (low-value) Viridis cells, black text on bright (high-value) cells
    df["text_color"] = ["white" if val < threshold else "black" for val in df["value"]]

    # set alpha values -> per default everything is visible
    df["alpha"] = [1] * len(df["value"])

    return df, y_true_range, y_pred_range


def update_confusion_matrix_plot(fig, source, y_true, y_pred):
    """Update the already existing confusion matrix plot using y_true and y_pred."""
    # create the Pandas DataFrame representing a confusion matrix
    df, y_true_range, y_pred_range = confusion_matrix_as_dataframe(y_true, y_pred)

    # check if the number of unique classes has changed
    old_classes = fig.x_range.factors

    if set(y_true_range) != set(old_classes):
        # if the classes changed, update the axes ranges
        fig.x_range.factors = y_pred_range
        fig.y_range.factors = y_true_range

    # replace the entire data dictionary
    source.data = df


def create_confusion_matrix_plot(y_true, y_pred):
    """Create a new confusion matrix plot using y_true and y_pred."""
    # create the Pandas DataFrame representing a confusion matrix
    df, y_true_range, y_pred_range = confusion_matrix_as_dataframe(y_true, y_pred)

    # create a ColumnDataSource for dynamic updates
    source = ColumnDataSource(df)

    # create a color mapper
    mapper = LinearColorMapper(palette=Viridis256)

    # create the figure
    fig = figure(x_range=y_pred_range, y_range=y_true_range, toolbar_location=None, title="Confusion Matrix",
                 x_axis_label="y_pred", y_axis_label="y_true", x_axis_location="above",
                 sizing_mode='fixed', width=300, height=300)
    # disable grid lines (they would only be visible where the rectangles are transparent)
    fig.xgrid.grid_line_color = None
    fig.ygrid.grid_line_color = None
    # disable dragging of the plot
    fig.toolbar.active_drag = None

    # draw the rectangles; 'level_1' is the predicted class, 'level_0' the true class
    fig.rect(x="level_1", y="level_0", width=1, height=1, source=source,
             fill_color={'field': 'value', 'transform': mapper}, line_color="white", fill_alpha="alpha")

    # add the text labels
    fig.text(x="level_1", y="level_0", text="value", source=source,
             text_align="center", text_baseline="middle", text_color="text_color", text_font_size="10pt", text_alpha="alpha")

    fig.axis.major_label_text_font_size = "10pt"
    fig.axis.major_label_standoff = 1

    return fig, source
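
The glyphs above reference columns named `level_0` and `level_1`; these are the default names pandas assigns when `stack().reset_index(name='value')` flattens the unnamed matrix index. A small sketch of that reshape:

```python
import pandas as pd

# a 2x2 confusion matrix with string class labels, as built above
cm = pd.DataFrame([[5, 1], [2, 7]], index=["0", "1"], columns=["0", "1"])
long = cm.stack().reset_index(name="value")
print(long)
#   level_0 level_1  value
# 0       0       0      5
# 1       0       1      1
# 2       1       0      2
# 3       1       1      7
# level_0 holds the true class (plot y-axis), level_1 the predicted class (x-axis)
```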
Lines changed: 153 additions & 0 deletions

@@ -0,0 +1,153 @@
import numpy as np
from sklearn.datasets import make_classification, make_circles, make_moons, make_blobs, make_gaussian_quantiles


def generate_classification_dataset(dataset_type, n_samples):
    """Generate a classification dataset. Returns X (datapoints) and y (labels)."""
    n_classes = 4

    if dataset_type == "classification":
        return make_classification(n_samples=n_samples, n_features=2, n_classes=n_classes,
                                   n_clusters_per_class=1, n_redundant=0)
    elif dataset_type == "aniso":
        return make_classification(n_samples=n_samples, n_features=2, n_classes=n_classes, n_informative=2, n_redundant=0,
                                   n_clusters_per_class=1, class_sep=2)
    elif dataset_type == "blobs":
        return make_blobs(n_samples=n_samples, centers=n_classes)
    elif dataset_type == "varied_density":
        return make_blobs(n_samples=n_samples, centers=n_classes, cluster_std=np.random.choice([0.5, 1.0, 2.0, 0.1], n_classes))
    elif dataset_type == "outliers_with_clusters":
        X, y = make_blobs(n_samples=n_samples, centers=n_classes)
        X[:20] += 10  # add outliers by shifting the first 20 points
        return X, y
    elif dataset_type == "star_cluster":
        X, y = [], []
        angles = np.linspace(0, 2 * np.pi, n_classes, endpoint=False)

        for i, angle in enumerate(angles):
            r = np.random.uniform(0.5, 1.5, n_samples // n_classes)
            x1 = r * np.cos(angle) + np.random.normal(0, 0.2, size=r.shape)
            x2 = r * np.sin(angle) + np.random.normal(0, 0.2, size=r.shape)

            X.append(np.column_stack((x1, x2)))
            y.append(np.full_like(x1, i))

        X = np.vstack(X)
        y = np.concatenate(y).astype(int)
        return X, y
    elif dataset_type == "checkerboard":
        X = np.random.rand(n_samples, 2)
        if n_classes == 4:
            # label the four quadrants of the unit square separately
            y = []
            for datapoint in X:
                if datapoint[0] < 0.5 and datapoint[1] < 0.5:
                    y.append(0)
                elif datapoint[0] < 0.5 and datapoint[1] >= 0.5:
                    y.append(1)
                elif datapoint[0] >= 0.5 and datapoint[1] < 0.5:
                    y.append(2)
                else:
                    y.append(3)
            y = np.asarray(y)
        else:
            y = ((np.floor(X[:, 0] * 2) + np.floor(X[:, 1] * 2)) % n_classes).astype(int)
        return X, y
    elif dataset_type == "concentric_rings":
        radii = np.linspace(0.5, 2.0, n_classes)  # one ring radius per class
        X, y = [], []

        for i, r in enumerate(radii):
            theta = np.linspace(0, 2 * np.pi, n_samples // n_classes)
            x1 = r * np.cos(theta) + np.random.normal(0, 0.1, size=theta.shape)
            x2 = r * np.sin(theta) + np.random.normal(0, 0.1, size=theta.shape)
            X.append(np.column_stack((x1, x2)))
            y.append(np.full_like(x1, i))

        X = np.vstack(X)
        y = np.concatenate(y).astype(int)
        return X, y
    elif dataset_type == "ball":
        return make_gaussian_quantiles(n_samples=n_samples, n_features=2, n_classes=n_classes)
    elif dataset_type == "moons":
        # two shifted copies of the two-class moons dataset yield four classes;
        # each copy contributes half of the requested samples
        X_1, y_1 = make_moons(n_samples=n_samples // 2, noise=0.1, random_state=42)
        X_2, y_2 = make_moons(n_samples=n_samples // 2, noise=0.1, random_state=42)
        for idx, datapoint in enumerate(X_2):
            if y_2[idx] == 0:
                datapoint[1] += 1
            else:
                datapoint[1] -= 1
        y_2 += 2

        X = np.concatenate((X_1, X_2), axis=0)
        y = np.concatenate((y_1, y_2), axis=0)
        return X, y
    elif dataset_type == "wavy_clusters":
        X, y = [], []
        x1 = np.linspace(-1, 1, n_samples // n_classes)

        for i in range(n_classes):
            x2 = np.sin(5 * np.pi * x1) + np.random.normal(0, 0.1, size=x1.shape) + 2 * i
            X.append(np.column_stack((x1, x2)))
            y.append(np.full_like(x1, i))

        X = np.vstack(X)
        y = np.concatenate(y).astype(int)
        return X, y
    elif dataset_type == "s_curves":
        x1 = np.linspace(-1, 1, n_samples // n_classes)
        X, y = [], []

        for i in range(n_classes):
            x2 = np.sin(2 * np.pi * x1) + np.random.normal(0, 0.1, size=x1.shape) + i
            X.append(np.column_stack((x1, x2)))
            y.append(np.full_like(x1, i))

        X = np.vstack(X)
        y = np.concatenate(y).astype(int)
        return X, y
    elif dataset_type == "spiral":
        theta = np.linspace(0, 4 * np.pi, n_samples)
        r = np.linspace(0, 1, n_samples)
        X = np.column_stack([r * np.sin(theta), r * np.cos(theta)])
        # split the single spiral into classes along its radius
        y = np.zeros(n_samples, dtype=int)
        if n_classes == 2:
            y[r > 0.5] = 1
        elif n_classes == 3:
            y[r > 0.33] = 1
            y[r > 0.66] = 2
        elif n_classes == 4:
            y[r > 0.25] = 1
            y[r > 0.5] = 2
            y[r > 0.75] = 3
        return X, y
    elif dataset_type == "multiple_spirals":
        n_samples_per_class = n_samples // n_classes
        X, y = [], []

        centers = [(i * 1, i * 1) for i in range(n_classes)]  # different starting centers

        for i, (cx, cy) in enumerate(centers):
            t = np.linspace(0, 2 * np.pi, n_samples_per_class)  # spiral shape
            x = cx + t * np.cos(t) + 0.1 * np.random.randn(n_samples_per_class)
            y_coord = cy + t * np.sin(t) + 0.1 * np.random.randn(n_samples_per_class)
            X.append(np.column_stack((x, y_coord)))
            y.append(np.full(n_samples_per_class, i))

        X = np.vstack(X)
        y = np.hstack(y).astype(int)
        return X, y
    elif dataset_type == "multiarm_spiral":
        n_samples_per_class = n_samples // n_classes
        X, y = [], []

        for i in range(n_classes):
            t = np.linspace(0, 3 * 2 * np.pi, n_samples_per_class)  # three full spiral turns
            angle_offset = (i / n_classes) * (2 * np.pi)  # offset each spiral arm
            x = (t + 1) * np.cos(t + angle_offset) + 0.1 * np.random.randn(n_samples_per_class)
            y_coord = (t + 1) * np.sin(t + angle_offset) + 0.1 * np.random.randn(n_samples_per_class)
            X.append(np.column_stack((x, y_coord)))
            y.append(np.full(n_samples_per_class, i))

        X = np.vstack(X)
        y = np.hstack(y).astype(int)
        return X, y
    else:
        # fail loudly instead of silently returning None
        raise ValueError(f"unknown dataset_type: {dataset_type}")
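
A quick way to eyeball any of the generators above is to scatter-plot one dataset with one color per class (a hedged sketch; `generate_classification_dataset` is assumed importable from the file above):

```python
import numpy as np
from bokeh.plotting import figure, show
from bokeh.palettes import Category10

X, y = generate_classification_dataset("multiarm_spiral", n_samples=400)

fig = figure(title="multiarm_spiral", width=400, height=400)
for cls in np.unique(y):
    mask = y == cls
    fig.scatter(X[mask, 0], X[mask, 1], color=Category10[10][int(cls)], legend_label=str(cls))
show(fig)
```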
