Skip to content

Commit 26d5782

Browse files
author
Jaime Céspedes Sisniega
authored
Merge pull request #263 from IFCA/fix-readme-examples
Fix README.md examples
2 parents 49052ce + f6121e8 commit 26d5782

File tree

1 file changed

+76
-35
lines changed

1 file changed

+76
-35
lines changed

README.md

Lines changed: 76 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -65,22 +65,23 @@ Frouros is a Python library for drift detection in machine learning systems that
6565

6666
### Concept drift
6767

68-
As a quick example, we can use the wine dataset to which concept drift it is induced in order to show the use of a concept drift detector like DDM (Drift Detection Method).
68+
As a quick example, we can use the breast cancer dataset, in which concept drift is induced, to show the use of a concept drift detector like DDM (Drift Detection Method). We can also see how concept drift affects performance in terms of accuracy.
6969

7070
```python
7171
import numpy as np
72-
from sklearn.datasets import load_wine
72+
from sklearn.datasets import load_breast_cancer
7373
from sklearn.linear_model import LogisticRegression
7474
from sklearn.model_selection import train_test_split
7575
from sklearn.pipeline import Pipeline
7676
from sklearn.preprocessing import StandardScaler
7777

7878
from frouros.detectors.concept_drift import DDM, DDMConfig
79+
from frouros.metrics import PrequentialError
7980

8081
np.random.seed(seed=31)
8182

82-
# Load wine dataset
83-
X, y = load_wine(return_X_y=True)
83+
# Load breast cancer dataset
84+
X, y = load_breast_cancer(return_X_y=True)
8485

8586
# Split train (70%) and test (30%)
8687
(
@@ -90,14 +91,6 @@ X, y = load_wine(return_X_y=True)
9091
y_test,
9192
) = train_test_split(X, y, train_size=0.7, random_state=31)
9293

93-
# IMPORTANT: Induce/simulate concept drift in the last part (20%)
94-
# of y_test by modifying some labels (50% approx). Therefore, changing P(y|X))
95-
drift_size = int(y_test.shape[0] * 0.2)
96-
y_test_drift = y_test[-drift_size:]
97-
modify_idx = np.random.rand(*y_test_drift.shape) <= 0.5
98-
y_test_drift[modify_idx] = (y_test_drift[modify_idx] + 1) % len(np.unique(y_test))
99-
y_test[-drift_size:] = y_test_drift
100-
10194
# Define and fit model
10295
pipeline = Pipeline(
10396
[
@@ -108,29 +101,74 @@ pipeline = Pipeline(
108101
pipeline.fit(X=X_train, y=y_train)
109102

110103
# Detector configuration and instantiation
111-
config = DDMConfig(warning_level=2.0,
112-
drift_level=3.0,
113-
min_num_instances=30,)
104+
config = DDMConfig(
105+
warning_level=2.0,
106+
drift_level=3.0,
107+
min_num_instances=25, # minimum number of instances before checking for concept drift
108+
)
114109
detector = DDM(config=config)
115110

116-
# Simulate data stream (assuming test label available after prediction)
117-
for i, (X, y) in enumerate(zip(X_test, y_test)):
118-
y_pred = pipeline.predict(X.reshape(1, -1))
119-
error = 1 - int(y_pred == y)
120-
detector.update(value=error)
121-
status = detector.status
122-
if status["drift"]:
123-
print(f"Drift detected at index {i}")
124-
break
125-
126-
>> Drift detected at index 44
111+
# Metric to compute accuracy
112+
metric = PrequentialError(alpha=1.0) # alpha=1.0 is equivalent to normal accuracy
113+
114+
def stream_test(X_test, y_test, y, metric, detector):
115+
"""Simulate data stream over X_test and y_test. y is the true label."""
116+
drift_flag = False
117+
for i, (X, y) in enumerate(zip(X_test, y_test)):
118+
y_pred = pipeline.predict(X.reshape(1, -1))
119+
error = 1 - (y_pred.item() == y.item())
120+
metric_error = metric(error_value=error)
121+
_ = detector.update(value=error)
122+
status = detector.status
123+
if status["drift"] and not drift_flag:
124+
drift_flag = True
125+
print(f"Concept drift detected at step {i}. Accuracy: {1 - metric_error:.4f}")
126+
if not drift_flag:
127+
print("No concept drift detected")
128+
print(f"Final accuracy: {1 - metric_error:.4f}\n")
129+
130+
# Simulate data stream (assuming test label available after each prediction)
131+
# No concept drift is expected to occur
132+
stream_test(
133+
X_test=X_test,
134+
y_test=y_test,
135+
y=y,
136+
metric=metric,
137+
detector=detector,
138+
)
139+
# >> No concept drift detected
140+
# >> Final accuracy: 0.9766
141+
142+
# IMPORTANT: Induce/simulate concept drift in the last part (20%)
143+
# of y_test by modifying some labels (50% approx.), thereby changing P(y|X).
144+
drift_size = int(y_test.shape[0] * 0.2)
145+
y_test_drift = y_test[-drift_size:]
146+
modify_idx = np.random.rand(*y_test_drift.shape) <= 0.5
147+
y_test_drift[modify_idx] = (y_test_drift[modify_idx] + 1) % len(np.unique(y_test))
148+
y_test[-drift_size:] = y_test_drift
149+
150+
# Reset detector and metric
151+
detector.reset()
152+
metric.reset()
153+
154+
# Simulate data stream (assuming test label available after each prediction)
155+
# Concept drift is expected to occur because of the label modification
156+
stream_test(
157+
X_test=X_test,
158+
y_test=y_test,
159+
y=y,
160+
metric=metric,
161+
detector=detector,
162+
)
163+
# >> Concept drift detected at step 142. Accuracy: 0.9510
164+
# >> Final accuracy: 0.8480
127165
```
128166

129-
More concept drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples.html#data-drift).
167+
More concept drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples/concept_drift.html).
130168

131169
### Data drift
132170

133-
As a quick example, we can use the iris dataset to which data drift in order to show the use of a data drift detector like Kolmogorov-Smirnov test.
171+
As a quick example, we can use the iris dataset, in which data drift is induced, to show the use of a data drift detector like the Kolmogorov-Smirnov test.
134172

135173
```python
136174
import numpy as np
@@ -154,11 +192,11 @@ X, y = load_iris(return_X_y=True)
154192
) = train_test_split(X, y, train_size=0.7, random_state=31)
155193

156194
# Set the index of the feature to which the detector is applied
157-
dim_idx = 0
195+
feature_idx = 0
158196

159197
# IMPORTANT: Induce/simulate data drift in the selected feature of X_test by
160198
# applying some Gaussian noise, thereby changing P(X).
161-
X_test[:, dim_idx] += np.random.normal(
199+
X_test[:, feature_idx] += np.random.normal(
162200
loc=0.0,
163201
scale=3.0,
164202
size=X_test.shape[0],
@@ -172,18 +210,21 @@ model.fit(X=X_train, y=y_train)
172210
alpha = 0.001
173211
# Define and fit detector
174212
detector = KSTest()
175-
detector.fit(X=X_train[:, dim_idx])
213+
_ = detector.fit(X=X_train[:, feature_idx])
176214

177215
# Apply detector to the selected feature of X_test
178-
result = detector.compare(X=X_test[:, dim_idx])
216+
result, _ = detector.compare(X=X_test[:, feature_idx])
179217

180218
# Check if drift is taking place
181-
result[0].p_value < alpha
182-
>> True # Data drift detected.
219+
if result.p_value <= alpha:
220+
print(f"Data drift detected at feature {feature_idx}")
221+
else:
222+
print(f"No data drift detected at feature {feature_idx}")
223+
# >> Data drift detected at feature 0
183224
# Therefore, we can reject H0 (both samples come from the same distribution).
184225
```
185226

186-
More data drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples.html#data-drift).
227+
More data drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples/data_drift.html).
187228

188229
## 🛠 Installation
189230

0 commit comments

Comments
 (0)