Merge pull request #263 from IFCA/fix-readme-examples

Jaime Céspedes Sisniega · web-flow · commit 26d57828ab74 · 2023-08-04T16:29:43.000+02:00
Fix README.md examples
diff --git a/README.md b/README.md
@@ -65,22 +65,23 @@ Frouros is a Python library for drift detection in machine learning systems that
 
 ### Concept drift
 
-As a quick example, we can use the wine dataset to which concept drift it is induced in order to show the use of a concept drift detector like DDM (Drift Detection Method).
+As a quick example, we can use the breast cancer dataset, in which concept drift is induced, to show the use of a concept drift detector like DDM (Drift Detection Method). We can also see how concept drift affects the performance in terms of accuracy.
 
 ```python
 import numpy as np
-from sklearn.datasets import load_wine
+from sklearn.datasets import load_breast_cancer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
 
 from frouros.detectors.concept_drift import DDM, DDMConfig
+from frouros.metrics import PrequentialError
 
 np.random.seed(seed=31)
 
-# Load wine dataset
-X, y = load_wine(return_X_y=True)
+# Load breast cancer dataset
+X, y = load_breast_cancer(return_X_y=True)
 
 # Split train (70%) and test (30%)
 (
@@ -90,14 +91,6 @@ X, y = load_wine(return_X_y=True)
     y_test,
 ) = train_test_split(X, y, train_size=0.7, random_state=31)
 
-# IMPORTANT: Induce/simulate concept drift in the last part (20%)
-# of y_test by modifying some labels (50% approx). Therefore, changing P(y|X))
-drift_size = int(y_test.shape[0] * 0.2)
-y_test_drift = y_test[-drift_size:]
-modify_idx = np.random.rand(*y_test_drift.shape) <= 0.5
-y_test_drift[modify_idx] = (y_test_drift[modify_idx] + 1) % len(np.unique(y_test))
-y_test[-drift_size:] = y_test_drift
-
 # Define and fit model
 pipeline = Pipeline(
     [
@@ -108,29 +101,74 @@ pipeline = Pipeline(
 pipeline.fit(X=X_train, y=y_train)
 
 # Detector configuration and instantiation
-config = DDMConfig(warning_level=2.0,
-                   drift_level=3.0,
-                   min_num_instances=30,)
+config = DDMConfig(
+    warning_level=2.0,
+    drift_level=3.0,
+    min_num_instances=25,  # minimum number of instances before checking for concept drift
+)
 detector = DDM(config=config)
 
-# Simulate data stream (assuming test label available after prediction)
-for i, (X, y) in enumerate(zip(X_test, y_test)):
-    y_pred = pipeline.predict(X.reshape(1, -1))
-    error = 1 - int(y_pred == y)
-    detector.update(value=error)
-    status = detector.status
-    if status["drift"]:
-        print(f"Drift detected at index {i}")
-        break
-
->> Drift detected at index 44
+# Metric to compute accuracy
+metric = PrequentialError(alpha=1.0)  # alpha=1.0 is equivalent to normal accuracy
+
+def stream_test(X_test, y_test, y, metric, detector):
+    """Simulate data stream over X_test and y_test. y is the true label."""
+    drift_flag = False
+    for i, (X, y) in enumerate(zip(X_test, y_test)):
+        y_pred = pipeline.predict(X.reshape(1, -1))
+        error = 1 - (y_pred.item() == y.item())
+        metric_error = metric(error_value=error)
+        _ = detector.update(value=error)
+        status = detector.status
+        if status["drift"] and not drift_flag:
+            drift_flag = True
+            print(f"Concept drift detected at step {i}. Accuracy: {1 - metric_error:.4f}")
+    if not drift_flag:
+        print("No concept drift detected")
+    print(f"Final accuracy: {1 - metric_error:.4f}\n")
+
+# Simulate data stream (assuming test label available after each prediction)
+# No concept drift is expected to occur
+stream_test(
+    X_test=X_test,
+    y_test=y_test,
+    y=y,
+    metric=metric,
+    detector=detector,
+)
+# >> No concept drift detected
+# >> Final accuracy: 0.9766
+
+# IMPORTANT: Induce/simulate concept drift in the last part (20%)
+# of y_test by modifying some labels (50% approx). Therefore, changing P(y|X)
+drift_size = int(y_test.shape[0] * 0.2)
+y_test_drift = y_test[-drift_size:]
+modify_idx = np.random.rand(*y_test_drift.shape) <= 0.5
+y_test_drift[modify_idx] = (y_test_drift[modify_idx] + 1) % len(np.unique(y_test))
+y_test[-drift_size:] = y_test_drift
+
+# Reset detector and metric
+detector.reset()
+metric.reset()
+
+# Simulate data stream (assuming test label available after each prediction)
+# Concept drift is expected to occur because of the label modification
+stream_test(
+    X_test=X_test,
+    y_test=y_test,
+    y=y,
+    metric=metric,
+    detector=detector,
+)
+# >> Concept drift detected at step 142. Accuracy: 0.9510
+# >> Final accuracy: 0.8480
 ```
 
-More concept drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples.html#data-drift).
+More concept drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples/concept_drift.html).
 
 ### Data drift
 
-As a quick example, we can use the iris dataset to which data drift in order to show the use of a data drift detector like Kolmogorov-Smirnov test.
+As a quick example, we can use the iris dataset, in which data drift is induced, to show the use of a data drift detector like the Kolmogorov-Smirnov test.
 
 ```python
 import numpy as np
@@ -154,11 +192,11 @@ X, y = load_iris(return_X_y=True)
 ) = train_test_split(X, y, train_size=0.7, random_state=31)
 
 # Set the feature index to which detector is applied
-dim_idx = 0
+feature_idx = 0
 
 # IMPORTANT: Induce/simulate data drift in the selected feature of X_test by
 # applying some Gaussian noise. Therefore, changing P(X)
-X_test[:, dim_idx] += np.random.normal(
+X_test[:, feature_idx] += np.random.normal(
     loc=0.0,
     scale=3.0,
     size=X_test.shape[0],
@@ -172,18 +210,21 @@ model.fit(X=X_train, y=y_train)
 alpha = 0.001
 # Define and fit detector
 detector = KSTest()
-detector.fit(X=X_train[:, dim_idx])
+_ = detector.fit(X=X_train[:, feature_idx])
 
 # Apply detector to the selected feature of X_test
-result = detector.compare(X=X_test[:, dim_idx])
+result, _ = detector.compare(X=X_test[:, feature_idx])
 
 # Check if drift is taking place
-result[0].p_value < alpha
->> True # Data drift detected.
+if result.p_value <= alpha:
+    print(f"Data drift detected at feature {feature_idx}")
+else:
+    print(f"No data drift detected at feature {feature_idx}")
+# >> Data drift detected at feature 0
 # Therefore, we can reject H0 (both samples come from the same distribution).
 ```
 
-More data drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples.html#data-drift).
+More data drift examples can be found [here](https://frouros.readthedocs.io/en/latest/examples/data_drift.html).
 
 ## 🛠 Installation