
Commit 68d1596

Merge pull request #176 from shakedzy/v0.7.12
V0.7.12
2 parents (0b2cdac + dea34e8), commit 68d1596

File tree

9 files changed: +172, -91 lines changed


.gitignore

Lines changed: 8 additions & 15 deletions
@@ -1,20 +1,13 @@
-syntax: glob
-.python-version
-.venv
-env/*
-venv/*
-ENV/*
-.idea
-.vscode
+.venv*
+.vscode*
 .DS_Store
-dython.egg*/*
+*.egg-info*
 *__pycache__*
-*run_stuff.py*
+
 build/*
 dist/*
-build_deploy.sh
 site/*
-debug.py
-.coverage
-.hypothesis
-.pytest_cache*
+
+*.coverage*
+*.hypothesis*
+*.pytest_cache*

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -1,9 +1,10 @@
 # Change Log
 
-## 0.7.12 (dev)
+## 0.7.12
 * _Dython now officially supports Python 3.14_
 * Added new tests (issue [#172](https://github.com/shakedzy/dython/issues/172))
 * `examples` module removed (all examples exist in the [official documentation](https://shakedzy.xyz/dython/getting_started/examples/))
+* Added [Youden's J](https://en.wikipedia.org/wiki/Youden%27s_J_statistic) statistic to `model_utils.metric_graph` ROC Curve option _(breaking change: function signature has changed)_.
 
 ## 0.7.11
 * Fixing dependency issue ([#170](https://github.com/shakedzy/dython/issues/170))

docs/modules/model_utils.md

Lines changed: 19 additions & 2 deletions
@@ -118,10 +118,15 @@ Plots true-positive rate as a function of the false-positive rate of the positiv
 where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a linear line going from
 (0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5.
 
+Computes the estimated optimal threshold using two methods:
+* Geometric distance: Finding the closest point to the optimum at (0,1) using Euclidean distance
+* Youden's J: Maximizing $TPR - FPR$ (corresponding to $Y - X$)
+
 **Precision-Recall:**
 Plots precision as a function of recall of the positive label in a binary classification, where
 $Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal linear
 line with precision of the ratio of positive examples in the dataset.
+Estimated optimal threshold is computed using Euclidean (geometric) distance.
 
 Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html) (as was seen on April 2018):
 
@@ -258,8 +263,20 @@ Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/mo
 consider the data as a multiclass data rather than binary (useful when plotting
 curves of different models one against the other)
 
-**Returns:** A dictionary, one key for each class. Each value is another dictionary,
-holding AUC and eOpT values.
+**Returns:**
+A dictionary with these keys:
+- `ax`: the Matplotlib plot axis
+- `metrics`: each key is a class name from the list of provided classes.
+  For each class, another dict holds the AUC results
+  and the measurement methods' results.
+  The AUC key holds both the measured area-under-curve (under `val`)
+  and the AUC of a random-guess classifier (under `naive`) for
+  comparison.
+  Each measurement method key contains three values: `x`, `y`, `val`,
+  corresponding to the (x,y) coordinates on the metric graph of the
+  threshold, and its value.
+  If only one class exists, the measurement method keys and AUC
+  will be directly under `metrics`.
 
 **Example:** See [examples](../getting_started/examples.md).
 
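To make the documented structure concrete, here is a minimal usage sketch. It assumes the `metric_graph(y_true, y_pred, metric=...)` calling convention and a `plot` flag as described in the dython docs; the method keys `geo` and `youden_j` come from this PR, while the input arrays below are purely illustrative.

```python
from dython.model_utils import metric_graph

y_true = [0, 0, 1, 1, 0, 1]
y_pred = [0.12, 0.41, 0.35, 0.81, 0.27, 0.73]  # scores for the positive class

result = metric_graph(y_true, y_pred, metric="roc", plot=False)

metrics = result["metrics"]      # single class, so AUC and method keys sit directly here
print(metrics["auc"]["val"])     # measured area under the ROC curve
print(metrics["auc"]["naive"])   # random-guess AUC (0.5 for ROC)
print(metrics["geo"])            # {'x': ..., 'y': ..., 'val': ...} - geometric-distance threshold
print(metrics["youden_j"])       # {'x': ..., 'y': ..., 'val': ...} - Youden's J threshold
ax = result["ax"]                # the Matplotlib axis the curves were drawn on
```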
dython/_private.py

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
-from typing import Any, Literal, cast, overload, Type
+from typing import Any, cast, overload, Type
 from .typing import OneDimArray, TwoDimArray
 
 
@@ -92,16 +92,16 @@ def convert(
             )
         )
     else:
-        return converted
+        return converted  # pyright: ignore[reportReturnType]
 
 
 def remove_incomplete_samples(
     x: OneDimArray,
     y: OneDimArray,
 ) -> tuple[OneDimArray, OneDimArray]:
 
-    x = [v if v is not None else np.nan for v in x]
-    y = [v if v is not None else np.nan for v in y]
+    x = [v if v is not None else np.nan for v in x]  # pyright: ignore[reportAssignmentType]
+    y = [v if v is not None else np.nan for v in y]  # pyright: ignore[reportAssignmentType]
     arr = np.array([x, y]).transpose()
     arr = arr[~np.isnan(arr).any(axis=1)].transpose()
     if isinstance(x, list):
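For readers unfamiliar with the helper this hunk annotates, here is a self-contained sketch of the pairwise NaN-dropping idea (numeric-only and simplified; dython's actual `remove_incomplete_samples` also preserves the input container type):

```python
import numpy as np

def drop_incomplete_pairs(x, y):
    # Treat None as NaN, then keep only positions where both x and y are present.
    x = [v if v is not None else np.nan for v in x]
    y = [v if v is not None else np.nan for v in y]
    arr = np.array([x, y], dtype=float).transpose()
    arr = arr[~np.isnan(arr).any(axis=1)].transpose()
    return arr[0], arr[1]

print(drop_incomplete_pairs([1, None, 3], [4.0, 5.0, np.nan]))
# (array([1.]), array([4.]))
```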

dython/model_utils.py

Lines changed: 98 additions & 45 deletions
@@ -5,7 +5,7 @@
 from sklearn.metrics import roc_curve, precision_recall_curve, auc
 from sklearn.preprocessing import LabelEncoder
 from typing import Any, Iterable
-from .typing import Number, OneDimArray
+from .typing import Number, OneDimArray, MetricGraphResult, SingleCurveResult, SingleMethodResult
 from ._private import convert, plot_or_not
 
 __all__ = ["random_forest_feature_importance", "metric_graph", "ks_abc"]
@@ -53,29 +53,38 @@ def _draw_estimated_optimal_threshold_mark(
     ms: int,
     fmt: str,
     ax: Axes,
-) -> tuple[Number, Number, Number]:
+) -> list[tuple[Number, Number, Number]]:
     annotation_offset = (-0.027, 0.03)
     a = np.zeros((len(x_axis), 2))
     a[:, 0] = x_axis
     a[:, 1] = y_axis
+    a = a[a[:, 0] != a[:, 1]]
     if metric == "roc":
-        dist = lambda row: row[0] ** 2 + (1 - row[1]) ** 2  # optimal: (0,1)
+        dists = [  # optimal: (0,1)
+            lambda row: row[0] ** 2 + (1 - row[1]) ** 2,  # geo
+            lambda row: row[0] - row[1]  # inverse Youden's J (X-Y instead of Y-X), since later on we take the min value while Youden's J needs to be maximized
+        ]
     else:  # metric == 'pr'
-        dist = (
-            lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2
-        )  # optimal: (1,1)
-    amin = np.apply_along_axis(dist, 1, a).argmin()
-    ax.plot(x_axis[amin], y_axis[amin], color=color, marker="o", ms=ms)  # pyright: ignore[reportCallIssue, reportArgumentType]
-    ax.annotate(
-        "{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),  # pyright: ignore[reportCallIssue, reportArgumentType]
-        xy=(x_axis[amin], y_axis[amin]),  # pyright: ignore[reportCallIssue, reportArgumentType]
-        color=color,
-        xytext=(
-            x_axis[amin] + annotation_offset[0],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
-            y_axis[amin] + annotation_offset[1],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
-        ),
-    )
-    return thresholds[amin], x_axis[amin], y_axis[amin]  # pyright: ignore[reportCallIssue, reportArgumentType, reportReturnType]
+        dists = [  # optimal: (1,1)
+            lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2  # geo
+        ]
+    output_tuples = []
+    for dist, marker in zip(dists, ['o', 'x']):
+        amin = np.apply_along_axis(dist, 1, a).argmin()
+        ax.plot(x_axis[amin], y_axis[amin], color=color, marker=marker, ms=ms)  # pyright: ignore[reportCallIssue, reportArgumentType]
+        ax.annotate(
+            "{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),  # pyright: ignore[reportCallIssue, reportArgumentType]
+            xy=(x_axis[amin], y_axis[amin]),  # pyright: ignore[reportCallIssue, reportArgumentType]
+            color=color,
+            xytext=(
+                x_axis[amin] + annotation_offset[0],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
+                y_axis[amin] + annotation_offset[1],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
+            ),
+        )
+        output_tuples.append(
+            (thresholds[amin], x_axis[amin], y_axis[amin])  # pyright: ignore[reportArgumentType, reportCallIssue]
+        )
+    return output_tuples
 
 
 def _plot_macro_metric(
@@ -141,39 +150,58 @@ def _binary_metric_graph(
         metric=metric.upper(), class_label=class_label, auc=auc_score, fmt=fmt
     )
     if metric == "pr":
-        label += ", naive = {ytr:{fmt}}".format(ytr=y_t_ratio, fmt=fmt)
+        label += ", naive = {ytr:{fmt}})".format(ytr=y_t_ratio, fmt=fmt)
     if eoptimal:
-        eopt, eopt_x, eopt_y = _draw_estimated_optimal_threshold_mark(
+        eopts = _draw_estimated_optimal_threshold_mark(
             metric, x_axis, y_axis, th, color, ms, fmt, ax
         )
-        label += ", eOpT = {th:{fmt}})".format(th=eopt, fmt=fmt)
+        if len(eopts) == 1:
+            eopts.append((None, None, None))  # pyright: ignore[reportArgumentType]
     else:
-        eopt = None
-        eopt_x = None
-        eopt_y = None
-        label += ")"
+        eopts = [
+            (None, None, None),
+            (None, None, None)
+        ]
     ax.plot(x_axis, y_axis, color=color, lw=lw, ls=ls, label=label)
     return {
         "x": x_axis,
         "y": y_axis,
         "thresholds": th,
         "auc": auc_score,
-        "eopt": eopt,
-        "eopt_x": eopt_x,
-        "eopt_y": eopt_y,
+        "eopts": [
+            {
+                "eopt": eopts[0][0],
+                "eopt_x": eopts[0][1],
+                "eopt_y": eopts[0][2],
+                "name": "geo"
+            },
+            {
+                "eopt": eopts[1][0],
+                "eopt_x": eopts[1][1],
+                "eopt_y": eopts[1][2],
+                "name": "youden_j"
+            },
+        ],
         "y_t_ratio": y_t_ratio,
     }
 
 
 def _build_metric_graph_output_dict(
     metric: str,
     d: dict[str, Any]
-) -> dict[str, dict[str, Any]]:
+) -> SingleCurveResult:
     naive = d["y_t_ratio"] if metric == "pr" else 0.5
-    return {
-        "auc": {"val": d["auc"], "naive": naive},
-        "eopt": {"val": d["eopt"], "x": d["eopt_x"], "y": d["eopt_y"]},
-    }
+    output: dict = {'auc': {"val": d["auc"], "naive": naive}}
+    for eopt in d['eopts']:
+        if eopt['eopt'] is None:
+            continue
+        method_result = SingleMethodResult(
+            x=eopt['eopt_x'],
+            y=eopt['eopt_y'],
+            val=eopt['eopt']
+        )
+        output[eopt['name']] = method_result
+    return output  # pyright: ignore[reportReturnType]
 
 
 def metric_graph(
@@ -199,15 +227,25 @@ def metric_graph(
     title: str | None = None,
     filename: str | None = None,
     force_multiclass: bool = False,
-) -> dict[str, Any]:
+) -> MetricGraphResult:
     """
-    Plot a ROC graph of predictor's results (including AUC scores), where each
+    Plot a metric graph of predictor's results (including AUC scores), where each
     row of y_true and y_pred represent a single example.
-    If there are 1 or two columns only, the data is treated as a binary
-    classification (see input example below).
-    If there are more then 2 columns, each column is considered a
-    unique class, and a ROC graph and AUC score will be computed for each.
-    A Macro-ROC and Micro-ROC are computed and plotted too by default.
+
+    **ROC:**
+    Plots true-positive rate as a function of the false-positive rate of the positive label in a binary classification,
+    where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a linear line going from
+    (0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5.
+
+    Computes the estimated optimal threshold using two methods:
+    * Geometric distance: Finding the closest point to the optimum at (0,1) using Euclidean distance
+    * Youden's J: Maximizing $TPR - FPR$ (corresponding to $Y - X$)
+
+    **Precision-Recall:**
+    Plots precision as a function of recall of the positive label in a binary classification, where
+    $Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal linear
+    line with precision of the ratio of positive examples in the dataset.
+    Estimated optimal threshold is computed using Euclidean (geometric) distance.
 
     Based on sklearn examples (as was seen on April 2018):
     http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
@@ -270,8 +308,20 @@
 
     Returns:
     --------
-    A dictionary, one key for each class. Each value is another dictionary,
-    holding AUC and eOpT values.
+    A dictionary with these keys:
+    - `ax`: the Matplotlib plot axis
+    - `metrics`: each key is a class name from the list of provided classes.
+      For each class, another dict holds the AUC results
+      and the measurement methods' results.
+      The AUC key holds both the measured area-under-curve (under `val`)
+      and the AUC of a random-guess classifier (under `naive`) for
+      comparison.
+      Each measurement method key contains three values: `x`, `y`, `val`,
+      corresponding to the (x,y) coordinates on the metric graph of the
+      threshold, and its value.
+      If only one class exists, the measurement method keys and AUC
+      will be directly under `metrics`.
+
 
     Binary Classification Input Example:
     ------------------------------------
@@ -325,7 +375,7 @@ def metric_graph(
     else:
        colors_list: list[str] = colors or _ROC_PLOT_COLORS
 
-    output_dict = dict()
+    output_dict: dict[str, SingleCurveResult] = {}
     pr_naives = list()
     if (
         len(y_pred_array.shape) == 1
@@ -422,8 +472,11 @@
         filename=filename,
         plot=plot,
     )
-    output_dict["ax"] = axis
-    return output_dict
+    metric_graph_result = MetricGraphResult(
+        ax=axis,
+        metrics=output_dict if len(output_dict) > 1 else output_dict[list(output_dict.keys())[0]]
+    )
+    return metric_graph_result
 
 
 def random_forest_feature_importance(
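To see what the two selection rules in `_draw_estimated_optimal_threshold_mark` compute, here is a standalone sketch using scikit-learn's `roc_curve`; it mirrors the geometric-distance and Youden's J criteria from the diff but is an illustration, not the dython implementation:

```python
import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([0, 0, 1, 1, 0, 1, 0, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7, 0.55, 0.9])

fpr, tpr, thresholds = roc_curve(y_true, y_score)

# Geometric distance: the point closest to the optimum at (FPR, TPR) = (0, 1).
geo_idx = np.argmin(fpr ** 2 + (1 - tpr) ** 2)

# Youden's J: maximize TPR - FPR (the diff minimizes FPR - TPR, which is equivalent).
youden_idx = np.argmax(tpr - fpr)

print("geo:    threshold", thresholds[geo_idx], "at", (fpr[geo_idx], tpr[geo_idx]))
print("youden: threshold", thresholds[youden_idx], "at", (fpr[youden_idx], tpr[youden_idx]))
```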

dython/nominal.py

Lines changed: 2 additions & 7 deletions
@@ -13,10 +13,10 @@
 from collections import Counter
 from matplotlib.colors import Colormap
 from matplotlib.axes._axes import Axes
-from typing import Any, Callable, Iterable, Literal, TypedDict, cast, overload
+from typing import Any, Callable, Iterable, Literal, cast, overload
 from ._private import convert, remove_incomplete_samples, replace_nan_with_value, plot_or_not
 from .data_utils import identify_columns_by_type
-from .typing import Number, OneDimArray, TwoDimArray
+from .typing import Number, OneDimArray, TwoDimArray, AssociationsResult
 
 
 __all__ = [
@@ -53,11 +53,6 @@
 NomNomAssocStr = Literal["cramer", "theil"]
 
 
-class AssociationsResult(TypedDict):
-    corr: pd.DataFrame
-    ax: Axes | None
-
-
 def _inf_nan_str(x: Number) -> str:
     if np.isnan(x):
         return "NaN"
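The relocated `AssociationsResult` is the shape returned by `nominal.associations`; here is a brief, hedged sketch of how its two keys are typically consumed (the DataFrame, column names, and the `plot` flag follow the dython docs, but the data is made up):

```python
import pandas as pd
from dython.nominal import associations

df = pd.DataFrame({
    "color": ["red", "blue", "red", "green"],
    "size":  [1, 2, 2, 3],
})

res = associations(df, plot=False)  # an AssociationsResult-shaped dict
corr = res["corr"]                  # pandas DataFrame of pairwise association strengths
ax = res["ax"]                      # Matplotlib Axes (may be None, per the Axes | None annotation)
print(corr)
```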

dython/typing.py

Lines changed: 23 additions & 1 deletion
@@ -1,8 +1,30 @@
 import numpy as np
 import pandas as pd
-from typing import Sequence, Any
+from typing import Sequence, Any, TypedDict, Protocol
+from matplotlib.axes._axes import Axes
 
 
 Number = int | float
 OneDimArray = Sequence[Number | str] | pd.Series | np.ndarray[Any, np.dtype[np.int64] | np.dtype[np.float64] | np.dtype[np.str_]]
 TwoDimArray = np.ndarray[Any, np.dtype[np.int64] | np.dtype[np.float64] | np.dtype[np.str_]] | pd.DataFrame
+
+
+class AssociationsResult(TypedDict):
+    corr: pd.DataFrame
+    ax: Axes | None
+
+
+class SingleMethodResult(TypedDict):
+    x: float
+    y: float
+    val: float
+
+
+class SingleCurveResult(Protocol):
+    auc: dict[str, float]
+    def __getitem__(self, key: str) -> SingleMethodResult: ...
+
+
+class MetricGraphResult(TypedDict):
+    metrics: dict[str, SingleCurveResult] | SingleCurveResult
+    ax: Axes
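A short sketch of how these types are meant to be consumed together; the helper below is hypothetical (not part of dython) and only illustrates navigating a `MetricGraphResult`:

```python
from dython.typing import MetricGraphResult

def youden_threshold(result: MetricGraphResult, class_name: str | None = None) -> float | None:
    """Hypothetical helper: pull the Youden's J threshold value out of a metric_graph result."""
    metrics = result["metrics"]
    # Multiclass results nest one curve per class under `metrics`;
    # a single-class result stores the curve's keys directly under `metrics`.
    curve = metrics if class_name is None else metrics[class_name]  # type: ignore[index]
    try:
        return curve["youden_j"]["val"]
    except KeyError:  # e.g. a precision-recall curve, which only has the "geo" method
        return None
```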

setup.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 EXTRAS_REQUIRE = {"dev": [s.strip() for s in dev_requirements.split("\n")]}
 
 min_minor = 10
-max_minor = 13
+max_minor = 14
 CLASSIFIERS = [
     f"Programming Language :: Python :: 3.{str(v)}" for v in range(min_minor, max_minor+1)
 ]
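For reference, the comprehension above expands to one Trove classifier per supported minor version, so this one-line bump extends the metadata through Python 3.14 (a quick local check, not output taken from the repo):

```python
min_minor, max_minor = 10, 14
print([f"Programming Language :: Python :: 3.{v}" for v in range(min_minor, max_minor + 1)])
# ['Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11',
#  'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13',
#  'Programming Language :: Python :: 3.14']
```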
