
Commit 68d1596

Merge pull request #176 from shakedzy/v0.7.12
V0.7.12
2 parents (0b2cdac + dea34e8), commit 68d1596

File tree

9 files changed: +172, -91 lines changed


.gitignore

Lines changed: 8 additions & 15 deletions
@@ -1,20 +1,13 @@
-syntax: glob
-.python-version
-.venv
-env/*
-venv/*
-ENV/*
-.idea
-.vscode
+.venv*
+.vscode*
 .DS_Store
-dython.egg*/*
+*.egg-info*
 *__pycache__*
-*run_stuff.py*
+
 build/*
 dist/*
-build_deploy.sh
 site/*
-debug.py
-.coverage
-.hypothesis
-.pytest_cache*
+
+*.coverage*
+*.hypothesis*
+*.pytest_cache*

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -1,9 +1,10 @@
 # Change Log
 
-## 0.7.12 (dev)
+## 0.7.12
 * _Dython now officially supports Python 3.14_
 * Added new tests (issue [#172](https://github.com/shakedzy/dython/issues/172))
 * `examples` module removed (all examples exist in the [official documentation](https://shakedzy.xyz/dython/getting_started/examples/))
+* Added [Youden's J](https://en.wikipedia.org/wiki/Youden%27s_J_statistic) statistic to `model_utils.metric_graph` ROC Curve option _(breaking change: function signature has changed)_.
 
 ## 0.7.11
 * Fixing dependency issue ([#170](https://github.com/shakedzy/dython/issues/170))

docs/modules/model_utils.md

Lines changed: 19 additions & 2 deletions
@@ -118,10 +118,15 @@ Plots true-positive rate as a function of the false-positive rate of the positiv
 where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a linear line going from
 (0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5.
 
+Computes the estimated optimal threshold using two methods:
+* Geometric distance: Finding the closest point to the optimum at (0,1) using Euclidean distance
+* Youden's J: Maximizing $TPR - FPR$ (corresponding to $Y - X$)
+
 **Precision-Recall:**
 Plots precision as a function of recall of the positive label in a binary classification, where
 $Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal linear
 line with precision of the ratio of positive examples in the dataset.
+Estimated optimal threshold is computed using Euclidean (geometric) distance.
 
 Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html) (as was seen on April 2018):
 
@@ -258,8 +263,20 @@ Based on [scikit-learn examples](http://scikit-learn.org/stable/auto_examples/mo
 consider the data as a multiclass data rather than binary (useful when plotting
 curves of different models one against the other)
 
-**Returns:** A dictionary, one key for each class. Each value is another dictionary,
-holding AUC and eOpT values.
+**Returns:**
+A dictionary with these keys:
+- `ax`: the Matplotlib plot axis
+- `metrics`: each key is a class name from the list of provided classes.
+  For each class, another dict holds the AUC results
+  and the measurement methods' results.
+  The AUC key holds both the measured area-under-curve (under `val`)
+  and the AUC of a random-guess classifier (under `naive`) for
+  comparison.
+  Each measurement method key contains three values: `x`, `y`, `val`,
+  corresponding to the (x,y) coordinates on the metric graph of the
+  threshold, and its value.
+  If only one class exists, the measurement method keys and AUC
+  will be directly under `metrics`.
 
 **Example:** See [examples](../getting_started/examples.md).
 
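To make the documented structure concrete, here is a minimal usage sketch. It assumes the `metric_graph(y_true, y_pred, metric=...)` calling convention and a `plot` flag as described in the dython docs; the method keys `geo` and `youden_j` come from this PR, while the input arrays below are purely illustrative.

```python
from dython.model_utils import metric_graph

y_true = [0, 0, 1, 1, 0, 1]
y_pred = [0.12, 0.41, 0.35, 0.81, 0.27, 0.73]  # scores for the positive class

result = metric_graph(y_true, y_pred, metric="roc", plot=False)

metrics = result["metrics"]      # single class, so AUC and method keys sit directly here
print(metrics["auc"]["val"])     # measured area under the ROC curve
print(metrics["auc"]["naive"])   # random-guess AUC (0.5 for ROC)
print(metrics["geo"])            # {'x': ..., 'y': ..., 'val': ...} - geometric-distance threshold
print(metrics["youden_j"])       # {'x': ..., 'y': ..., 'val': ...} - Youden's J threshold
ax = result["ax"]                # the Matplotlib axis the curves were drawn on
```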
dython/_private.py

Lines changed: 4 additions & 4 deletions
@@ -2,7 +2,7 @@
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
-from typing import Any, Literal, cast, overload, Type
+from typing import Any, cast, overload, Type
 from .typing import OneDimArray, TwoDimArray
 
 
@@ -92,16 +92,16 @@ def convert(
             )
         )
     else:
-        return converted
+        return converted  # pyright: ignore[reportReturnType]
 
 
 def remove_incomplete_samples(
     x: OneDimArray,
     y: OneDimArray,
 ) -> tuple[OneDimArray, OneDimArray]:
 
-    x = [v if v is not None else np.nan for v in x]
-    y = [v if v is not None else np.nan for v in y]
+    x = [v if v is not None else np.nan for v in x]  # pyright: ignore[reportAssignmentType]
+    y = [v if v is not None else np.nan for v in y]  # pyright: ignore[reportAssignmentType]
     arr = np.array([x, y]).transpose()
     arr = arr[~np.isnan(arr).any(axis=1)].transpose()
     if isinstance(x, list):
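For readers unfamiliar with the helper this hunk annotates, here is a self-contained sketch of the pairwise NaN-dropping idea (numeric-only and simplified; dython's actual `remove_incomplete_samples` also preserves the input container type):

```python
import numpy as np

def drop_incomplete_pairs(x, y):
    # Treat None as NaN, then keep only positions where both x and y are present.
    x = [v if v is not None else np.nan for v in x]
    y = [v if v is not None else np.nan for v in y]
    arr = np.array([x, y], dtype=float).transpose()
    arr = arr[~np.isnan(arr).any(axis=1)].transpose()
    return arr[0], arr[1]

print(drop_incomplete_pairs([1, None, 3], [4.0, 5.0, np.nan]))
# (array([1.]), array([4.]))
```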

dython/model_utils.py

Lines changed: 98 additions & 45 deletions
@@ -5,7 +5,7 @@
 from sklearn.metrics import roc_curve, precision_recall_curve, auc
 from sklearn.preprocessing import LabelEncoder
 from typing import Any, Iterable
-from .typing import Number, OneDimArray
+from .typing import Number, OneDimArray, MetricGraphResult, SingleCurveResult, SingleMethodResult
 from ._private import convert, plot_or_not
 
 __all__ = ["random_forest_feature_importance", "metric_graph", "ks_abc"]
@@ -53,29 +53,38 @@ def _draw_estimated_optimal_threshold_mark(
     ms: int,
     fmt: str,
     ax: Axes,
-) -> tuple[Number, Number, Number]:
+) -> list[tuple[Number, Number, Number]]:
     annotation_offset = (-0.027, 0.03)
     a = np.zeros((len(x_axis), 2))
     a[:, 0] = x_axis
     a[:, 1] = y_axis
+    a = a[a[:, 0] != a[:, 1]]
     if metric == "roc":
-        dist = lambda row: row[0] ** 2 + (1 - row[1]) ** 2  # optimal: (0,1)
+        dists = [  # optimal: (0,1)
+            lambda row: row[0] ** 2 + (1 - row[1]) ** 2,  # geo
+            lambda row: row[0] - row[1]  # inverse Youden's J (X-Y instead of Y-X), since later on we take the min value while Youden's J needs to be maximized
+        ]
     else:  # metric == 'pr'
-        dist = (
-            lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2
-        )  # optimal: (1,1)
-    amin = np.apply_along_axis(dist, 1, a).argmin()
-    ax.plot(x_axis[amin], y_axis[amin], color=color, marker="o", ms=ms)  # pyright: ignore[reportCallIssue, reportArgumentType]
-    ax.annotate(
-        "{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),  # pyright: ignore[reportCallIssue, reportArgumentType]
-        xy=(x_axis[amin], y_axis[amin]),  # pyright: ignore[reportCallIssue, reportArgumentType]
-        color=color,
-        xytext=(
-            x_axis[amin] + annotation_offset[0],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
-            y_axis[amin] + annotation_offset[1],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
-        ),
-    )
-    return thresholds[amin], x_axis[amin], y_axis[amin]  # pyright: ignore[reportCallIssue, reportArgumentType, reportReturnType]
+        dists = [  # optimal: (1,1)
+            lambda row: (1 - row[0]) ** 2 + (1 - row[1]) ** 2  # geo
+        ]
+    output_tuples = []
+    for dist, marker in zip(dists, ['o', 'x']):
+        amin = np.apply_along_axis(dist, 1, a).argmin()
+        ax.plot(x_axis[amin], y_axis[amin], color=color, marker=marker, ms=ms)  # pyright: ignore[reportCallIssue, reportArgumentType]
+        ax.annotate(
+            "{th:{fmt}}".format(th=thresholds[amin], fmt=fmt),  # pyright: ignore[reportCallIssue, reportArgumentType]
+            xy=(x_axis[amin], y_axis[amin]),  # pyright: ignore[reportCallIssue, reportArgumentType]
+            color=color,
+            xytext=(
+                x_axis[amin] + annotation_offset[0],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
+                y_axis[amin] + annotation_offset[1],  # pyright: ignore[reportCallIssue, reportArgumentType, reportOperatorIssue]
+            ),
+        )
+        output_tuples.append(
+            (thresholds[amin], x_axis[amin], y_axis[amin])  # pyright: ignore[reportArgumentType, reportCallIssue]
+        )
+    return output_tuples
 
 
 def _plot_macro_metric(
@@ -141,39 +150,58 @@ def _binary_metric_graph(
         metric=metric.upper(), class_label=class_label, auc=auc_score, fmt=fmt
     )
     if metric == "pr":
-        label += ", naive = {ytr:{fmt}}".format(ytr=y_t_ratio, fmt=fmt)
+        label += ", naive = {ytr:{fmt}})".format(ytr=y_t_ratio, fmt=fmt)
     if eoptimal:
-        eopt, eopt_x, eopt_y = _draw_estimated_optimal_threshold_mark(
+        eopts = _draw_estimated_optimal_threshold_mark(
             metric, x_axis, y_axis, th, color, ms, fmt, ax
         )
-        label += ", eOpT = {th:{fmt}})".format(th=eopt, fmt=fmt)
+        if len(eopts) == 1:
+            eopts.append((None, None, None))  # pyright: ignore[reportArgumentType]
     else:
-        eopt = None
-        eopt_x = None
-        eopt_y = None
-        label += ")"
+        eopts = [
+            (None, None, None),
+            (None, None, None)
+        ]
     ax.plot(x_axis, y_axis, color=color, lw=lw, ls=ls, label=label)
     return {
         "x": x_axis,
         "y": y_axis,
         "thresholds": th,
         "auc": auc_score,
-        "eopt": eopt,
-        "eopt_x": eopt_x,
-        "eopt_y": eopt_y,
+        "eopts": [
+            {
+                "eopt": eopts[0][0],
+                "eopt_x": eopts[0][1],
+                "eopt_y": eopts[0][2],
+                "name": "geo"
+            },
+            {
+                "eopt": eopts[1][0],
+                "eopt_x": eopts[1][1],
+                "eopt_y": eopts[1][2],
+                "name": "youden_j"
+            },
+        ],
         "y_t_ratio": y_t_ratio,
     }
 
 
 def _build_metric_graph_output_dict(
     metric: str,
     d: dict[str, Any]
-) -> dict[str, dict[str, Any]]:
+) -> SingleCurveResult:
     naive = d["y_t_ratio"] if metric == "pr" else 0.5
-    return {
-        "auc": {"val": d["auc"], "naive": naive},
-        "eopt": {"val": d["eopt"], "x": d["eopt_x"], "y": d["eopt_y"]},
-    }
+    output: dict = {'auc': {"val": d["auc"], "naive": naive}}
+    for eopt in d['eopts']:
+        if eopt['eopt'] is None:
+            continue
+        method_result = SingleMethodResult(
+            x=eopt['eopt_x'],
+            y=eopt['eopt_y'],
+            val=eopt['eopt']
+        )
+        output[eopt['name']] = method_result
+    return output  # pyright: ignore[reportReturnType]
 
 
 def metric_graph(
@@ -199,15 +227,25 @@ def metric_graph(
     title: str | None = None,
     filename: str | None = None,
     force_multiclass: bool = False,
-) -> dict[str, Any]:
+) -> MetricGraphResult:
     """
-    Plot a ROC graph of predictor's results (including AUC scores), where each
+    Plot a metric graph of predictor's results (including AUC scores), where each
     row of y_true and y_pred represent a single example.
-    If there are 1 or two columns only, the data is treated as a binary
-    classification (see input example below).
-    If there are more then 2 columns, each column is considered a
-    unique class, and a ROC graph and AUC score will be computed for each.
-    A Macro-ROC and Micro-ROC are computed and plotted too by default.
+
+    **ROC:**
+    Plots true-positive rate as a function of the false-positive rate of the positive label in a binary classification,
+    where $TPR = TP / (TP + FN)$ and $FPR = FP / (FP + TN)$. A naive algorithm will display a linear line going from
+    (0,0) to (1,1), therefore having an area under-curve (AUC) of 0.5.
+
+    Computes the estimated optimal threshold using two methods:
+    * Geometric distance: Finding the closest point to the optimum at (0,1) using Euclidean distance
+    * Youden's J: Maximizing $TPR - FPR$ (corresponding to $Y - X$)
+
+    **Precision-Recall:**
+    Plots precision as a function of recall of the positive label in a binary classification, where
+    $Precision = TP / (TP + FP)$ and $Recall = TP / (TP + FN)$. A naive algorithm will display a horizontal linear
+    line with precision of the ratio of positive examples in the dataset.
+    Estimated optimal threshold is computed using Euclidean (geometric) distance.
 
     Based on sklearn examples (as was seen on April 2018):
     http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
@@ -270,8 +308,20 @@
 
     Returns:
     --------
-    A dictionary, one key for each class. Each value is another dictionary,
-    holding AUC and eOpT values.
+    A dictionary with these keys:
+    - `ax`: the Matplotlib plot axis
+    - `metrics`: each key is a class name from the list of provided classes.
+      For each class, another dict holds the AUC results
+      and the measurement methods' results.
+      The AUC key holds both the measured area-under-curve (under `val`)
+      and the AUC of a random-guess classifier (under `naive`) for
+      comparison.
+      Each measurement method key contains three values: `x`, `y`, `val`,
+      corresponding to the (x,y) coordinates on the metric graph of the
+      threshold, and its value.
+      If only one class exists, the measurement method keys and AUC
+      will be directly under `metrics`.
+
 
     Binary Classification Input Example:
     ------------------------------------
@@ -325,7 +375,7 @@ def metric_graph(
     else:
        colors_list: list[str] = colors or _ROC_PLOT_COLORS
 
-    output_dict = dict()
+    output_dict: dict[str, SingleCurveResult] = {}
     pr_naives = list()
     if (
         len(y_pred_array.shape) == 1
@@ -422,8 +472,11 @@
         filename=filename,
         plot=plot,
     )
-    output_dict["ax"] = axis
-    return output_dict
+    metric_graph_result = MetricGraphResult(
+        ax=axis,
+        metrics=output_dict if len(output_dict) > 1 else output_dict[list(output_dict.keys())[0]]
+    )
+    return metric_graph_result
 
 
 def random_forest_feature_importance(
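To see what the two selection rules in `_draw_estimated_optimal_threshold_mark` compute, here is a standalone sketch using scikit-learn's `roc_curve`; it mirrors the geometric-distance and Youden's J criteria from the diff but is an illustration, not the dython implementation:

```python
import numpy as np
from sklearn.metrics import roc_curve

y_true = np.array([0, 0, 1, 1, 0, 1, 0, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7, 0.55, 0.9])

fpr, tpr, thresholds = roc_curve(y_true, y_score)

# Geometric distance: the point closest to the optimum at (FPR, TPR) = (0, 1).
geo_idx = np.argmin(fpr ** 2 + (1 - tpr) ** 2)

# Youden's J: maximize TPR - FPR (the diff minimizes FPR - TPR, which is equivalent).
youden_idx = np.argmax(tpr - fpr)

print("geo:    threshold", thresholds[geo_idx], "at", (fpr[geo_idx], tpr[geo_idx]))
print("youden: threshold", thresholds[youden_idx], "at", (fpr[youden_idx], tpr[youden_idx]))
```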

dython/nominal.py

Lines changed: 2 additions & 7 deletions
@@ -13,10 +13,10 @@
 from collections import Counter
 from matplotlib.colors import Colormap
 from matplotlib.axes._axes import Axes
-from typing import Any, Callable, Iterable, Literal, TypedDict, cast, overload
+from typing import Any, Callable, Iterable, Literal, cast, overload
 from ._private import convert, remove_incomplete_samples, replace_nan_with_value, plot_or_not
 from .data_utils import identify_columns_by_type
-from .typing import Number, OneDimArray, TwoDimArray
+from .typing import Number, OneDimArray, TwoDimArray, AssociationsResult
 
 
 __all__ = [
@@ -53,11 +53,6 @@
 NomNomAssocStr = Literal["cramer", "theil"]
 
 
-class AssociationsResult(TypedDict):
-    corr: pd.DataFrame
-    ax: Axes | None
-
-
 def _inf_nan_str(x: Number) -> str:
     if np.isnan(x):
         return "NaN"
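The relocated `AssociationsResult` is the shape returned by `nominal.associations`; here is a brief, hedged sketch of how its two keys are typically consumed (the DataFrame, column names, and the `plot` flag follow the dython docs, but the data is made up):

```python
import pandas as pd
from dython.nominal import associations

df = pd.DataFrame({
    "color": ["red", "blue", "red", "green"],
    "size":  [1, 2, 2, 3],
})

res = associations(df, plot=False)  # an AssociationsResult-shaped dict
corr = res["corr"]                  # pandas DataFrame of pairwise association strengths
ax = res["ax"]                      # Matplotlib Axes (may be None, per the Axes | None annotation)
print(corr)
```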

dython/typing.py

Lines changed: 23 additions & 1 deletion
@@ -1,8 +1,30 @@
 import numpy as np
 import pandas as pd
-from typing import Sequence, Any
+from typing import Sequence, Any, TypedDict, Protocol
+from matplotlib.axes._axes import Axes
 
 
 Number = int | float
 OneDimArray = Sequence[Number | str] | pd.Series | np.ndarray[Any, np.dtype[np.int64] | np.dtype[np.float64] | np.dtype[np.str_]]
 TwoDimArray = np.ndarray[Any, np.dtype[np.int64] | np.dtype[np.float64] | np.dtype[np.str_]] | pd.DataFrame
+
+
+class AssociationsResult(TypedDict):
+    corr: pd.DataFrame
+    ax: Axes | None
+
+
+class SingleMethodResult(TypedDict):
+    x: float
+    y: float
+    val: float
+
+
+class SingleCurveResult(Protocol):
+    auc: dict[str, float]
+    def __getitem__(self, key: str) -> SingleMethodResult: ...
+
+
+class MetricGraphResult(TypedDict):
+    metrics: dict[str, SingleCurveResult] | SingleCurveResult
+    ax: Axes
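A short sketch of how these types are meant to be consumed together; the helper below is hypothetical (not part of dython) and only illustrates navigating a `MetricGraphResult`:

```python
from dython.typing import MetricGraphResult

def youden_threshold(result: MetricGraphResult, class_name: str | None = None) -> float | None:
    """Hypothetical helper: pull the Youden's J threshold value out of a metric_graph result."""
    metrics = result["metrics"]
    # Multiclass results nest one curve per class under `metrics`;
    # a single-class result stores the curve's keys directly under `metrics`.
    curve = metrics if class_name is None else metrics[class_name]  # type: ignore[index]
    try:
        return curve["youden_j"]["val"]
    except KeyError:  # e.g. a precision-recall curve, which only has the "geo" method
        return None
```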

setup.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 EXTRAS_REQUIRE = {"dev": [s.strip() for s in dev_requirements.split("\n")]}
 
 min_minor = 10
-max_minor = 13
+max_minor = 14
 CLASSIFIERS = [
     f"Programming Language :: Python :: 3.{str(v)}" for v in range(min_minor, max_minor+1)
 ]
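For reference, the comprehension above expands to one Trove classifier per supported minor version, so this one-line bump extends the metadata through Python 3.14 (a quick local check, not output taken from the repo):

```python
min_minor, max_minor = 10, 14
print([f"Programming Language :: Python :: 3.{v}" for v in range(min_minor, max_minor + 1)])
# ['Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11',
#  'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13',
#  'Programming Language :: Python :: 3.14']
```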
