Skip to content

Commit 2450124

Browse files
[ENH] min_max_scale drop old_min and old_max to fit sklearn's method API (#1107)
* Fit sklearn MinMaxScaler's arguments * lint codes * Add typing annotations for column_name * Use `new_min` and `new_max` more readable * rewrite error checking * test it * lint codes * Update CHANGELOG.md * to keep line words less than 80 * test column_name type is int or str condition * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * lint codes * Drop tuple type and result type from int to float * Should be list not tuple * Support pandas.Index type * Raises error for old arguments * Update CHANGELOG.md * Simplify a bit * Update the style of example from string to codes * Add `copy` to avoid mutating the original data Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 6cd5ef2 commit 2450124

File tree

3 files changed

+193
-143
lines changed

3 files changed

+193
-143
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
- [ENH] New decorator `deprecated_kwargs` for breaking API. #1103 @Zeroto521
88
- [ENH] Extend select_columns to support non-string columns. #1105 @samukweku
99
- [ENH] Performance improvement for groupby_topk. #1093 @samukweku
10+
- [ENH] `min_max_scale` drops `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521
1011

1112
## [v0.23.1] - 2022-05-03
1213

janitor/functions/min_max_scale.py

Lines changed: 124 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -1,116 +1,124 @@
1-
import pandas_flavor as pf
2-
import pandas as pd
3-
4-
from janitor.utils import deprecated_alias
5-
6-
7-
@pf.register_dataframe_method
8-
@deprecated_alias(col_name="column_name")
9-
def min_max_scale(
10-
df: pd.DataFrame,
11-
old_min=None,
12-
old_max=None,
13-
column_name=None,
14-
new_min=0,
15-
new_max=1,
16-
) -> pd.DataFrame:
17-
"""
18-
Scales data to between a minimum and maximum value.
19-
20-
This method mutates the original DataFrame.
21-
22-
If `minimum` and `maximum` are provided, the true min/max of the
23-
`DataFrame` or column is ignored in the scaling process and replaced with
24-
these values, instead.
25-
26-
One can optionally set a new target minimum and maximum value using the
27-
`new_min` and `new_max` keyword arguments. This will result in the
28-
transformed data being bounded between `new_min` and `new_max`.
29-
30-
If a particular column name is specified, then only that column of data
31-
are scaled. Otherwise, the entire dataframe is scaled.
32-
33-
Method chaining syntax:
34-
35-
```python
36-
df = pd.DataFrame(...).min_max_scale(column_name="a")
37-
```
38-
39-
Setting custom minimum and maximum:
40-
41-
```python
42-
df = (
43-
pd.DataFrame(...)
44-
.min_max_scale(
45-
column_name="a",
46-
new_min=2,
47-
new_max=10
48-
)
49-
)
50-
```
51-
52-
Setting a min and max that is not based on the data, while applying to
53-
entire dataframe:
54-
55-
56-
```python
57-
df = (
58-
pd.DataFrame(...)
59-
.min_max_scale(
60-
old_min=0,
61-
old_max=14,
62-
new_min=0,
63-
new_max=1,
64-
)
65-
)
66-
```
67-
68-
The aforementioned example might be applied to something like scaling the
69-
isoelectric points of amino acids. While technically they range from
70-
approx 3-10, we can also think of them on the pH scale which ranges from
71-
1 to 14. Hence, 3 gets scaled not to 0 but approx. 0.15 instead, while 10
72-
gets scaled to approx. 0.69 instead.
73-
74-
:param df: A pandas DataFrame.
75-
:param old_min: (optional) Overrides for the current minimum
76-
value of the data to be transformed.
77-
:param old_max: (optional) Overrides for the current maximum
78-
value of the data to be transformed.
79-
:param new_min: (optional) The minimum value of the data after
80-
it has been scaled.
81-
:param new_max: (optional) The maximum value of the data after
82-
it has been scaled.
83-
:param column_name: (optional) The column on which to perform scaling.
84-
:returns: A pandas DataFrame with scaled data.
85-
:raises ValueError: if `old_max` is not greater than `old_min``.
86-
:raises ValueError: if `new_max` is not greater than `new_min``.
87-
"""
88-
if (
89-
(old_min is not None)
90-
and (old_max is not None)
91-
and (old_max <= old_min)
92-
):
93-
raise ValueError("`old_max` should be greater than `old_min`")
94-
95-
if new_max <= new_min:
96-
raise ValueError("`new_max` should be greater than `new_min`")
97-
98-
new_range = new_max - new_min
99-
100-
if column_name:
101-
if old_min is None:
102-
old_min = df[column_name].min()
103-
if old_max is None:
104-
old_max = df[column_name].max()
105-
old_range = old_max - old_min
106-
df[column_name] = (
107-
df[column_name] - old_min
108-
) * new_range / old_range + new_min
109-
else:
110-
if old_min is None:
111-
old_min = df.min().min()
112-
if old_max is None:
113-
old_max = df.max().max()
114-
old_range = old_max - old_min
115-
df = (df - old_min) * new_range / old_range + new_min
116-
return df
1+
from __future__ import annotations
2+
3+
import pandas_flavor as pf
4+
import pandas as pd
5+
6+
from janitor.utils import deprecated_alias
7+
from janitor.utils import deprecated_kwargs
8+
9+
10+
@pf.register_dataframe_method
@deprecated_kwargs(
    "old_min",
    "old_max",
    "new_min",
    "new_max",
    message=(
        "The keyword argument {argument!r} of {func_name!r} is deprecated. "
        "Please use 'feature_range' instead."
    ),
)
@deprecated_alias(col_name="column_name")
def min_max_scale(
    df: pd.DataFrame,
    feature_range: tuple[int | float, int | float] = (0, 1),
    column_name: str | int | list[str | int] | pd.Index | None = None,
) -> pd.DataFrame:
    """
    Scales data to between a minimum and maximum value.

    This method does not mutate the original DataFrame; a scaled copy is
    returned.

    One can optionally set a new target minimum and maximum value using the
    `feature_range` keyword argument. This will result in the transformed
    data being bounded between `feature_range[0]` and `feature_range[1]`.

    If `column_name` is specified, then only the selected column(s) are
    scaled. A single label scales that column against its own min/max; a
    list or `pandas.Index` of labels scales each selected column against
    its own min/max. Otherwise, the entire dataframe is scaled against the
    global min/max over all of its columns.

    Example: Basic usage.

        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
        >>> df.min_max_scale()
             a    b
        0  0.5  0.0
        1  1.0  0.5

    Example: Setting custom minimum and maximum.

        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
        >>> df.min_max_scale(feature_range=(0, 100))
               a     b
        0   50.0   0.0
        1  100.0  50.0

    Example: Apply min-max to the selected columns.

        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
        >>> df.min_max_scale(feature_range=(0, 100), column_name=['a', 'b'])
               a      b
        0    0.0    0.0
        1  100.0  100.0
        >>> df.min_max_scale(feature_range=(0, 100), column_name='a')
               a  b
        0    0.0  0
        1  100.0  1

    :param df: A pandas DataFrame.
    :param feature_range: (optional) Desired range of transformed data,
        given as a two-element tuple or list `(new_min, new_max)`.
    :param column_name: (optional) The column(s) on which to perform
        scaling.
    :returns: A pandas DataFrame with scaled data.
    :raises ValueError: if `feature_range` isn't a tuple or list.
    :raises ValueError: if the length of `feature_range` isn't equal to two.
    :raises ValueError: if the element of `feature_range` isn't number type.
    :raises ValueError: if `feature_range[1]` <= `feature_range[0]`.
    """

    # All four requirements are checked together and reported through one
    # error; `and` short-circuits, so the comparison only runs on numbers.
    if not (
        isinstance(feature_range, (tuple, list))
        and len(feature_range) == 2
        and all((isinstance(i, (int, float))) for i in feature_range)
        and feature_range[1] > feature_range[0]
    ):
        raise ValueError(
            "`feature_range` should be a tuple or list of two numbers, "
            "the second element must be greater than the first one"
        )

    new_min, new_max = feature_range
    new_range = new_max - new_min

    # NOTE(review): when the selected data is constant, `old_range` is zero
    # and the division below presumably yields NaN -- confirm whether that
    # is the intended result.
    if column_name is not None:
        # For a list/Index selection these are per-column Series, so the
        # arithmetic below aligns on column labels.
        old_min = df[column_name].min()
        old_max = df[column_name].max()
        old_range = old_max - old_min

        # Work on a copy so that the caller's DataFrame is left untouched.
        df = df.copy()
        df[column_name] = (
            df[column_name] - old_min
        ) * new_range / old_range + new_min
    else:
        # Global extremes over every column of the frame.
        old_min = df.min().min()
        old_max = df.max().max()
        old_range = old_max - old_min

        # The arithmetic builds a new DataFrame, so no explicit copy is
        # needed here.
        df = (df - old_min) * new_range / old_range + new_min

    return df

tests/functions/test_min_max_scale.py

Lines changed: 68 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,68 @@
1-
import pytest
2-
3-
4-
@pytest.mark.functions
5-
def test_min_max_scale(dataframe):
6-
df = dataframe.min_max_scale(column_name="a")
7-
assert df["a"].min() == 0
8-
assert df["a"].max() == 1
9-
10-
11-
@pytest.mark.functions
12-
def test_min_max_scale_custom_new_min_max(dataframe):
13-
df = dataframe.min_max_scale(column_name="a", new_min=1, new_max=2)
14-
assert df["a"].min() == 1
15-
assert df["a"].max() == 2
16-
17-
18-
@pytest.mark.functions
19-
def test_min_max_old_min_max_errors(dataframe):
20-
with pytest.raises(ValueError):
21-
dataframe.min_max_scale(column_name="a", old_min=10, old_max=0)
22-
23-
24-
@pytest.mark.functions
25-
def test_min_max_new_min_max_errors(dataframe):
26-
with pytest.raises(ValueError):
27-
dataframe.min_max_scale(column_name="a", new_min=10, new_max=0)
1+
import pandas as pd
2+
import pytest
3+
4+
5+
@pytest.mark.functions
@pytest.mark.parametrize(
    "df, column_name, expected",
    [
        # test default parameter
        (
            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
            None,
            pd.DataFrame({"a": [0.5, 1], "b": [0, 0.5]}),
        ),
        # test list condition
        (
            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
            ["a", "b"],
            pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
        ),
        # test Index condition
        (
            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
            pd.Index(["a", "b"]),
            pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
        ),
        # test str condition
        (
            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
            "a",
            pd.DataFrame({"a": [0, 1.0], "b": [0, 5]}),
        ),
        # test int condition
        (
            pd.DataFrame({1: [5, 10], "b": [0, 5]}),
            1,
            pd.DataFrame({1: [0, 1.0], "b": [0, 5]}),
        ),
    ],
)
def test_min_max_scale_column_name(df, column_name, expected):
    """Check scaling for every supported kind of ``column_name``.

    Covers the default (whole frame), list, ``pd.Index``, ``str`` and
    ``int`` selections, and verifies that the input frame is not mutated.
    """
    # Snapshot the input so that accidental in-place mutation is detected.
    original = df.copy()

    result = df.min_max_scale(column_name=column_name)

    # assert_frame_equal reports which values/dtypes differ on failure.
    pd.testing.assert_frame_equal(result, expected)
    pd.testing.assert_frame_equal(df, original)
45+
46+
47+
@pytest.mark.functions
def test_min_max_scale_custom_new_min_max(dataframe):
    """Scaling column ``a`` to (1, 2) pins its extremes to those bounds."""
    scaled_column = dataframe.min_max_scale(
        column_name="a",
        feature_range=(1, 2),
    )["a"]

    assert scaled_column.min() == 1
    assert scaled_column.max() == 2
52+
53+
54+
@pytest.mark.functions
@pytest.mark.parametrize(
    "feature_range",
    [
        pytest.param(range(2), id="range-object"),
        pytest.param((1, 2, 3), id="three-elements"),
        pytest.param(("1", 2), id="tuple-str-first"),
        pytest.param([1, "2"], id="list-str-second"),
        pytest.param(["1", "2"], id="list-all-str"),
        pytest.param([2, 1], id="descending-bounds"),
    ],
)
def test_min_max_new_min_max_errors(dataframe, feature_range):
    """Each malformed ``feature_range`` must be rejected with ValueError."""
    with pytest.raises(ValueError):
        dataframe.min_max_scale(feature_range=feature_range)

0 commit comments

Comments
 (0)