[ENH] min_max_scale drops old_min and old_max to fit sklearn's method API (#1107)

Zeroto521 · pre-commit-ci[bot] · web-flow · commit 245012443d01 · 2022-06-01T21:10:30.000-04:00
* Fit sklearn MinMaxScaler's arguments * lint codes * Add typing annotations for column_name * Use `new_min` and `new_max` more readable * rewrite error checking * test it * lint codes * Update CHANGELOG.md * to keep line words less than 80 * test column_name type is int or str condition * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * lint codes * Drop tuple type and result type from int to float * Should be list not tuple * Support pandas.Index type * Raises error for old arguments * Update CHANGELOG.md * Simplify a bit * Update the style of example from string to codes * Add `copy` to avoid mutating the original data Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,7 @@
 -   [ENH] New decorator `deprecated_kwargs` for breaking API. #1103 @Zeroto521
 -   [ENH] Extend select_columns to support non-string columns. #1105 @samukweku
 -   [ENH] Performance improvement for groupby_topk. #1093 @samukweku
+-   [ENH] `min_max_scale` drops `old_min` and `old_max` to fit sklearn's method API. Issue #1068 @Zeroto521
 
 ## [v0.23.1] - 2022-05-03
 
diff --git a/janitor/functions/min_max_scale.py b/janitor/functions/min_max_scale.py
@@ -1,116 +1,124 @@
-import pandas_flavor as pf
-import pandas as pd
-
-from janitor.utils import deprecated_alias
-
-
-@pf.register_dataframe_method
-@deprecated_alias(col_name="column_name")
-def min_max_scale(
-    df: pd.DataFrame,
-    old_min=None,
-    old_max=None,
-    column_name=None,
-    new_min=0,
-    new_max=1,
-) -> pd.DataFrame:
-    """
-    Scales data to between a minimum and maximum value.
-
-    This method mutates the original DataFrame.
-
-    If `minimum` and `maximum` are provided, the true min/max of the
-    `DataFrame` or column is ignored in the scaling process and replaced with
-    these values, instead.
-
-    One can optionally set a new target minimum and maximum value using the
-    `new_min` and `new_max` keyword arguments. This will result in the
-    transformed data being bounded between `new_min` and `new_max`.
-
-    If a particular column name is specified, then only that column of data
-    are scaled. Otherwise, the entire dataframe is scaled.
-
-    Method chaining syntax:
-
-    ```python
-        df = pd.DataFrame(...).min_max_scale(column_name="a")
-    ```
-
-    Setting custom minimum and maximum:
-
-    ```python
-        df = (
-            pd.DataFrame(...)
-            .min_max_scale(
-                column_name="a",
-                new_min=2,
-                new_max=10
-            )
-        )
-    ```
-
-    Setting a min and max that is not based on the data, while applying to
-    entire dataframe:
-
-
-    ```python
-        df = (
-            pd.DataFrame(...)
-            .min_max_scale(
-                old_min=0,
-                old_max=14,
-                new_min=0,
-                new_max=1,
-            )
-        )
-    ```
-
-    The aforementioned example might be applied to something like scaling the
-    isoelectric points of amino acids. While technically they range from
-    approx 3-10, we can also think of them on the pH scale which ranges from
-    1 to 14. Hence, 3 gets scaled not to 0 but approx. 0.15 instead, while 10
-    gets scaled to approx. 0.69 instead.
-
-    :param df: A pandas DataFrame.
-    :param old_min: (optional) Overrides for the current minimum
-        value of the data to be transformed.
-    :param old_max: (optional) Overrides for the current maximum
-        value of the data to be transformed.
-    :param new_min: (optional) The minimum value of the data after
-        it has been scaled.
-    :param new_max: (optional) The maximum value of the data after
-        it has been scaled.
-    :param column_name: (optional) The column on which to perform scaling.
-    :returns: A pandas DataFrame with scaled data.
-    :raises ValueError: if `old_max` is not greater than `old_min``.
-    :raises ValueError: if `new_max` is not greater than `new_min``.
-    """
-    if (
-        (old_min is not None)
-        and (old_max is not None)
-        and (old_max <= old_min)
-    ):
-        raise ValueError("`old_max` should be greater than `old_min`")
-
-    if new_max <= new_min:
-        raise ValueError("`new_max` should be greater than `new_min`")
-
-    new_range = new_max - new_min
-
-    if column_name:
-        if old_min is None:
-            old_min = df[column_name].min()
-        if old_max is None:
-            old_max = df[column_name].max()
-        old_range = old_max - old_min
-        df[column_name] = (
-            df[column_name] - old_min
-        ) * new_range / old_range + new_min
-    else:
-        if old_min is None:
-            old_min = df.min().min()
-        if old_max is None:
-            old_max = df.max().max()
-        old_range = old_max - old_min
-        df = (df - old_min) * new_range / old_range + new_min
-    return df
+from __future__ import annotations
+
+import pandas_flavor as pf
+import pandas as pd
+
+from janitor.utils import deprecated_alias
+from janitor.utils import deprecated_kwargs
+
+
+@pf.register_dataframe_method
+@deprecated_kwargs(
+    "old_min",
+    "old_max",
+    "new_min",
+    "new_max",
+    message=(
+        "The keyword argument {argument!r} of {func_name!r} is deprecated. "
+        "Please use 'feature_range' instead."
+    ),
+)
+@deprecated_alias(col_name="column_name")
+def min_max_scale(
+    df: pd.DataFrame,
+    feature_range: tuple[int | float, int | float] = (0, 1),
+    column_name: str | int | list[str | int] | pd.Index | None = None,
+) -> pd.DataFrame:
+    """
+    Scales data to between a minimum and maximum value.
+
+    This method does not mutate the original DataFrame.
+
+    The scaling always uses the true min/max of the data: each selected
+    column's own min/max when `column_name` is given, otherwise the min/max
+    of the entire `DataFrame`.
+
+    One can optionally set a new target minimum and maximum value using the
+    `feature_range` keyword argument, given as a `(min, max)` pair.
+    This will result in the transformed data being bounded between
+    `feature_range[0]` and `feature_range[1]`.
+
+    If a column name (or list of column names) is specified, then only those
+    columns are scaled. Otherwise, the entire dataframe is scaled.
+
+    Example: Basic usage.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
+        >>> df.min_max_scale()
+             a    b
+        0  0.5  0.0
+        1  1.0  0.5
+
+    Example: Setting custom minimum and maximum.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
+        >>> df.min_max_scale(feature_range=(0, 100))
+               a     b
+        0   50.0   0.0
+        1  100.0  50.0
+
+    Example: Apply min-max to the selected columns.
+
+        >>> import pandas as pd
+        >>> import janitor
+        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
+        >>> df.min_max_scale(feature_range=(0, 100), column_name=['a', 'b'])
+               a      b
+        0    0.0    0.0
+        1  100.0  100.0
+        >>> df.min_max_scale(feature_range=(0, 100), column_name='a')
+               a  b
+        0    0.0  0
+        1  100.0  1
+
+    Because the bounds always come from the data itself, this function cannot
+    scale values relative to an external reference range. For example, the
+    isoelectric points of amino acids range from approx 3-10; they will be
+    scaled against that observed range, and not against the full pH scale of
+    1 to 14.
+
+    :param df: A pandas DataFrame.
+    :param feature_range: (optional) Desired range of transformed data.
+    :param column_name: (optional) The column on which to perform scaling.
+    :returns: A pandas DataFrame with scaled data.
+    :raises ValueError: if `feature_range` isn't a tuple or list.
+    :raises ValueError: if the length of `feature_range` isn't equal to two.
+    :raises ValueError: if any element of `feature_range` isn't a number.
+    :raises ValueError: if `feature_range[1]` <= `feature_range[0]`.
+    """
+
+    if not (
+        isinstance(feature_range, (tuple, list))
+        and len(feature_range) == 2
+        and all((isinstance(i, (int, float))) for i in feature_range)
+        and feature_range[1] > feature_range[0]
+    ):
+        raise ValueError(
+            "`feature_range` should be a tuple or list of two numbers, "
+            "the second element must be greater than the first one"
+        )
+
+    new_min, new_max = feature_range
+    new_range = new_max - new_min
+
+    if column_name is not None:
+        old_min = df[column_name].min()
+        old_max = df[column_name].max()
+        old_range = old_max - old_min
+        # Work on a copy so the caller's DataFrame is left untouched.
+        df = df.copy()
+        df[column_name] = (
+            df[column_name] - old_min
+        ) * new_range / old_range + new_min
+    else:
+        old_min = df.min().min()
+        old_max = df.max().max()
+        old_range = old_max - old_min
+        # The arithmetic builds a new DataFrame, so no explicit copy is needed.
+        df = (df - old_min) * new_range / old_range + new_min
+
+    return df
diff --git a/tests/functions/test_min_max_scale.py b/tests/functions/test_min_max_scale.py
@@ -1,27 +1,68 @@
-import pytest
-
-
-@pytest.mark.functions
-def test_min_max_scale(dataframe):
-    df = dataframe.min_max_scale(column_name="a")
-    assert df["a"].min() == 0
-    assert df["a"].max() == 1
-
-
-@pytest.mark.functions
-def test_min_max_scale_custom_new_min_max(dataframe):
-    df = dataframe.min_max_scale(column_name="a", new_min=1, new_max=2)
-    assert df["a"].min() == 1
-    assert df["a"].max() == 2
-
-
-@pytest.mark.functions
-def test_min_max_old_min_max_errors(dataframe):
-    with pytest.raises(ValueError):
-        dataframe.min_max_scale(column_name="a", old_min=10, old_max=0)
-
-
-@pytest.mark.functions
-def test_min_max_new_min_max_errors(dataframe):
-    with pytest.raises(ValueError):
-        dataframe.min_max_scale(column_name="a", new_min=10, new_max=0)
+import pandas as pd
+import pytest
+
+
+@pytest.mark.functions
+@pytest.mark.parametrize(
+    "df, column_name, expected",
+    [
+        # default: no column selected, scale by the frame-wide min/max
+        (
+            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
+            None,
+            pd.DataFrame({"a": [0.5, 1], "b": [0, 0.5]}),
+        ),
+        # list of labels: each listed column is scaled by its own min/max
+        (
+            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
+            ["a", "b"],
+            pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
+        ),
+        # pandas.Index of labels behaves like a list of labels
+        (
+            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
+            pd.Index(["a", "b"]),
+            pd.DataFrame({"a": [0, 1.0], "b": [0, 1.0]}),
+        ),
+        # single str label: only that column is scaled
+        (
+            pd.DataFrame({"a": [5, 10], "b": [0, 5]}),
+            "a",
+            pd.DataFrame({"a": [0, 1.0], "b": [0, 5]}),
+        ),
+        # single int label: only that column is scaled
+        (
+            pd.DataFrame({1: [5, 10], "b": [0, 5]}),
+            1,
+            pd.DataFrame({1: [0, 1.0], "b": [0, 5]}),
+        ),
+    ],
+)
+def test_min_max_scale_column_name(df, column_name, expected):
+    result = df.min_max_scale(column_name=column_name)
+    # assert_frame_equal reports which values/dtypes differ on failure.
+    pd.testing.assert_frame_equal(result, expected, check_exact=True)
+
+
+@pytest.mark.functions
+def test_min_max_scale_custom_new_min_max(dataframe):
+    """Scaling column ``a`` into (1, 2) yields exactly those extrema."""
+    col = dataframe.min_max_scale(feature_range=(1, 2), column_name="a")["a"]
+    assert (col.min(), col.max()) == (1, 2)
+
+
+@pytest.mark.functions
+@pytest.mark.parametrize(
+    "feature_range",
+    [
+        range(2),  # neither a tuple nor a list
+        (1, 2, 3),  # length is not two
+        ("1", 2),  # first element is not a number
+        [1, "2"],  # second element is not a number
+        ["1", "2"],  # neither element is a number
+        [2, 1],  # second element is not greater than the first
+    ],
+)
+def test_min_max_new_min_max_errors(dataframe, feature_range):
+    with pytest.raises(ValueError):
+        dataframe.min_max_scale(feature_range=feature_range)