|
1 |
| -import pandas_flavor as pf |
2 |
| -import pandas as pd |
3 |
| - |
4 |
| -from janitor.utils import deprecated_alias |
5 |
| - |
6 |
| - |
7 |
| -@pf.register_dataframe_method |
8 |
| -@deprecated_alias(col_name="column_name") |
9 |
| -def min_max_scale( |
10 |
| - df: pd.DataFrame, |
11 |
| - old_min=None, |
12 |
| - old_max=None, |
13 |
| - column_name=None, |
14 |
| - new_min=0, |
15 |
| - new_max=1, |
16 |
| -) -> pd.DataFrame: |
17 |
| - """ |
18 |
| - Scales data to between a minimum and maximum value. |
19 |
| -
|
20 |
| - This method mutates the original DataFrame. |
21 |
| -
|
22 |
| - If `old_min` and `old_max` are provided, the true min/max of the |
23 |
| - `DataFrame` or column is ignored in the scaling process and replaced with |
24 |
| - these values, instead. |
25 |
| -
|
26 |
| - One can optionally set a new target minimum and maximum value using the |
27 |
| - `new_min` and `new_max` keyword arguments. This will result in the |
28 |
| - transformed data being bounded between `new_min` and `new_max`. |
29 |
| -
|
30 |
| - If a particular column name is specified, then only that column of data |
31 |
| - are scaled. Otherwise, the entire dataframe is scaled. |
32 |
| -
|
33 |
| - Method chaining syntax: |
34 |
| -
|
35 |
| - ```python |
36 |
| - df = pd.DataFrame(...).min_max_scale(column_name="a") |
37 |
| - ``` |
38 |
| -
|
39 |
| - Setting custom minimum and maximum: |
40 |
| -
|
41 |
| - ```python |
42 |
| - df = ( |
43 |
| - pd.DataFrame(...) |
44 |
| - .min_max_scale( |
45 |
| - column_name="a", |
46 |
| - new_min=2, |
47 |
| - new_max=10 |
48 |
| - ) |
49 |
| - ) |
50 |
| - ``` |
51 |
| -
|
52 |
| - Setting a min and max that is not based on the data, while applying to |
53 |
| - entire dataframe: |
54 |
| -
|
55 |
| -
|
56 |
| - ```python |
57 |
| - df = ( |
58 |
| - pd.DataFrame(...) |
59 |
| - .min_max_scale( |
60 |
| - old_min=0, |
61 |
| - old_max=14, |
62 |
| - new_min=0, |
63 |
| - new_max=1, |
64 |
| - ) |
65 |
| - ) |
66 |
| - ``` |
67 |
| -
|
68 |
| - The aforementioned example might be applied to something like scaling the |
69 |
| - isoelectric points of amino acids. While technically they range from |
70 |
| - approx 3-10, we can also think of them on the pH scale which ranges from |
71 |
| - 1 to 14. Hence, 3 gets scaled not to 0 but approx. 0.15 instead, while 10 |
72 |
| - gets scaled to approx. 0.69 instead. |
73 |
| -
|
74 |
| - :param df: A pandas DataFrame. |
75 |
| - :param old_min: (optional) Overrides for the current minimum |
76 |
| - value of the data to be transformed. |
77 |
| - :param old_max: (optional) Overrides for the current maximum |
78 |
| - value of the data to be transformed. |
79 |
| - :param new_min: (optional) The minimum value of the data after |
80 |
| - it has been scaled. |
81 |
| - :param new_max: (optional) The maximum value of the data after |
82 |
| - it has been scaled. |
83 |
| - :param column_name: (optional) The column on which to perform scaling. |
84 |
| - :returns: A pandas DataFrame with scaled data. |
85 |
| - :raises ValueError: if `old_max` is not greater than `old_min`. |
86 |
| - :raises ValueError: if `new_max` is not greater than `new_min`. |
87 |
| - """ |
88 |
| - if ( |
89 |
| - (old_min is not None) |
90 |
| - and (old_max is not None) |
91 |
| - and (old_max <= old_min) |
92 |
| - ): |
93 |
| - raise ValueError("`old_max` should be greater than `old_min`") |
94 |
| - |
95 |
| - if new_max <= new_min: |
96 |
| - raise ValueError("`new_max` should be greater than `new_min`") |
97 |
| - |
98 |
| - new_range = new_max - new_min |
99 |
| - |
100 |
| - if column_name: |
101 |
| - if old_min is None: |
102 |
| - old_min = df[column_name].min() |
103 |
| - if old_max is None: |
104 |
| - old_max = df[column_name].max() |
105 |
| - old_range = old_max - old_min |
106 |
| - df[column_name] = ( |
107 |
| - df[column_name] - old_min |
108 |
| - ) * new_range / old_range + new_min |
109 |
| - else: |
110 |
| - if old_min is None: |
111 |
| - old_min = df.min().min() |
112 |
| - if old_max is None: |
113 |
| - old_max = df.max().max() |
114 |
| - old_range = old_max - old_min |
115 |
| - df = (df - old_min) * new_range / old_range + new_min |
116 |
| - return df |
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +import pandas_flavor as pf |
| 4 | +import pandas as pd |
| 5 | + |
| 6 | +from janitor.utils import deprecated_alias |
| 7 | +from janitor.utils import deprecated_kwargs |
| 8 | + |
| 9 | + |
@pf.register_dataframe_method
@deprecated_kwargs(
    "old_min",
    "old_max",
    "new_min",
    "new_max",
    message=(
        "The keyword argument {argument!r} of {func_name!r} is deprecated. "
        "Please use 'feature_range' instead."
    ),
)
@deprecated_alias(col_name="column_name")
def min_max_scale(
    df: pd.DataFrame,
    feature_range: tuple[int | float, int | float] = (0, 1),
    column_name: str | int | list[str | int] | pd.Index | None = None,
) -> pd.DataFrame:
    """
    Scales data to between a minimum and maximum value.

    This method does not mutate the original DataFrame; a new DataFrame
    is returned.

    The target minimum and maximum are given by `feature_range[0]` and
    `feature_range[1]`. The transformed data are bounded between these
    two values.

    If `column_name` is specified, only the selected column(s) are scaled.
    When several columns are selected, each one is scaled using its own
    minimum and maximum. Otherwise, the entire dataframe is scaled using
    the minimum and maximum taken over all of its values.

    The `old_min`, `old_max`, `new_min` and `new_max` keyword arguments
    are deprecated; use `feature_range` instead.

    Example: Basic usage.

        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
        >>> df.min_max_scale()
             a    b
        0  0.5  0.0
        1  1.0  0.5

    Example: Setting custom minimum and maximum.

        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
        >>> df.min_max_scale(feature_range=(0, 100))
               a     b
        0   50.0   0.0
        1  100.0  50.0

    Example: Apply min-max to the selected columns.

        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame({'a':[1, 2], 'b':[0, 1]})
        >>> df.min_max_scale(feature_range=(0, 100), column_name=['a', 'b'])
               a      b
        0    0.0    0.0
        1  100.0  100.0
        >>> df.min_max_scale(feature_range=(0, 100), column_name='a')
               a  b
        0    0.0  0
        1  100.0  1

    :param df: A pandas DataFrame.
    :param feature_range: (optional) Desired range of transformed data,
        given as a `(minimum, maximum)` pair.
    :param column_name: (optional) The column(s) on which to perform
        scaling.
    :returns: A pandas DataFrame with scaled data.
    :raises ValueError: if `feature_range` isn't a tuple or a list.
    :raises ValueError: if the length of `feature_range` isn't equal to two.
    :raises ValueError: if an element of `feature_range` isn't an int or
        a float.
    :raises ValueError: if `feature_range[1]` <= `feature_range[0]`.
    """

    is_valid_range = (
        isinstance(feature_range, (tuple, list))
        and len(feature_range) == 2
        and all(isinstance(bound, (int, float)) for bound in feature_range)
        and feature_range[1] > feature_range[0]
    )
    if not is_valid_range:
        raise ValueError(
            "`feature_range` should be a range type contains number element, "
            "the second element must be greater than the first one"
        )

    new_min, new_max = feature_range
    new_range = new_max - new_min

    if column_name is not None:
        old_min = df[column_name].min()
        old_max = df[column_name].max()
        old_range = old_max - old_min

        # Work on a copy so the caller's DataFrame is left untouched.
        df = df.copy()
        df[column_name] = (
            df[column_name] - old_min
        ) * new_range / old_range + new_min
    else:
        # Scale against the extremes of the whole frame, not per column.
        old_min = df.min().min()
        old_max = df.max().max()
        old_range = old_max - old_min

        # The arithmetic builds a new DataFrame; the input is not modified.
        df = (df - old_min) * new_range / old_range + new_min

    return df
0 commit comments