-
Notifications
You must be signed in to change notification settings - Fork 27
Add LogScaler transformer #932
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
f1dbfa5
1f7541e
d1c054c
3f1bf3d
1c5289d
57e2a94
c7414ec
d5bc2ed
8c41306
d63495b
3c3b211
2e33bc0
e13e166
a1b7753
9ab83b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -651,7 +651,7 @@ | |
| * ``None``: Do nothing with the missing values on the reverse transform. Simply | ||
| pass whatever data we get through. | ||
| constant (float): | ||
| The constant to set as the 0-value for the log-based transform. Default to 0 | ||
| The constant to set as the 0-value for the log-based transform. Defaults to 0 | ||
| (do not modify the 0-value of the data). | ||
| invert (bool): | ||
| Whether to invert the data with respect to the constant value. If False, do not | ||
|
|
@@ -668,12 +668,19 @@ | |
| self, | ||
| missing_value_replacement='mean', | ||
| missing_value_generation='random', | ||
| constant: float = 0, | ||
| constant: float = 0.0, | ||
| invert: bool = False, | ||
| learn_rounding_scheme: bool = False, | ||
| ): | ||
| self.constant = constant | ||
| self.invert = invert | ||
| if isinstance(constant, float): | ||
|
||
| self.constant = constant | ||
| else: | ||
| raise ValueError('The constant parameter must be a float.') | ||
| if isinstance(invert, bool): | ||
| self.invert = invert | ||
| else: | ||
| raise ValueError('The invert parameter must be a bool.') | ||
|
|
||
| super().__init__( | ||
| missing_value_replacement=missing_value_replacement, | ||
| missing_value_generation=missing_value_generation, | ||
|
|
@@ -684,13 +691,13 @@ | |
| column_name = self.get_input_column() | ||
| if self.invert: | ||
| if not all(data < self.constant): | ||
| raise InvalidDataError( | ||
| f"Unable to apply a log transform to column '{column_name}' due to constant" | ||
| ' being too small.' | ||
| ) | ||
| else: | ||
| if not all(data > self.constant): | ||
| raise InvalidDataError( | ||
| f"Unable to apply a log transform to column '{column_name}' due to constant" | ||
| ' being too large.' | ||
| ) | ||
|
|
@@ -704,36 +711,37 @@ | |
| else: | ||
| self._validate_data(data) | ||
|
|
||
| def _log_transform(self, data): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can move self._validate_data here as well, no? |
||
| if self.invert: | ||
| return np.log(self.constant - data) | ||
| else: | ||
| return np.log(data - self.constant) | ||
|
|
||
| def _transform(self, data): | ||
| data = super()._transform(data) | ||
|
|
||
| if data.ndim > 1: | ||
| self._validate_data(data[:, 0]) | ||
|
||
| if self.invert: | ||
| data[:, 0] = np.log(self.constant - data[:, 0]) | ||
| else: | ||
| data[:, 0] = np.log(data[:, 0] - self.constant) | ||
| data[:, 0] = self._log_transform(data[:, 0]) | ||
| else: | ||
| self._validate_data(data) | ||
| if self.invert: | ||
| data = np.log(self.constant - data) | ||
| else: | ||
| data = np.log(data - self.constant) | ||
| data = self._log_transform(data) | ||
|
|
||
| return data | ||
|
|
||
| def _reverse_log(self, data): | ||
| if self.invert: | ||
| return self.constant - np.exp(data) | ||
| else: | ||
| return np.exp(data) + self.constant | ||
|
|
||
| def _reverse_transform(self, data): | ||
| if not isinstance(data, np.ndarray): | ||
| data = data.to_numpy() | ||
|
|
||
| if data.ndim > 1: | ||
| if self.invert: | ||
| data[:, 0] = self.constant - np.exp(data[:, 0]) | ||
| else: | ||
| data[:, 0] = np.exp(data[:, 0]) + self.constant | ||
| data[:, 0] = self._reverse_log(data[:, 0]) | ||
| else: | ||
| if self.invert: | ||
| data = self.constant - np.exp(data) | ||
| else: | ||
| data = np.exp(data) + self.constant | ||
| data = self._reverse_log(data) | ||
|
|
||
| return super()._reverse_transform(data) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -565,6 +565,7 @@ def test_out_of_bounds_reverse_transform(self): | |
|
|
||
| class TestLogScaler: | ||
| def test_learn_rounding(self): | ||
| """Test that transformer learns rounding scheme from data.""" | ||
| # Setup | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add short docstrings to these tests. |
||
| data = pd.DataFrame({'test': [1.0, np.nan, 1.5]}) | ||
| transformer = LogScaler( | ||
|
|
@@ -583,6 +584,7 @@ def test_learn_rounding(self): | |
| np.testing.assert_array_equal(reversed, expected) | ||
|
|
||
| def test_missing_value_generation_from_column(self): | ||
| """Test from_column missing value generation with nans present.""" | ||
| # Setup | ||
| data = pd.DataFrame({'test': [1.0, np.nan, 1.5]}) | ||
| transformer = LogScaler( | ||
|
|
@@ -599,13 +601,14 @@ def test_missing_value_generation_from_column(self): | |
| np.testing.assert_array_equal(reversed, data) | ||
|
|
||
| def test_missing_value_generation_random(self): | ||
| """Test random missing_value_generation with nans present.""" | ||
| # Setup | ||
| data = pd.DataFrame({'test': [1.0, np.nan, 1.5, 1.5]}) | ||
| transformer = LogScaler( | ||
| missing_value_generation='random', | ||
| missing_value_replacement='mode', | ||
| invert=True, | ||
| constant=3, | ||
| constant=3.0, | ||
| ) | ||
| expected = pd.DataFrame({'test': [np.nan, 1.5, 1.5, 1.5]}) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
"Default to 0" -> "Defaults to 0".
Also, either add the `` quotation marks around the 0, False, True values here, or remove them from the other values in the docstring, so it's consistent.