Skip to content
Closed
48 changes: 28 additions & 20 deletions rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@
* ``None``: Do nothing with the missing values on the reverse transform. Simply
pass whatever data we get through.
constant (float):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Default to 0" -> "Defaults to 0".

Also, either add the `` quotation marks around the 0, False, True values here, or remove them from the other values in the docstring, so it's consistent.

The constant to set as the 0-value for the log-based transform. Default to 0
The constant to set as the 0-value for the log-based transform. Defaults to 0
(do not modify the 0-value of the data).
invert (bool):
Whether to invert the data with respect to the constant value. If False, do not
Expand All @@ -668,12 +668,19 @@
self,
missing_value_replacement='mean',
missing_value_generation='random',
constant: float = 0,
constant: float = 0.0,
invert: bool = False,
learn_rounding_scheme: bool = False,
):
self.constant = constant
self.invert = invert
if isinstance(constant, float):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Integers should probably be fine as well right?

self.constant = constant
else:
raise ValueError('The constant parameter must be a float.')

Check warning on line 678 in rdt/transformers/numerical.py

View check run for this annotation

Codecov / codecov/patch

rdt/transformers/numerical.py#L678

Added line #L678 was not covered by tests
if isinstance(invert, bool):
self.invert = invert
else:
raise ValueError('The invert parameter must be a bool.')

Check warning on line 682 in rdt/transformers/numerical.py

View check run for this annotation

Codecov / codecov/patch

rdt/transformers/numerical.py#L682

Added line #L682 was not covered by tests

super().__init__(
missing_value_replacement=missing_value_replacement,
missing_value_generation=missing_value_generation,
Expand All @@ -684,13 +691,13 @@
column_name = self.get_input_column()
if self.invert:
if not all(data < self.constant):
raise InvalidDataError(

Check warning on line 694 in rdt/transformers/numerical.py

View check run for this annotation

Codecov / codecov/patch

rdt/transformers/numerical.py#L694

Added line #L694 was not covered by tests
f"Unable to apply a log transform to column '{column_name}' due to constant"
' being too small.'
)
else:
if not all(data > self.constant):
raise InvalidDataError(

Check warning on line 700 in rdt/transformers/numerical.py

View check run for this annotation

Codecov / codecov/patch

rdt/transformers/numerical.py#L700

Added line #L700 was not covered by tests
f"Unable to apply a log transform to column '{column_name}' due to constant"
' being too large.'
)
Expand All @@ -704,36 +711,37 @@
else:
self._validate_data(data)

def _log_transform(self, data):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can move self._validate_data here as well, no?

if self.invert:
return np.log(self.constant - data)
else:
return np.log(data - self.constant)

def _transform(self, data):
data = super()._transform(data)

if data.ndim > 1:
self._validate_data(data[:, 0])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would create a helper function for lines 711-715 like you did for _fit, so we don't repeat the code.

if self.invert:
data[:, 0] = np.log(self.constant - data[:, 0])
else:
data[:, 0] = np.log(data[:, 0] - self.constant)
data[:, 0] = self._log_transform(data[:, 0])
else:
self._validate_data(data)
if self.invert:
data = np.log(self.constant - data)
else:
data = np.log(data - self.constant)
data = self._log_transform(data)

return data

def _reverse_log(self, data):
if self.invert:
return self.constant - np.exp(data)
else:
return np.exp(data) + self.constant

def _reverse_transform(self, data):
if not isinstance(data, np.ndarray):
data = data.to_numpy()

if data.ndim > 1:
if self.invert:
data[:, 0] = self.constant - np.exp(data[:, 0])
else:
data[:, 0] = np.exp(data[:, 0]) + self.constant
data[:, 0] = self._reverse_log(data[:, 0])
else:
if self.invert:
data = self.constant - np.exp(data)
else:
data = np.exp(data) + self.constant
data = self._reverse_log(data)

return super()._reverse_transform(data)
2 changes: 1 addition & 1 deletion tests/integration/test_transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
'FloatFormatter': {'missing_value_generation': 'from_column'},
'GaussianNormalizer': {'missing_value_generation': 'from_column'},
'ClusterBasedNormalizer': {'missing_value_generation': 'from_column'},
'LogScaler': {'constant': INT64_MIN, 'missing_value_generation': 'from_column'},
'LogScaler': {'constant': float(INT64_MIN), 'missing_value_generation': 'from_column'},
}

# Mapping of rdt sdtype to dtype
Expand Down
5 changes: 4 additions & 1 deletion tests/integration/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,7 @@ def test_out_of_bounds_reverse_transform(self):

class TestLogScaler:
def test_learn_rounding(self):
"""Test that transformer learns rounding scheme from data."""
# Setup
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add short docstrings to these tests.

data = pd.DataFrame({'test': [1.0, np.nan, 1.5]})
transformer = LogScaler(
Expand All @@ -583,6 +584,7 @@ def test_learn_rounding(self):
np.testing.assert_array_equal(reversed, expected)

def test_missing_value_generation_from_column(self):
"""Test from_column missing value generation with nans present."""
# Setup
data = pd.DataFrame({'test': [1.0, np.nan, 1.5]})
transformer = LogScaler(
Expand All @@ -599,13 +601,14 @@ def test_missing_value_generation_from_column(self):
np.testing.assert_array_equal(reversed, data)

def test_missing_value_generation_random(self):
"""Test random missing_value_generation with nans present."""
# Setup
data = pd.DataFrame({'test': [1.0, np.nan, 1.5, 1.5]})
transformer = LogScaler(
missing_value_generation='random',
missing_value_replacement='mode',
invert=True,
constant=3,
constant=3.0,
)
expected = pd.DataFrame({'test': [np.nan, 1.5, 1.5, 1.5]})

Expand Down
26 changes: 21 additions & 5 deletions tests/unit/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1868,7 +1868,7 @@ def test__reverse_transform_missing_value_replacement_missing_value_replacement_

class TestLogScaler:
def test___init__super_attrs(self):
"""super() arguments are properly passed and set as attributes."""
"""Test super() arguments are properly passed and set as attributes."""
ls = LogScaler(
missing_value_generation='random',
learn_rounding_scheme=False,
Expand All @@ -1888,6 +1888,14 @@ def test___init__constant(self):
assert ls_set.constant == 2.5
assert ls_default.constant == 0.0

def test__init__validates_constant(self):
"""Test __init__ validates constat parameter."""
# Setup
message = 'The constant parameter must be a float.'
# Run and Assert
with pytest.raises(ValueError, match=message):
LogScaler(constant=2)

def test___init__invert(self):
"""Test invert parameter is set as an attribute."""
# Setup
Expand All @@ -1898,6 +1906,14 @@ def test___init__invert(self):
assert ls_set.invert
assert not ls_default.invert

def test__init__validates_invert(self):
"""Test __init__ validates constat parameter."""
# Setup
message = 'The invert parameter must be a bool.'
# Run and Assert
with pytest.raises(ValueError, match=message):
LogScaler(invert=2)

def test__validate_data(self):
"""Test the ``_validate_data`` method"""
# Setup
Expand Down Expand Up @@ -1987,7 +2003,7 @@ def test__transform(self):
def test__transform_invert(self):
"""Test the ``_transform`` method with ``invert=True``"""
# Setup
ls = LogScaler(constant=3, invert=True, missing_value_replacement='from_column')
ls = LogScaler(constant=3.0, invert=True, missing_value_replacement='from_column')
ls._validate_data = Mock()
ls.null_transformer = NullTransformer(
missing_value_replacement='mean', missing_value_generation='from_column'
Expand Down Expand Up @@ -2027,7 +2043,7 @@ def test__transform_null_values(self):
def test__transform_null_values_invert(self):
"""Test the ``_transform`` method with ``invert=True``"""
# Setup
ls = LogScaler(constant=3, invert=True, missing_value_replacement='from_column')
ls = LogScaler(constant=3.0, invert=True, missing_value_replacement='from_column')
ls._validate_data = Mock()
ls.null_transformer = NullTransformer(
missing_value_replacement='mean', missing_value_generation='from_column'
Expand Down Expand Up @@ -2117,7 +2133,7 @@ def test__reverse_transform_invert(self):
[0, 0, 1.0],
]).T
expected = pd.Series([0.1, 1.0, np.nan])
ls = LogScaler(constant=3, invert=True)
ls = LogScaler(constant=3.0, invert=True)
ls.null_transformer = NullTransformer(
missing_value_replacement='mean',
missing_value_generation='from_column',
Expand Down Expand Up @@ -2158,7 +2174,7 @@ def test__reverse_transform_invert_missing_value_generation(self):
# Setup
data = np.array([1.06471, 0.69315, 0])
expected = pd.Series([0.1, 1.0, 2.0])
ls = LogScaler(constant=3, invert=True)
ls = LogScaler(constant=3.0, invert=True)
ls.null_transformer = NullTransformer(None, missing_value_generation='random')

# Run
Expand Down
Loading