Skip to content
Closed
2 changes: 2 additions & 0 deletions rdt/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
ClusterBasedNormalizer,
FloatFormatter,
GaussianNormalizer,
LogScaler,
)
from rdt.transformers.pii.anonymizer import (
AnonymizedFaker,
Expand All @@ -46,6 +47,7 @@
'FrequencyEncoder',
'GaussianNormalizer',
'LabelEncoder',
'LogScaler',
'NullTransformer',
'OneHotEncoder',
'OptimizedTimestampEncoder',
Expand Down
121 changes: 120 additions & 1 deletion rdt/transformers/numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
import scipy

from rdt.errors import TransformerInputError
from rdt.errors import InvalidDataError, TransformerInputError
from rdt.transformers.base import BaseTransformer
from rdt.transformers.null import NullTransformer
from rdt.transformers.utils import learn_rounding_digits
Expand Down Expand Up @@ -626,3 +626,122 @@
recovered_data = np.stack([recovered_data, data[:, -1]], axis=1) # noqa: PD013

return super()._reverse_transform(recovered_data)


class LogScaler(FloatFormatter):
    """Transformer for numerical data using log.

    This transformer scales numerical values using log and an optional constant.

    Null values are replaced using a ``NullTransformer``.

    Args:
        missing_value_replacement (object):
            Indicate what to replace the null values with. If an integer or float is given,
            replace them with the given value. If the strings ``'mean'`` or ``'mode'``
            are given, replace them with the corresponding aggregation and if ``'random'``
            replace each null value with a random value in the data range.
            Defaults to ``'mean'``.
        missing_value_generation (str or None):
            The way missing values are being handled. There are three strategies:

            * ``'random'``: Randomly generates missing values based on the percentage of
              missing values.
            * ``'from_column'``: Creates a binary column that describes whether the original
              value was missing. Then use it to recreate missing values.
            * ``None``: Do nothing with the missing values on the reverse transform. Simply
              pass whatever data we get through.
        constant (int or float):
            The constant to set as the 0-value for the log-based transform. Defaults to ``0``
            (do not modify the 0-value of the data).
        invert (bool):
            Whether to invert the data with respect to the constant value. If ``False``, do not
            invert the data (all values will be greater than the constant value). If ``True``,
            invert the data (all the values will be less than the constant value).
            Defaults to ``False``.
        learn_rounding_scheme (bool):
            Whether or not to learn what place to round to based on the data seen during ``fit``.
            If ``True``, the data returned by ``reverse_transform`` will be rounded to that place.
            Defaults to ``False``.
    """

    def __init__(
        self,
        missing_value_replacement='mean',
        missing_value_generation='random',
        constant: float = 0.0,
        invert: bool = False,
        learn_rounding_scheme: bool = False,
    ):
        # Accept both ints and floats as the constant (an exact float check would
        # reject e.g. ``constant=0``). ``bool`` is excluded explicitly because it
        # is a subclass of ``int`` and would otherwise be silently accepted.
        if isinstance(constant, (int, float)) and not isinstance(constant, bool):
            self.constant = constant
        else:
            raise ValueError('The constant parameter must be a float.')

        if isinstance(invert, bool):
            self.invert = invert
        else:
            raise ValueError('The invert parameter must be a bool.')

        super().__init__(
            missing_value_replacement=missing_value_replacement,
            missing_value_generation=missing_value_generation,
            learn_rounding_scheme=learn_rounding_scheme,
        )

    def _validate_data(self, data: pd.Series):
        """Raise ``InvalidDataError`` if any value is on the wrong side of ``constant``.

        The log transform is only defined when every value is strictly below the
        constant (``invert=True``) or strictly above it (``invert=False``).
        """
        column_name = self.get_input_column()
        if self.invert:
            if not all(data < self.constant):
                raise InvalidDataError(
                    f"Unable to apply a log transform to column '{column_name}' due to constant"
                    ' being too small.'
                )
        else:
            if not all(data > self.constant):
                raise InvalidDataError(
                    f"Unable to apply a log transform to column '{column_name}' due to constant"
                    ' being too large.'
                )

    def _fit(self, data):
        """Fit the parent transformer, then validate the (possibly 2-column) output."""
        super()._fit(data)
        data = super()._transform(data)

        # ``super()._transform`` may append a null-flag column; only the first
        # column holds the numerical values to validate.
        if data.ndim > 1:
            self._validate_data(data[:, 0])
        else:
            self._validate_data(data)

    def _log_transform(self, data):
        """Validate and apply the log transform to a 1-D array of values."""
        # Validating here keeps the check in one place for every transform path.
        self._validate_data(data)
        if self.invert:
            return np.log(self.constant - data)

        return np.log(data - self.constant)

    def _transform(self, data):
        """Transform numerical data into its validated log-scaled representation."""
        data = super()._transform(data)

        if data.ndim > 1:
            data[:, 0] = self._log_transform(data[:, 0])
        else:
            data = self._log_transform(data)

        return data

    def _reverse_log(self, data):
        """Invert ``_log_transform``: exponentiate and undo the constant shift."""
        if self.invert:
            return self.constant - np.exp(data)

        return np.exp(data) + self.constant

    def _reverse_transform(self, data):
        """Map log-scaled data back to the original space, then reverse the parent."""
        if not isinstance(data, np.ndarray):
            data = data.to_numpy()

        if data.ndim > 1:
            data[:, 0] = self._reverse_log(data[:, 0])
        else:
            data = self._reverse_log(data)

        return super()._reverse_transform(data)
4 changes: 4 additions & 0 deletions tests/integration/test_transformers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import defaultdict

import numpy as np
import pandas as pd
import pytest

Expand All @@ -12,6 +13,8 @@

PRIMARY_SDTYPES = ['boolean', 'categorical', 'datetime', 'numerical']

INT64_MIN = np.iinfo(np.int64).min

# Additional arguments for transformers
TRANSFORMER_ARGS = {
'BinaryEncoder': {
Expand All @@ -23,6 +26,7 @@
'FloatFormatter': {'missing_value_generation': 'from_column'},
'GaussianNormalizer': {'missing_value_generation': 'from_column'},
'ClusterBasedNormalizer': {'missing_value_generation': 'from_column'},
'LogScaler': {'constant': float(INT64_MIN), 'missing_value_generation': 'from_column'},
}

# Mapping of rdt sdtype to dtype
Expand Down
59 changes: 59 additions & 0 deletions tests/integration/transformers/test_numerical.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
ClusterBasedNormalizer,
FloatFormatter,
GaussianNormalizer,
LogScaler,
)


Expand Down Expand Up @@ -560,3 +561,61 @@ def test_out_of_bounds_reverse_transform(self):

# Assert
assert isinstance(reverse, pd.DataFrame)


class TestLogScaler:
    def test_learn_rounding(self):
        """Test that the transformer learns the rounding scheme from the data."""
        # Setup
        data = pd.DataFrame({'test': [1.0, np.nan, 1.5]})
        transformer = LogScaler(
            missing_value_generation=None,
            missing_value_replacement='mean',
            learn_rounding_scheme=True,
        )
        expected = pd.DataFrame({'test': [1.0, 1.2, 1.5]})

        # Run
        transformer.fit(data, 'test')
        transformed = transformer.transform(data)
        # Avoid shadowing the ``reversed`` builtin.
        reversed_values = transformer.reverse_transform(transformed)

        # Assert
        np.testing.assert_array_equal(reversed_values, expected)

    def test_missing_value_generation_from_column(self):
        """Test from_column missing value generation with nans present."""
        # Setup
        data = pd.DataFrame({'test': [1.0, np.nan, 1.5]})
        transformer = LogScaler(
            missing_value_generation='from_column',
            missing_value_replacement='mean',
        )

        # Run
        transformer.fit(data, 'test')
        transformed = transformer.transform(data)
        reversed_values = transformer.reverse_transform(transformed)

        # Assert
        np.testing.assert_array_equal(reversed_values, data)

    def test_missing_value_generation_random(self):
        """Test random missing_value_generation with nans present."""
        # Setup
        data = pd.DataFrame({'test': [1.0, np.nan, 1.5, 1.5]})
        transformer = LogScaler(
            missing_value_generation='random',
            missing_value_replacement='mode',
            invert=True,
            constant=3.0,
        )
        expected = pd.DataFrame({'test': [np.nan, 1.5, 1.5, 1.5]})

        # Run
        transformer.fit(data, 'test')
        transformed = transformer.transform(data)
        reversed_values = transformer.reverse_transform(transformed)

        # Assert
        np.testing.assert_array_equal(reversed_values, expected)
Loading
Loading