sdv-dev
diff --git a/‎HISTORY.md‎
Lines changed: 26 additions & 0 deletions b/‎HISTORY.md‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎rdt/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎rdt/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎rdt/transformers/__init__.py‎
Lines changed: 10 additions & 6 deletions b/‎rdt/transformers/__init__.py‎
Lines changed: 10 additions & 6 deletions
diff --git a/‎rdt/transformers/base.py‎
Lines changed: 5 additions & 1 deletion b/‎rdt/transformers/base.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎rdt/transformers/categorical.py‎
Lines changed: 268 additions & 0 deletions b/‎rdt/transformers/categorical.py‎
Lines changed: 268 additions & 0 deletions
@@ -1,13 +1,39 @@
 # History
 
+## 1.7.0 - 2023-08-22
+
+This release adds 3 new transformers:
+
+1. `UniformEncoder` - A categorical and boolean transformer that converts the column into a uniform distribution.
+2. `OrderedUniformEncoder` - The same as above, but the order for the categories can be specified, changing which range in the uniform distribution each category belongs to.
+3. `IDGenerator` - A text transformer that drops the input column during transform and returns IDs during reverse transform. The IDs all take the form \<prefix>\<number>\<suffix> and can be configured with a custom prefix, suffix and starting point.
+
+Additionally, the `AnonymizedFaker` is enhanced to support the text sdtype. 
+
+### Deprecations
+
+* The `get_input_sdtype` method is being deprecated in favor of `get_supported_sdtypes`.
+
+### New Features
+
+* Create IDGenerator transformer - Issue [#675](https://github.com/sdv-dev/RDT/issues/675) by @R-Palazzo
+* Add UniformEncoder (and its ordered version) - Issue [#678](https://github.com/sdv-dev/RDT/issues/678) by @R-Palazzo
+* Allow me to use AnonymizedFaker with sdtype text columns - Issue [#688](https://github.com/sdv-dev/RDT/issues/688) by @amontanez24
+
+### Maintenance
+
+* Deprecate get_input_sdtype - Issue [#682](https://github.com/sdv-dev/RDT/issues/682) by @R-Palazzo
+
 ## 1.6.1 - 2023-08-02
 
 This release updates the default transformers used for certain sdtypes. It also enables the `AnonymizedFaker` and `PseudoAnonymizedFaker` to work with any sdtype besides boolean, categorical, datetime, numerical or text.
 
 ### Bugs
+
 * [Enterprise Usage] Unable to assign generic PII transformers (eg. AnonymizedFaker) - Issue [#674](https://github.com/sdv-dev/RDT/issues/674) by @amontanez24
 
 ### New Features
+
 * Update the default transformers that HyperTransformer assigns to each sdtype - Issue [#664](https://github.com/sdv-dev/RDT/issues/664) by @amontanez24
 
 ## 1.6.0 - 2023-07-12
 
@@ -5,7 +5,7 @@
 
 __author__ = 'DataCebo, Inc.'
 __email__ = '[email protected]'
-__version__ = '1.6.1'
+__version__ = '1.7.0.dev3'
 
 
 import sys
 
@@ -12,12 +12,13 @@
 from rdt.transformers.base import BaseTransformer
 from rdt.transformers.boolean import BinaryEncoder
 from rdt.transformers.categorical import (
-    CustomLabelEncoder, FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder)
+    CustomLabelEncoder, FrequencyEncoder, LabelEncoder, OneHotEncoder, OrderedLabelEncoder,
+    OrderedUniformEncoder, UniformEncoder)
 from rdt.transformers.datetime import OptimizedTimestampEncoder, UnixTimestampEncoder
 from rdt.transformers.null import NullTransformer
 from rdt.transformers.numerical import ClusterBasedNormalizer, FloatFormatter, GaussianNormalizer
 from rdt.transformers.pii.anonymizer import AnonymizedFaker, PseudoAnonymizedFaker
-from rdt.transformers.text import RegexGenerator
+from rdt.transformers.text import IDGenerator, RegexGenerator
 
 __all__ = [
     'BaseTransformer',
@@ -36,11 +37,14 @@
     'RegexGenerator',
     'AnonymizedFaker',
     'PseudoAnonymizedFaker',
+    'IDGenerator',
     'get_transformer_name',
     'get_transformer_class',
     'get_transformers_by_type',
     'get_default_transformers',
     'get_default_transformer',
+    'UniformEncoder',
+    'OrderedUniformEncoder',
 ]
 
 
@@ -88,8 +92,8 @@ def get_transformer_name(transformer):
 
 DEFAULT_TRANSFORMERS = {
     'numerical': FloatFormatter(),
-    'categorical': LabelEncoder(add_noise=True),
-    'boolean': LabelEncoder(add_noise=True),
+    'categorical': UniformEncoder(),
+    'boolean': UniformEncoder(),
     'datetime': UnixTimestampEncoder(),
     'text': RegexGenerator(),
     'pii': AnonymizedFaker(),
@@ -141,8 +145,8 @@ def get_transformers_by_type():
     sdtype_transformers = defaultdict(list)
     transformer_classes = BaseTransformer.get_subclasses()
     for transformer in transformer_classes:
-        input_sdtype = transformer.get_input_sdtype()
-        sdtype_transformers[input_sdtype].append(transformer)
+        for sdtype in transformer.get_supported_sdtypes():
+            sdtype_transformers[sdtype].append(transformer)
 
     return sdtype_transformers
 
 
@@ -184,7 +184,11 @@ def get_input_sdtype(cls):
             string:
                 Accepted input sdtype of the transformer.
         """
-        return cls.INPUT_SDTYPE
+        warnings.warn(
+            '`get_input_sdtype` is deprecated. Please use `get_supported_sdtypes` instead.',
+            FutureWarning
+        )
+        return cls.get_supported_sdtypes()[0]
 
     @classmethod
     def get_supported_sdtypes(cls):
 
@@ -1,5 +1,6 @@
 """Transformers for categorical data."""
 
+import logging
 import warnings
 
 import numpy as np
@@ -9,6 +10,273 @@
 
 from rdt.errors import TransformerInputError
 from rdt.transformers.base import BaseTransformer
+from rdt.transformers.utils import fill_nan_with_none
+
+LOGGER = logging.getLogger(__name__)
+
+
+class UniformEncoder(BaseTransformer):
+    """Transformer for categorical data.
+
+    This transformer assigns a sub-interval of ``[0, 1]`` to each one of the categories
+    found in the fit data, and then replaces the instances of these categories with
+    a value sampled uniformly from the corresponding sub-interval.
+
+    The sub-intervals are decided by computing the frequency of each label and
+    then dividing the ``[0, 1]`` interval according to these frequencies.
+
+    When the transformation is reverted, each value is assigned the category that
+    corresponds to the interval it falls in.
+
+    Null values are considered just another category.
+
+    Args:
+        order_by (str or None):
+            String defining how to order the data before applying the labels. Options are
+            'alphabetical', 'numerical_value' and ``None``. Defaults to ``None``.
+    """
+
+    INPUT_SDTYPE = 'categorical'
+    SUPPORTED_SDTYPES = ['categorical', 'boolean']
+    frequencies = None
+    intervals = None
+    dtype = None
+
+    def __init__(self, order_by=None):
+        super().__init__()
+        if order_by not in [None, 'alphabetical', 'numerical_value']:
+            raise TransformerInputError(
+                "order_by must be one of the following values: None, 'numerical_value' or "
+                "'alphabetical'"
+            )
+
+        self.order_by = order_by
+
+    def _order_categories(self, unique_data):
+        nans = pd.isna(unique_data)
+        if self.order_by == 'alphabetical':
+            # pylint: disable=invalid-unary-operand-type
+            if any(map(lambda item: not isinstance(item, str), unique_data[~nans])):
+                raise TransformerInputError(
+                    "The data must be of type string if order_by is 'alphabetical'."
+                )
+        elif self.order_by == 'numerical_value':
+            if not np.issubdtype(unique_data.dtype.type, np.number):
+                raise TransformerInputError(
+                    "The data must be numerical if order_by is 'numerical_value'."
+                )
+
+        if self.order_by is not None:
+            unique_data = np.sort(unique_data[~nans])  # pylint: disable=invalid-unary-operand-type
+            if nans.any():
+                unique_data = np.append(unique_data, [None])
+
+        return unique_data
+
+    @classmethod
+    def _get_message_unseen_categories(cls, unseen_categories):
+        """Message to raise when there are unseen categories.
+
+        Args:
+            unseen_categories (list): list of unseen categories
+
+        Returns:
+            message to print
+        """
+        categories_to_print = ', '.join(str(x) for x in unseen_categories[:3])
+        if len(unseen_categories) > 3:
+            categories_to_print = f'{categories_to_print}, +{len(unseen_categories) - 3} more'
+
+        return categories_to_print
+
+    @staticmethod
+    def _compute_frequencies_intervals(categories, freq):
+        """Compute the frequencies and intervals of the categories.
+
+        Args:
+            categories (list):
+                List of categories.
+            freq (list):
+                List of frequencies.
+
+        Returns:
+            tuple[dict, dict]:
+                First dict maps categories to their frequency and the
+                second dict maps the categories to their intervals.
+        """
+        frequencies = dict(zip(categories, freq))
+        shift = np.cumsum(np.hstack([0, freq]))
+        shift[-1] = 1
+        list_int = [[shift[i], shift[i + 1]] for i in range(len(shift) - 1)]
+        intervals = dict(zip(categories, list_int))
+
+        return frequencies, intervals
+
+    def _fit(self, data):
+        """Fit the transformer to the data.
+
+        Compute the frequencies of each category and use them
+        to map the column to a numerical one.
+
+        Args:
+            data (pandas.Series):
+                Data to fit the transformer to.
+        """
+        self.dtype = data.dtypes
+        data = fill_nan_with_none(data)
+        labels = pd.unique(data)
+        labels = self._order_categories(labels)
+        freq = data.value_counts(normalize=True, dropna=False)
+        nan_value = freq[np.nan] if np.nan in freq.index else None
+        freq = freq.reindex(labels, fill_value=nan_value).array
+
+        self.frequencies, self.intervals = self._compute_frequencies_intervals(labels, freq)
+
+    def _transform(self, data):
+        """Map the category to a continuous value.
+
+        This value is sampled from a uniform distribution
+        with boundaries defined by the frequencies.
+
+        Args:
+            data (pandas.Series):
+                Data to transform.
+
+        Returns:
+            pandas.Series
+        """
+        data_with_none = fill_nan_with_none(data)
+        unseen_indexes = ~(data_with_none.isin(self.frequencies))
+        if unseen_indexes.any():
+            # Keep the 3 first unseen categories
+            unseen_categories = list(data.loc[unseen_indexes].unique())
+            categories_to_print = self._get_message_unseen_categories(unseen_categories)
+            warnings.warn(
+                f"The data in column '{self.get_input_column()}' contains new categories "
+                f"that did not appear during 'fit' ({categories_to_print}). Assigning "
+                'them random values. If you want to model new categories, '
+                "please fit the data again using 'fit'.",
+                category=UserWarning
+            )
+
+            choices = list(self.frequencies.keys())
+            size = unseen_indexes.size
+            data_with_none[unseen_indexes] = np.random.choice(choices, size=size)
+
+        def map_labels(label):
+            return np.random.uniform(self.intervals[label][0], self.intervals[label][1])
+
+        return data_with_none.map(map_labels).astype(float)
+
+    def _reverse_transform(self, data):
+        """Convert float values back to the original categorical values.
+
+        Args:
+            data (pandas.Series):
+                Data to revert.
+
+        Returns:
+            pandas.Series
+        """
+        data = data.clip(0, 1)
+        bins = [0]
+        labels = []
+        nan_name = 'NaN'
+        while nan_name in self.intervals.keys():
+            nan_name += '_'
+
+        for key, interval in self.intervals.items():
+            bins.append(interval[1])
+            if pd.isna(key):
+                labels.append(nan_name)
+            else:
+                labels.append(key)
+
+        result = pd.cut(data, bins=bins, labels=labels)
+        return result.replace(nan_name, np.nan).astype(self.dtype)
+
+
+class OrderedUniformEncoder(UniformEncoder):
+    """Ordered uniform encoder for categorical data.
+
+    This class works very similarly to the ``UniformEncoder``, except that it requires the ordering
+    for the labels to be provided.
+    Null values are considered just another category.
+
+    Args:
+        order (list):
+            A list of all the unique categories for the data. The order of the list determines the
+            interval of ``[0, 1]`` that each category is assigned.
+    """
+
+    def __init__(self, order):
+        self.order = fill_nan_with_none(pd.Series(order))
+        super().__init__()
+
+    def __repr__(self):
+        """Represent initialization of transformer as text.
+
+        Returns:
+            str:
+                The name of the transformer followed by any non-default parameters.
+        """
+        class_name = self.__class__.get_name()
+        custom_args = ['order=<CUSTOM>']
+        args_string = ', '.join(custom_args)
+        return f'{class_name}({args_string})'
+
+    def _check_unknown_categories(self, data):
+        missing = list(data[~data.isin(self.order)].unique())
+        if len(missing) > 0:
+            raise TransformerInputError(
+                f"Unknown categories '{missing}'. All possible categories must be defined in the "
+                "'order' parameter."
+            )
+
+    def _fit(self, data):
+        """Fit the transformer to the data.
+
+        Create all the class attributes while respecting the specified
+        order of the labels.
+
+        Args:
+            data (pandas.Series):
+                Data to fit the transformer to.
+        """
+        self.dtype = data.dtypes
+        data = fill_nan_with_none(data)
+        self._check_unknown_categories(data)
+
+        category_not_seen = (set(self.order.dropna()) != set(data.dropna()))
+        nans_not_seen = (pd.isna(self.order).any() and not pd.isna(data).any())
+        if category_not_seen or nans_not_seen:
+            unseen_categories = [x for x in self.order if x not in data.array]
+            categories_to_print = self._get_message_unseen_categories(unseen_categories)
+            LOGGER.info(
+                "For column '%s', some of the provided category values were not present in the"
+                ' data during fit: (%s).',
+                self.get_input_column(),
+                categories_to_print
+            )
+
+            freq = data.value_counts(normalize=True, dropna=False)
+            freq = 0.9 * freq
+            for category in unseen_categories:
+                freq[category] = 0.1 / len(unseen_categories)
+
+        else:
+            freq = data.value_counts(normalize=True, dropna=False)
+
+        nan_value = freq[np.nan] if np.nan in freq.index else None
+        freq = freq.reindex(self.order, fill_value=nan_value).array
+
+        self.frequencies, self.intervals = self._compute_frequencies_intervals(self.order, freq)
+
+    def _transform(self, data):
+        """Map the category to a continuous value."""
+        data = fill_nan_with_none(data)
+        self._check_unknown_categories(data)
+        return super()._transform(data)
 
 
 class FrequencyEncoder(BaseTransformer):