diff --git a/CHANGES.rst b/CHANGES.rst index 7918707fd..f7e00c83f 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -11,6 +11,9 @@ Ongoing Development New features ------------ +- :class:`ToFloat` now has the parameters ``decimal`` and ``thousand`` to let the user specify which + decimal and thousands separators a column uses, and it also handles negative numbers indicated with parentheses. + :pr:`1772` by :user:`Gabriela Gómez Jiménez `. Changes ------- diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst new file mode 100644 index 000000000..a1aae9eda --- /dev/null +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -0,0 +1,133 @@ +.. |ToFloat| replace:: :class:`~skrub.ToFloat` +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |Cleaner| replace:: :class:`~skrub.Cleaner` + +.. _user_guide_feature_engineering_numeric_to_float: + +Converting heterogeneous numeric values to uniform float32 +========================================================== + +Many tabular datasets contain numeric information stored as strings, mixed +representations, locale-specific formats, or other non-standard encodings. +Common issues include: + +- Thousands separators (``1,234.56`` or ``1 234,56``) +- Use of apostrophes as separators (``4'567.89``) +- Negative numbers encoded inside parentheses (``(1,234.56)``) +- String columns that contain mostly numeric values, but with occasional invalid entries + +To provide consistent numeric behavior, skrub includes the |ToFloat| transformer, +which standardizes all numeric-like columns to ``float32`` and handles a wide +range of real-world formatting issues automatically. + +The |ToFloat| transformer is used internally by both the |Cleaner| and the +|TableVectorizer| to guarantee that downstream estimators receive clean and +uniform numeric data.
+ +What |ToFloat| does +------------------- + +The |ToFloat| transformer provides: + +- **Automatic conversion to 32-bit floating-point values (``float32``).** + This dtype is lightweight and fully supported by scikit-learn estimators. + +- **Automatic parsing of decimal and thousands separators**, regardless of locale: + - The decimal separator must be specified explicitly and can be either ``.`` or ``,`` + - The thousands separator can be one of ``.``, ``,``, space (``" "``), apostrophe (``'``), + or None (no thousands separator) + - The transformer supports integers, decimals (including leading-decimal forms such as .56 or ,56), scientific notation + and negative numbers + - Numbers in parentheses are interpreted as negative numbers (``(1,234.56)`` → ``-1234.56``). This format is more common in financial datasets. + - Decimal and thousands separators must be different characters + +- **Scientific notation parsing** (e.g. ``1.23e+4``) + +- **Graceful handling of invalid or non-numeric values during transform**: + - During ``fit``: non-convertible values raise a ``RejectColumn`` exception + - During ``transform``: invalid entries become ``NaN`` instead of failing + +- **Rejection of categorical and datetime columns**, which should not be cast to numeric. + +As with all skrub transformers, |ToFloat| behaves like a standard +scikit-learn transformer and is fully compatible with pipelines. + +How to use |ToFloat| +-------------------- +The |ToFloat| transformer must be applied to individual columns, and it behaves +like a standard scikit-learn transformer. +|ToFloat| takes a ``decimal`` and a ``thousand`` separator parameter, which default to ``'.'`` and +``None`` (no thousands separator). +Each column is expected to use a single separator for decimals, and one for thousands: +if any characters other than the provided separators are encountered in the column, it will not +be converted.
+ +During ``fit``, |ToFloat| checks that every value matches the number format +defined by ``decimal`` and ``thousand``, removes the thousands separator, and +converts the values to ``float32``. If any value cannot be converted, the column +is rejected with a ``RejectColumn`` exception. + +During ``transform``, invalid or non-convertible values are replaced by ``NaN`` +instead of raising an error. + +Examples +-------- + +Parsing numeric-formatted strings: + +>>> import pandas as pd +>>> from skrub import ToFloat +>>> s = pd.Series(['1.1', None, '3.3'], name='x') +>>> ToFloat().fit_transform(s) +0 1.1 +1 NaN +2 3.3 +Name: x, dtype: float32 + +Locale-dependent decimal separators can be handled by specifying the +``decimal`` and ``thousand`` parameters. Here we use a comma as decimal separator, and +a space as thousands separator: + +>>> s = pd.Series(["4 567,89", "12 567,89"], name="x") +>>> ToFloat(decimal=",", thousand=" ").fit_transform(s) +0 4567.8... +1 12567.8... +Name: x, dtype: float32 + +Parentheses interpreted as negative numbers: + +>>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg") +>>> ToFloat(thousand=",").fit_transform(s) +0 -1234.5... +1 -1234.5... +Name: neg, dtype: float32 + +Scientific notation: + +>>> s = pd.Series(["1.23e+4", "1.23E+4"]) +>>> ToFloat(decimal=".").fit_transform(s) +0 12300.0 +1 12300.0 +dtype: float32 + +Columns that cannot be converted are rejected during ``fit``: + +>>> s = pd.Series(['1.1', 'hello'], name='x') +>>> ToFloat(decimal=".").fit_transform(s) +Traceback (most recent call last): + ... +skrub._single_column_transformer.RejectColumn: Could not convert column 'x' to numbers.
+ + +During ``transform``, invalid entries become ``NaN`` instead of raising an error: +>>> s = pd.Series(['1.1', '2.2'], name='x') +>>> to_float = ToFloat(decimal=".") +>>> to_float.fit_transform(s) +0 1.1 +1 2.2 +Name: x, dtype: float32 + +>>> to_float.transform(pd.Series(['3.3', 'invalid'], name='x')) +0 3.3 +1 NaN +Name: x, dtype: float32 diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 3e7ade92d..f3613a670 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -693,7 +693,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator): We can inspect all the processing steps that were applied to a given column: >>> vectorizer.all_processing_steps_['B'] - [CleanNullStrings(), DropUninformative(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat(), 'B_month': ToFloat(), ...}] + [CleanNullStrings(), DropUninformative(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat(thousand=''), 'B_month': ToFloat(thousand=''), ...}] Note that as the encoder (``DatetimeEncoder()`` above) produces multiple columns, the last processing step is not described by a single transformer @@ -768,7 +768,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator): ``ToDatetime()``: >>> vectorizer.all_processing_steps_ - {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), DropUninformative(), ToFloat(), PassThrough(), {'C': ToFloat()}]} + {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), DropUninformative(), ToFloat(thousand=''), PassThrough(), {'C': ToFloat(thousand='')}]} Specifying several ``specific_transformers`` for the same column is not allowed. diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 6a167ebea..b34463274 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -1,9 +1,99 @@ +import re + from . 
def _build_number_regex(decimal, thousand):
    """Build the source of an anchored regex that matches one formatted number.

    Accepted forms, given the two separators:

    - an integer part made of plain digits (``1234``) or of 1-3 digits
      followed by separator-delimited groups of exactly three digits
      (``1,234,567``), optionally followed by a fractional part;
    - a fractional part alone (``.56``);
    - either of the above followed by an exponent (``1.23e+4``);
    - an optional leading ``+`` or ``-`` sign, **or** a pair of enclosing
      parentheses (accounting notation for negatives), but not both;
    - the empty string, because callers replace missing values by ``""``
      before matching and those entries must not cause a rejection.

    Parentheses must be balanced, and a sign is not allowed inside them:
    the normalization step rewrites ``(x)`` to ``-x``, so ``(-1.5)`` would
    become ``--1.5``, which is not a number.

    Parameters
    ----------
    decimal : str
        Decimal separator. It is escaped, so it is always matched literally.
    thousand : str or None
        Thousands separator, matched literally. ``""`` or ``None`` means the
        integer part is not grouped.

    Returns
    -------
    str
        The pattern source. Separators are passed through ``re.escape`` (which
        also escapes a space), so the pattern can be compiled with
        ``re.VERBOSE``. It relies only on grouping and alternation (no
        lookaround, no backreferences).
    """
    d = re.escape(decimal)
    # An empty or missing thousands separator degenerates to plain digit runs.
    t = re.escape(thousand) if thousand else ""

    # Integer part: plain digits, or 1-3 digits followed by one or more
    # separator-delimited groups of exactly three digits ("12,345,678").
    integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)"
    # Fractional part: the decimal separator followed by at least one digit.
    fraction = rf"{d}\d+"
    # Optional exponent: "e10", "E-5", "e+3".
    exponent = r"(?:[eE][+-]?\d+)?"
    # Unsigned magnitude: integer with optional fraction, or a bare fraction
    # (".5"), followed by the optional exponent.
    magnitude = rf"(?:{integer}(?:{fraction})?|{fraction}){exponent}"

    # Either "(magnitude)" or "[sign]magnitude". The outer group is optional
    # so that the empty string (a filled-in null) still matches; a lone sign
    # or lone parenthesis does not.
    return rf"^(?:\({magnitude}\)|[+-]?{magnitude})?$"
@dispatch
def _str_replace(col, decimal, thousand):
    """Normalize number-formatted strings so they can be cast to float.

    Generic entry point; the work is done by the pandas and polars
    specializations registered below. For any other input type the call
    ends up here and an "unregistered type" error is raised.

    The signature mirrors the specializations so that a call written for them
    (``_str_replace(col, decimal=..., thousand=...)``) also binds here and
    reaches the dispatch error instead of failing on an unexpected keyword.

    Parameters
    ----------
    col : pandas or polars string column
        Values to normalize.
    decimal : str
        Decimal separator used in ``col``; it is replaced by ``"."``.
    thousand : str or None
        Thousands separator used in ``col``; it is removed. ``""`` or ``None``
        means there is nothing to remove.

    Returns
    -------
    column
        A column of the same backend with parentheses turned into a leading
        minus sign, thousands separators removed and ``"."`` as the decimal
        separator.
    """
    raise_dispatch_unregistered_type(col, kind="Series")


@_str_replace.specialize("pandas", argument_type="Column")
def _str_replace_pandas(col, decimal, thousand):
    """Pandas implementation of ``_str_replace``."""
    # Accounting notation: "(123.45)" -> "-123.45".
    col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True)
    # The thousands separator must go first: it may be "." (e.g. "1.234,56"),
    # which would otherwise clash with the decimal point written below.
    if thousand:
        col = col.str.replace(thousand, "", regex=False)
    # Literal replacement: the separator is data, not a pattern.
    return col.str.replace(decimal, ".", regex=False)


@_str_replace.specialize("polars", argument_type="Column")
def _str_replace_polars(col, decimal, thousand):
    """Polars implementation of ``_str_replace``."""
    # Accounting notation: "(123.45)" -> "-123.45".
    col = col.str.replace_all(r"^\((.*)\)$", r"-$1")
    # Same ordering constraint as in the pandas implementation.
    if thousand:
        col = col.str.replace_all(thousand, "", literal=True)
    # Literal replacement, as in the pandas branch: no regex character class
    # is built from the separator, so any separator string is handled the
    # same way by both backends.
    return col.str.replace_all(decimal, ".", literal=True)
+ Parameters + ---------- + decimal : str, default='.' + Character to recognize as the decimal separator when converting from + strings to floats. It is replaced by ``'.'`` in the strings before + conversion. + thousand : str or None, default=None + Character used as thousands separator. Supported values are ``"."``, + ``","``, space (``" "``), apostrophe (``"'"``), or ``None`` (no thousands + separator). The decimal and thousands separators must differ. + Examples -------- >>> import pandas as pd @@ -165,8 +266,36 @@ class ToFloat(SingleColumnTransformer): >>> s = pd.Series([1.1, None], dtype='float32') >>> to_float.fit_transform(s) is s True + + Negative numbers represented using parentheses are converted + so they use "-" instead. + >>> s = pd.Series(["-1,234.56", "1,234.56", "(1,234.56)"], name='parens') + >>> ToFloat(decimal=".", thousand=",").fit_transform(s) + 0 -1234.5... + 1 1234.5... + 2 -1234.5... + Name: parens, dtype: float32 + + Numbers that use scientific notation are converted: + >>> s = pd.Series(["1.23e+4", "1.23E+4"], name="x") + >>> ToFloat(decimal=".").fit_transform(s) + 0 12300.0 + 1 12300.0 + Name: x, dtype: float32 + + It is possible to specify the thousands separator, e.g., to use " " + >>> s = pd.Series(["4 567,89", "12 567,89"], name="x") + >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) # doctest: +ELLIPSIS + 0 4567.8... + 1 12567.8... + Name: x, dtype: float32 """ # noqa: E501 + def __init__(self, decimal=".", thousand=None): + super().__init__() + self.decimal = decimal + self.thousand = "" if thousand is None else thousand + def fit_transform(self, column, y=None): """Fit the encoder and transform a column.
@@ -185,12 +314,26 @@ def fit_transform(self, column, y=None): """ del y self.all_outputs_ = [sbd.name(column)] + if self.decimal is None: + raise ValueError("The decimal separator cannot be None.") + if self.thousand == self.decimal: + raise ValueError("The thousand and decimal separators must differ.") + if sbd.is_any_date(column) or sbd.is_categorical(column): raise RejectColumn( f"Refusing to cast column {sbd.name(column)!r} " f"with dtype '{sbd.dtype(column)}' to numbers." ) try: + if sbd.is_string(column): + self._number_re_ = re.compile( + _build_number_regex(self.decimal, self.thousand), + re.VERBOSE, + ) + _str_is_valid_number(column, self._number_re_) + column = _str_replace( + column, decimal=self.decimal, thousand=self.thousand + ) numeric = sbd.to_float32(column, strict=True) return numeric except Exception as e: diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py index 5f777f061..a642ac5e7 100644 --- a/skrub/tests/test_to_float.py +++ b/skrub/tests/test_to_float.py @@ -43,3 +43,82 @@ def test_rejected_columns(df_module): ToFloat().fit_transform(col) to_float = ToFloat().fit(df_module.make_column("c", [1.1])) assert is_float32(df_module, to_float.transform(col)) + + +@pytest.mark.parametrize( + "input_str, expected_float, decimal, thousand", + [ + # valid numbers + ("1,234.56", 1234.56, ".", ","), + ("1.234,56", 1234.56, ",", "."), + ("1 234,56", 1234.56, ",", " "), + ("1234.56", 1234.56, ".", None), + ("1234,56", 1234.56, ",", None), + ("1,234,567.89", 1234567.89, ".", ","), + ("1.234.567,89", 1234567.89, ",", "."), + ("1 234 567,89", 1234567.89, ",", " "), + ("1'234'567.89", 1234567.89, ".", "'"), + ("1.23e+4", 12300.0, ".", None), + ("1.23E+4", 12300.0, ".", None), + ("-1,234.56", -1234.56, ".", ","), + ("(1,234.56)", -1234.56, ".", ","), + (".56", 0.56, ".", None), + (",56", 0.56, ",", None), + ("56", 56.0, ".", None), + ], +) +def test_number_parsing_valid(input_str, expected_float, decimal, thousand, df_module): + column = 
df_module.make_column("col", [input_str]) + result = ToFloat(decimal=decimal, thousand=thousand).fit_transform(column) + assert np.allclose(result[0], expected_float) + + +@pytest.mark.parametrize( + "input_str, decimal, thousand", + [ + # invalid grouping + ("1,23,456.78", ".", ","), + ("1.2.3.4", ".", None), + ("1.2.3.4,0", ",", "."), + ("12,3456.78", ".", ","), + ("1 234,567.34", ".", ","), + ("1'234,567.34", ".", ","), + ("1'234'234,567.34", ",", "'"), + ("123.45.67", ".", None), + ("1,,234", ".", ","), + ("1.23,45", ".", ","), + ], +) +def test_number_parsing_invalid(input_str, decimal, thousand, df_module): + column = df_module.make_column("col", [input_str]) + with pytest.raises((RejectColumn, ValueError)): + ToFloat(decimal=decimal, thousand=thousand).fit_transform(column) + + +@pytest.mark.parametrize( + "decimal, thousand", + [ + # invalid because decimal and thousand are the same + (",", ","), + (".", "."), + # invalid because decimal is None + (None, ","), + (None, None), + ], +) +def test_invalid_parameters(decimal, thousand, df_module): + """ + Test that ToFloat raises an exception if the parameters are invalid: + - decimal is None → ValueError + - thousand == decimal → ValueError + """ + column = df_module.make_column("col", ["123", "456"]) + + if decimal is None: + with pytest.raises(ValueError, match="decimal separator cannot be None"): + ToFloat(decimal=decimal, thousand=thousand).fit_transform(column) + else: + with pytest.raises( + ValueError, match="thousand and decimal separators must differ" + ): + ToFloat(decimal=decimal, thousand=thousand).fit_transform(column)