From e5fe91f94e850391ac3d41c8d3a27f924d6f5873 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 24 Nov 2025 17:17:47 +0100 Subject: [PATCH 01/23] WIP: Adding decimal conversion and tests --- skrub/_to_float.py | 28 ++++++++++++++++++++++++++++ skrub/tests/test_to_float.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 6a167ebea..9c0a84219 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -1,8 +1,28 @@ from . import _dataframe as sbd +from ._dispatch import dispatch, raise_dispatch_unregistered_type from ._single_column_transformer import RejectColumn, SingleColumnTransformer __all__ = ["ToFloat"] +POSSIBLE_SEPARATORS = [".", ",", "'", " "] + + +@dispatch +def _str_replace(col, pattern, strict=True): + raise_dispatch_unregistered_type(col, kind="Series") + + +@_str_replace.specialize("pandas", argument_type="Column") +def _str_replace_pandas(col, pattern): + col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True) + return col.str.replace("".join(pattern), "", regex=True) + + +@_str_replace.specialize("polars", argument_type="Column") +def _str_replace_polars(col, pattern): + col = col.str.replace(r"^\((.*)\)$", r"-\1") + return col.str.replace_all("".join(pattern), "") + class ToFloat(SingleColumnTransformer): """ @@ -167,6 +187,10 @@ class ToFloat(SingleColumnTransformer): True """ # noqa: E501 + def __init__(self, decimal="."): + super().__init__() + self.decimal = decimal + def fit_transform(self, column, y=None): """Fit the encoder and transform a column. @@ -191,6 +215,10 @@ def fit_transform(self, column, y=None): f"with dtype '{sbd.dtype(column)}' to numbers." 
) try: + if sbd.is_string(column): + p = POSSIBLE_SEPARATORS.copy() + p.remove(self.decimal) + column = _str_replace(column, pattern=p) numeric = sbd.to_float32(column, strict=True) return numeric except Exception as e: diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py index 5f777f061..201891567 100644 --- a/skrub/tests/test_to_float.py +++ b/skrub/tests/test_to_float.py @@ -43,3 +43,36 @@ def test_rejected_columns(df_module): ToFloat().fit_transform(col) to_float = ToFloat().fit(df_module.make_column("c", [1.1])) assert is_float32(df_module, to_float.transform(col)) + + +@pytest.mark.parametrize( + "input_str, expected_float, decimal", + [ + ("1,234.56", 1234.56, "."), + ("1.234,56", 1234.56, ","), + ("1 234,56", 1234.56, ","), + ("1234.56", 1234.56, "."), + ("1234,56", 1234.56, ","), + ("1,234,567.89", 1234567.89, "."), + ("1.234.567,89", 1234567.89, ","), + ("1 234 567,89", 1234567.89, ","), + ("1'234'567.89", 1234567.89, "."), + ("1.23e+4", 12300.0, "."), + ("1.23E+4", 12300.0, "."), + ("1,23e+4", 12300.0, ","), + ("1,23E+4", 12300.0, ","), + ("-1,234.56", -1234.56, "."), + ("-1.234,56", -1234.56, ","), + ("(1,234.56)", -1234.56, "."), + ("(1.234,56)", -1234.56, ","), + ("1,23,456.78", 123456.78, "."), + ("12,3456.78", 123456.78, "."), + (".56", 0.56, "."), + (",56", 0.56, ","), + ], +) +def test_number_parsing(input_str, expected_float, decimal, df_module): + column = df_module.make_column("col", [input_str]) + result = ToFloat(decimal=decimal).fit_transform(column) + + assert result == expected_float From c1862010cd6c78b09f1d18cd615418c91c8a5cc2 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 24 Nov 2025 18:34:04 +0100 Subject: [PATCH 02/23] Added tests and examples --- CHANGES.rst | 3 +++ skrub/_to_float.py | 39 ++++++++++++++++++++++++++++++------ skrub/tests/test_to_float.py | 2 +- 3 files changed, 37 insertions(+), 7 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 7918707fd..8ad38c571 100644 --- 
a/CHANGES.rst +++ b/CHANGES.rst @@ -73,6 +73,9 @@ New features - :class:`TableReport` now includes the ``open_tab`` parameter, which lets the user select which tab should be opened when the ``TableReport`` is rendered. :pr:`1737` by :user:`Riccardo Cappuzzo`. +- :class:`ToFloat32` has the parameter decimal to let the user specify whether they use ',' or '.' + as decimal separator and it also handles negative numbers indicated with parentheses. + :pr:`1772` by :user:`Gabriela Gómez Jiménez `. Changes ------- diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 9c0a84219..3964969f3 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -13,15 +13,17 @@ def _str_replace(col, pattern, strict=True): @_str_replace.specialize("pandas", argument_type="Column") -def _str_replace_pandas(col, pattern): +def _str_replace_pandas(col, pattern, decimal): col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True) - return col.str.replace("".join(pattern), "", regex=True) + col = col.str.replace("[" + "".join(pattern) + "]", "", regex=True) + return col.str.replace(decimal, ".", regex=False) @_str_replace.specialize("polars", argument_type="Column") -def _str_replace_polars(col, pattern): - col = col.str.replace(r"^\((.*)\)$", r"-\1") - return col.str.replace_all("".join(pattern), "") +def _str_replace_polars(col, pattern, decimal): + col = col.str.replace_all(r"^\((.*)\)$", r"-$1") + col = col.str.replace_all("[" + "".join(pattern) + "]", "") + return col.str.replace_all(f"[{decimal}]", ".") class ToFloat(SingleColumnTransformer): @@ -185,6 +187,31 @@ class ToFloat(SingleColumnTransformer): >>> s = pd.Series([1.1, None], dtype='float32') >>> to_float.fit_transform(s) is s True + + Handling parentheses around negative numbers + >>> s = pd.Series(["-1,234.56", "1.234,56", "(1,234.56)"], name='parens') + >>> to_float.fit_transform(s) #doctest: +SKIP + 0 -1234.56 + 1 1234.56 + 2 -1234.56 + dtype: float32 + + Scientific notation + >>> s = pd.Series(["1.23e+4", "1.23E+4"], 
name="x") + >>> ToFloat(decimal=".").fit_transform(s) + 0 12300.0 + 1 12300.0 + Name: x, dtype: float32 + + + Space or apostrophe as thousand separator + >>> s = pd.Series(["1 234 567,89", "1'234'567,89"], name="x") + >>> ToFloat(decimal=",").fit_transform(s) + 0 1234567.89 + 1 1234567.89 + Name: x, dtype: float32 + + """ # noqa: E501 def __init__(self, decimal="."): @@ -218,7 +245,7 @@ def fit_transform(self, column, y=None): if sbd.is_string(column): p = POSSIBLE_SEPARATORS.copy() p.remove(self.decimal) - column = _str_replace(column, pattern=p) + column = _str_replace(column, pattern=p, decimal=self.decimal) numeric = sbd.to_float32(column, strict=True) return numeric except Exception as e: diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py index 201891567..326a2d084 100644 --- a/skrub/tests/test_to_float.py +++ b/skrub/tests/test_to_float.py @@ -75,4 +75,4 @@ def test_number_parsing(input_str, expected_float, decimal, df_module): column = df_module.make_column("col", [input_str]) result = ToFloat(decimal=decimal).fit_transform(column) - assert result == expected_float + np.allclose(result[0], expected_float) From 67f00c7783de9cf4ac53e5dec655f5b76cc36d80 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 1 Dec 2025 14:58:51 +0100 Subject: [PATCH 03/23] Added doctest skip --- skrub/_to_float.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 3964969f3..13e07201b 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -205,10 +205,10 @@ class ToFloat(SingleColumnTransformer): Space or apostrophe as thousand separator - >>> s = pd.Series(["1 234 567,89", "1'234'567,89"], name="x") - >>> ToFloat(decimal=",").fit_transform(s) - 0 1234567.89 - 1 1234567.89 + >>> s = pd.Series(["4 567,89", "4'567,89"], name="x") + >>> ToFloat(decimal=",").fit_transform(s) #doctest: +SKIP + 0 4567.89 + 1 4567.89 Name: x, dtype: float32 From 
47cc97d1479e9b77db07f4d110b4e4b8d5250ba2 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 1 Dec 2025 15:34:57 +0100 Subject: [PATCH 04/23] Added documentation --- .../feature_engineering_numerical.rst | 123 ++++++++++++++++++ skrub/_to_float.py | 10 +- 2 files changed, 130 insertions(+), 3 deletions(-) create mode 100644 doc/modules/column_level_featurizing/feature_engineering_numerical.rst diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst new file mode 100644 index 000000000..c6841a4d6 --- /dev/null +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -0,0 +1,123 @@ +.. |ToFloat| replace:: :class:`~skrub.ToFloat` +.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer` +.. |Cleaner| replace:: :class:`~skrub.Cleaner` + +.. _user_guide_feature_engineering_numeric_to_float: + +Converting heterogeneous numeric values to uniform float32 +========================================================== + +Many tabular datasets contain numeric information stored as strings, mixed +representations, locale-specific formats, or other non-standard encodings. +Common issues include: + +- Thousands separators (``1,234.56`` or ``1 234,56``) +- Use of apostrophes as separators (``4'567.89``) +- Negative numbers encoded inside parentheses (``(1,234.56)``) +- String columns that contain mostly numeric values, but with occasional invalid entries + +To provide consistent numeric behavior, skrub includes the |ToFloat| transformer, +which **standardizes all numeric-like columns to ``float32``** and handles a wide +range of real-world formatting issues automatically. + +The |ToFloat| transformer is used internally by both the |Cleaner| class and the +|TableVectorizer| to guarantee that downstream estimators receive clean and +uniform numeric data. 
+ +What |ToFloat| does +------------------- + +The |ToFloat| transformer provides: + +- **Automatic conversion to 32-bit floating-point values (`float32`).** + This dtype is lightweight and fully supported by scikit-learn estimators. + +- **Automatic parsing of decimal separators**, regardless of locale: + - ``.`` or ``,`` can be used as decimal point + - thousands separators (``.``, ``,``, space, apostrophe) are removed automatically + +- **Parentheses interpreted as negative numbers**, a common format in financial datasets: + - ``(1,234.56)`` → ``-1234.56`` + +- **Scientific notation parsing** (e.g. ``1.23e+4``) + +- **Graceful handling of invalid or non-numeric values during transform**: + - During ``fit``: non-convertible values raise a ``RejectColumn`` exception + - During ``transform``: invalid entries become ``NaN`` instead of failing + +- **Rejection of categorical and datetime columns**, which should not be cast to numeric. + +As with all skrub transformers, |ToFloat| behaves like a standard +scikit-learn transformer and is fully compatible with pipelines. 
+ +Examples +-------- + +Parsing numeric-formatted strings: + +>>> import pandas as pd +>>> from skrub import ToFloat +>>> s = pd.Series(['1.1', None, '3.3'], name='x') +>>> ToFloat().fit_transform(s) +0 1.1 +1 NaN +2 3.3 +Name: x, dtype: float32 + +Automatic handling of locale-dependent decimal separators: + +>>> s = pd.Series(["4 567,89", "4'567,89"], name="x") +>>> ToFloat(decimal=",").fit_transform(s) # doctest: +SKIP +0 4567.89 +1 4567.89 +Name: x, dtype: float32 + +Parentheses interpreted as negative numbers: + +>>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg") +>>> ToFloat().fit_transform(s) # doctest: +SKIP +0 -1234.56 +1 -1234.56 +Name: neg, dtype: float32 + +Scientific notation: + +>>> s = pd.Series(["1.23e+4", "1.23E+4"]) +>>> ToFloat(decimal=".").fit_transform(s) +0 12300.0 +1 12300.0 +dtype: float32 + +Numeric, boolean, and extension dtypes are also standardized: + +>>> pd.Series([True, False]) +0 1.0 +1 0.0 +dtype: float32 + +>>> pd.Series([1.1, 2.2], dtype="Float32") +0 1.1 +1 2.2 +dtype: float32 + +Columns that cannot be converted are rejected during ``fit``: + +>>> pd.Series(['1.1', 'hello']) +Traceback (most recent call last): + ... +skrub._apply_to_cols.RejectColumn: Could not convert column '...' to numbers. + +How |ToFloat| is used in skrub +------------------------------ + +The |ToFloat| transformer is used internally in: + +- the **Cleaner** (|Cleaner|), to normalize all numeric-like columns before modeling +- the **|TableVectorizer|**, ensuring a consistent numeric dtype across all numeric features + +This makes |ToFloat| a core building block of skrub’s handling of heterogeneous +tabular data. + +``ToFloat`` ensures that downstream machine-learning models receive numeric data +that is clean, consistent, lightweight, and free of locale-specific quirks or +string-encoded values. 
diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 13e07201b..dca4cb339 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -44,6 +44,13 @@ class ToFloat(SingleColumnTransformer): During ``transform``, entries for which conversion fails are replaced by null values. + Parameters + ---------- + decimal : str, default='.' + Character to recognize as the decimal separator when converting from + strings to floats. Other possible decimal separators are removed from + the strings before conversion. + Examples -------- >>> import pandas as pd @@ -203,15 +210,12 @@ class ToFloat(SingleColumnTransformer): 1 12300.0 Name: x, dtype: float32 - Space or apostrophe as thousand separator >>> s = pd.Series(["4 567,89", "4'567,89"], name="x") >>> ToFloat(decimal=",").fit_transform(s) #doctest: +SKIP 0 4567.89 1 4567.89 Name: x, dtype: float32 - - """ # noqa: E501 def __init__(self, decimal="."): From daa9557c4897e3e66897c62d7f6d86c62eb87c3d Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 1 Dec 2025 15:39:01 +0100 Subject: [PATCH 05/23] Added elipsis on doctests --- skrub/_to_float.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index dca4cb339..dfe85b997 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -196,11 +196,11 @@ class ToFloat(SingleColumnTransformer): True Handling parentheses around negative numbers - >>> s = pd.Series(["-1,234.56", "1.234,56", "(1,234.56)"], name='parens') - >>> to_float.fit_transform(s) #doctest: +SKIP - 0 -1234.56 - 1 1234.56 - 2 -1234.56 + >>> s = pd.Series(["-1,234.56", "1,234.56", "(1,234.56)"], name='parens') + >>> to_float.fit_transform(s) + 0 -1234.5... + 1 1234.5... + 2 -1234.5... 
dtype: float32 Scientific notation @@ -212,9 +212,9 @@ class ToFloat(SingleColumnTransformer): Space or apostrophe as thousand separator >>> s = pd.Series(["4 567,89", "4'567,89"], name="x") - >>> ToFloat(decimal=",").fit_transform(s) #doctest: +SKIP - 0 4567.89 - 1 4567.89 + >>> ToFloat(decimal=",").fit_transform(s) + 0 4567.8... + 1 4567.8... Name: x, dtype: float32 """ # noqa: E501 From 292a5c1746694b7c9babb6d19576b1cb41984f95 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 1 Dec 2025 16:08:15 +0100 Subject: [PATCH 06/23] Fixed example doc --- .../feature_engineering_numerical.rst | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst index c6841a4d6..0ce2b0191 100644 --- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -88,24 +88,13 @@ Scientific notation: 1 12300.0 dtype: float32 -Numeric, boolean, and extension dtypes are also standardized: - ->>> pd.Series([True, False]) -0 1.0 -1 0.0 -dtype: float32 - ->>> pd.Series([1.1, 2.2], dtype="Float32") -0 1.1 -1 2.2 -dtype: float32 - Columns that cannot be converted are rejected during ``fit``: ->>> pd.Series(['1.1', 'hello']) +>>> s = pd.Series(['1.1', 'hello'], name='x') +>>> ToFloat(decimal=".").fit_transform(s) Traceback (most recent call last): ... -skrub._apply_to_cols.RejectColumn: Could not convert column '...' to numbers. +skrub._apply_to_cols.RejectColumn: Could not convert column 'x' to numbers. 
How |ToFloat| is used in skrub ------------------------------ From 6821b325f2c8c2672fe721cffcafea565e1d8964 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 1 Dec 2025 16:37:53 +0100 Subject: [PATCH 07/23] Improved users guide --- .../feature_engineering_numerical.rst | 59 ++++++++++++------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst index 0ce2b0191..535883d93 100644 --- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -17,10 +17,10 @@ Common issues include: - String columns that contain mostly numeric values, but with occasional invalid entries To provide consistent numeric behavior, skrub includes the |ToFloat| transformer, -which **standardizes all numeric-like columns to ``float32``** and handles a wide +which standardizes all numeric-like columns to ``float32`` and handles a wide range of real-world formatting issues automatically. -The |ToFloat| transformer is used internally by both the |Cleaner| class and the +The |ToFloat| transformer is used internally by both the |Cleaner| and the |TableVectorizer| to guarantee that downstream estimators receive clean and uniform numeric data. @@ -50,6 +50,22 @@ The |ToFloat| transformer provides: As with all skrub transformers, |ToFloat| behaves like a standard scikit-learn transformer and is fully compatible with pipelines. +How to use |ToFloat| +-------------------- +The |ToFloat| transformer must be applied to individual columns. It behaves like +a standard scikit-learn transformer. +Each column is expected to use a single decimal separator, which is +specified through the ``decimal`` parameter. If this parameter is not provided, +the default decimal separator is ``'.'``. 
+ +During ``fit``, |ToFloat| attempts to convert all values in the column to +numeric values after automatically removing other possible thousands separators +(``,``, ``.``, space, apostrophe). If any value cannot be converted, the column +is rejected with a ``RejectColumn`` exception. + +During ``transform``, invalid or non-convertible values are replaced by ``NaN`` +instead of raising an error. + Examples -------- @@ -64,20 +80,22 @@ Parsing numeric-formatted strings: 2 3.3 Name: x, dtype: float32 -Automatic handling of locale-dependent decimal separators: +Locale-dependent decimal separators can be handled by specifying the +``decimal`` parameter. Here we use comma as decimal separator, and +remove spaces and apostrophes as thousands separators: >>> s = pd.Series(["4 567,89", "4'567,89"], name="x") ->>> ToFloat(decimal=",").fit_transform(s) # doctest: +SKIP -0 4567.89 -1 4567.89 +>>> ToFloat(decimal=",").fit_transform(s) +0 4567.8... +1 4567.8... Name: x, dtype: float32 Parentheses interpreted as negative numbers: >>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg") ->>> ToFloat().fit_transform(s) # doctest: +SKIP -0 -1234.56 -1 -1234.56 +>>> ToFloat().fit_transform(s) +0 -1234.5... +1 -1234.5... Name: neg, dtype: float32 Scientific notation: @@ -96,17 +114,16 @@ Traceback (most recent call last): ... skrub._apply_to_cols.RejectColumn: Could not convert column 'x' to numbers. -How |ToFloat| is used in skrub ------------------------------- - -The |ToFloat| transformer is used internally in: -- the **Cleaner** (|Cleaner|), to normalize all numeric-like columns before modeling -- the **|TableVectorizer|**, ensuring a consistent numeric dtype across all numeric features - -This makes |ToFloat| a core building block of skrub’s handling of heterogeneous -tabular data. 
+During ``transform``, invalid entries become ``NaN`` instead of raising an error: +>>> s = pd.Series(['1.1', '2.2'], name='x') +>>> to_float = ToFloat(decimal=".") +>>> to_float.fit_transform(s) +0 1.1 +1 2.2 +Name: x, dtype: float32 -``ToFloat`` ensures that downstream machine-learning models receive numeric data -that is clean, consistent, lightweight, and free of locale-specific quirks or -string-encoded values. +>>> to_float.transform(pd.Series(['3.3', 'invalid'], name='x')) +0 3.3 +1 NaN +Name: x, dtype: float32 From 9df1ba24f12ebb4ba4a49bd4e85efa212aa3b9ce Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 1 Dec 2025 17:14:06 +0100 Subject: [PATCH 08/23] Fixed tests --- skrub/_to_float.py | 14 ++++++++------ skrub/tests/test_to_float.py | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index dfe85b997..21dd327a9 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -8,19 +8,23 @@ @dispatch -def _str_replace(col, pattern, strict=True): +def _str_replace(col, strict=True): raise_dispatch_unregistered_type(col, kind="Series") @_str_replace.specialize("pandas", argument_type="Column") -def _str_replace_pandas(col, pattern, decimal): +def _str_replace_pandas(col, decimal): + pattern = POSSIBLE_SEPARATORS.copy() + pattern.remove(decimal) col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True) col = col.str.replace("[" + "".join(pattern) + "]", "", regex=True) return col.str.replace(decimal, ".", regex=False) @_str_replace.specialize("polars", argument_type="Column") -def _str_replace_polars(col, pattern, decimal): +def _str_replace_polars(col, decimal): + pattern = POSSIBLE_SEPARATORS.copy() + pattern.remove(decimal) col = col.str.replace_all(r"^\((.*)\)$", r"-$1") col = col.str.replace_all("[" + "".join(pattern) + "]", "") return col.str.replace_all(f"[{decimal}]", ".") @@ -247,9 +251,7 @@ def fit_transform(self, column, y=None): ) try: if sbd.is_string(column): - p = 
POSSIBLE_SEPARATORS.copy() - p.remove(self.decimal) - column = _str_replace(column, pattern=p, decimal=self.decimal) + column = _str_replace(column, decimal=self.decimal) numeric = sbd.to_float32(column, strict=True) return numeric except Exception as e: diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py index 326a2d084..a7498e0a6 100644 --- a/skrub/tests/test_to_float.py +++ b/skrub/tests/test_to_float.py @@ -5,7 +5,7 @@ from skrub._single_column_transformer import RejectColumn from skrub._to_categorical import ToCategorical from skrub._to_datetime import ToDatetime -from skrub._to_float import ToFloat +from skrub._to_float import ToFloat, _str_replace from skrub.conftest import skip_polars_installed_without_pyarrow @@ -76,3 +76,16 @@ def test_number_parsing(input_str, expected_float, decimal, df_module): result = ToFloat(decimal=decimal).fit_transform(column) np.allclose(result[0], expected_float) + + +def test_str_replace(df_module): + s = df_module.make_column("x", ["1,234.56", "7.890,12", "3 456,78", "9'012.34"]) + result_dot = _str_replace(s, decimal=".") + + expected_dot = df_module.make_column( + "x", ["1234.56", "7890.12", "3456.78", "9012.34"] + ) + np.all(sbd.to_list(result_dot) == sbd.to_list(expected_dot)) + + with pytest.raises(TypeError): + _str_replace([1, 2, 3], decimal=".") From 620bd12470f02d9b1ad76ea47eaec8330695ca68 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 15 Dec 2025 16:36:34 +0100 Subject: [PATCH 09/23] WIP: Improved column verification --- skrub/_to_float.py | 68 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 10 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 21dd327a9..5f54696d3 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -1,3 +1,5 @@ +import re + from . 
import _dataframe as sbd from ._dispatch import dispatch, raise_dispatch_unregistered_type from ._single_column_transformer import RejectColumn, SingleColumnTransformer @@ -7,26 +9,61 @@ POSSIBLE_SEPARATORS = [".", ",", "'", " "] +def _build_number_regex(decimal, thousand): + d = re.escape(decimal) + t = re.escape(thousand) + + integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)" + decimal_part = rf"(?:{d}\d+)?" + scientific = r"(?:[eE][+-]?\d+)?" + + return rf""" + ^ + \(? + [+-]? + {integer} + {decimal_part} + {scientific} + \)? + $ + """ + + +@dispatch +def _str_is_valid_number(col, number_re): + raise_dispatch_unregistered_type(col, kind="Series") + + +@_str_is_valid_number.specialize("pandas", argument_type="Column") +def _str_is_valid_number_pandas(col, number_re): + if not col.str.match(number_re, na=False).all(): + raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.") + return True + + +@_str_is_valid_number.specialize("polars", argument_type="Column") +def _str_is_valid_number_polars(col, number_re): + if not col.str.contains(number_re.pattern).all(): + raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.") + return True + + @dispatch def _str_replace(col, strict=True): raise_dispatch_unregistered_type(col, kind="Series") @_str_replace.specialize("pandas", argument_type="Column") -def _str_replace_pandas(col, decimal): - pattern = POSSIBLE_SEPARATORS.copy() - pattern.remove(decimal) +def _str_replace_pandas(col, decimal, thousand): col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True) - col = col.str.replace("[" + "".join(pattern) + "]", "", regex=True) + col = col.str.replace(thousand, "", regex=False) return col.str.replace(decimal, ".", regex=False) @_str_replace.specialize("polars", argument_type="Column") -def _str_replace_polars(col, decimal): - pattern = POSSIBLE_SEPARATORS.copy() - pattern.remove(decimal) +def _str_replace_polars(col, decimal, thousand): col = col.str.replace_all(r"^\((.*)\)$", 
r"-$1") - col = col.str.replace_all("[" + "".join(pattern) + "]", "") + col = col.str.replace_all(thousand, "") return col.str.replace_all(f"[{decimal}]", ".") @@ -222,9 +259,10 @@ class ToFloat(SingleColumnTransformer): Name: x, dtype: float32 """ # noqa: E501 - def __init__(self, decimal="."): + def __init__(self, decimal=".", thousand=","): super().__init__() self.decimal = decimal + self.thousand = thousand def fit_transform(self, column, y=None): """Fit the encoder and transform a column. @@ -244,6 +282,9 @@ def fit_transform(self, column, y=None): """ del y self.all_outputs_ = [sbd.name(column)] + if self.thousand == self.decimal: + raise ValueError("The thousand and decimal separators must differ.") + if sbd.is_any_date(column) or sbd.is_categorical(column): raise RejectColumn( f"Refusing to cast column {sbd.name(column)!r} " @@ -251,7 +292,14 @@ def fit_transform(self, column, y=None): ) try: if sbd.is_string(column): - column = _str_replace(column, decimal=self.decimal) + self._number_re_ = re.compile( + _build_number_regex(self.decimal, self.thousand), + re.VERBOSE, + ) + _str_is_valid_number(column, self._number_re_) + column = _str_replace( + column, decimal=self.decimal, thousand=self.thousand + ) numeric = sbd.to_float32(column, strict=True) return numeric except Exception as e: From 0be30f3a064b3fe863d8e84af8c104138276a863 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 15 Dec 2025 16:49:56 +0100 Subject: [PATCH 10/23] WIP: Removed pattern and include thousand separator --- skrub/_to_float.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 5f54696d3..dd66a40d3 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -6,8 +6,6 @@ __all__ = ["ToFloat"] -POSSIBLE_SEPARATORS = [".", ",", "'", " "] - def _build_number_regex(decimal, thousand): d = re.escape(decimal) @@ -259,10 +257,10 @@ class ToFloat(SingleColumnTransformer): Name: x, dtype: float32 """ # noqa: E501 
- def __init__(self, decimal=".", thousand=","): + def __init__(self, decimal=".", thousand=None): super().__init__() self.decimal = decimal - self.thousand = thousand + self.thousand = thousand if thousand is not None else "" def fit_transform(self, column, y=None): """Fit the encoder and transform a column. From ec7d687e686223cea13e6ddb8baee845ed4ee2ca Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 15 Dec 2025 18:00:52 +0100 Subject: [PATCH 11/23] WIP: Regex modification for polars --- skrub/_to_float.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index dd66a40d3..6c6010b01 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -14,17 +14,15 @@ def _build_number_regex(decimal, thousand): integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)" decimal_part = rf"(?:{d}\d+)?" scientific = r"(?:[eE][+-]?\d+)?" - - return rf""" - ^ - \(? - [+-]? - {integer} - {decimal_part} - {scientific} - \)? - $ - """ + return rf"^\(?[+-]?(?:{integer}{decimal_part}{scientific})?\)?$" + # return rf""" + # ^ + # \(? + # [+-]? + # (?:{integer}{decimal_part}{scientific})? + # \)? 
+ # $ + # """ @dispatch @@ -34,14 +32,15 @@ def _str_is_valid_number(col, number_re): @_str_is_valid_number.specialize("pandas", argument_type="Column") def _str_is_valid_number_pandas(col, number_re): - if not col.str.match(number_re, na=False).all(): + if not col.fillna("").str.match(number_re, na=False).all(): raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.") return True @_str_is_valid_number.specialize("polars", argument_type="Column") def _str_is_valid_number_polars(col, number_re): - if not col.str.contains(number_re.pattern).all(): + # pattern = re.sub(r'\s+', '', number_re.pattern) + if not col.fill_null("").str.contains(number_re.pattern, literal=False).all(): raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.") return True From 3e6dea1b6bd159826da9351ef453a72981cbbd0e Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Wed, 17 Dec 2025 12:13:39 +0100 Subject: [PATCH 12/23] Improved tests --- skrub/_to_float.py | 22 ++++------ skrub/tests/test_to_float.py | 82 +++++++++++++++++++----------------- 2 files changed, 53 insertions(+), 51 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 6c6010b01..cbf07fa77 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -12,17 +12,10 @@ def _build_number_regex(decimal, thousand): t = re.escape(thousand) integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)" - decimal_part = rf"(?:{d}\d+)?" + decimal_part = rf"{d}\d+" scientific = r"(?:[eE][+-]?\d+)?" - return rf"^\(?[+-]?(?:{integer}{decimal_part}{scientific})?\)?$" - # return rf""" - # ^ - # \(? - # [+-]? - # (?:{integer}{decimal_part}{scientific})? - # \)? 
- # $ - # """ + number = rf"(?:{integer}(?:{decimal_part})?|{decimal_part})" + return rf"^\(?[+-]?(?:{number}{scientific})?\)?$" @dispatch @@ -39,7 +32,6 @@ def _str_is_valid_number_pandas(col, number_re): @_str_is_valid_number.specialize("polars", argument_type="Column") def _str_is_valid_number_polars(col, number_re): - # pattern = re.sub(r'\s+', '', number_re.pattern) if not col.fill_null("").str.contains(number_re.pattern, literal=False).all(): raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.") return True @@ -60,7 +52,7 @@ def _str_replace_pandas(col, decimal, thousand): @_str_replace.specialize("polars", argument_type="Column") def _str_replace_polars(col, decimal, thousand): col = col.str.replace_all(r"^\((.*)\)$", r"-$1") - col = col.str.replace_all(thousand, "") + col = col.str.replace_all(thousand, "", literal=True) return col.str.replace_all(f"[{decimal}]", ".") @@ -259,7 +251,7 @@ class ToFloat(SingleColumnTransformer): def __init__(self, decimal=".", thousand=None): super().__init__() self.decimal = decimal - self.thousand = thousand if thousand is not None else "" + self.thousand = thousand def fit_transform(self, column, y=None): """Fit the encoder and transform a column. 
@@ -279,6 +271,10 @@ def fit_transform(self, column, y=None): """ del y self.all_outputs_ = [sbd.name(column)] + if self.thousand is None: + self.thousand = "" # No thousand separator + if self.decimal is None: + raise ValueError("The decimal separator cannot be None.") if self.thousand == self.decimal: raise ValueError("The thousand and decimal separators must differ.") diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py index a7498e0a6..5d014d313 100644 --- a/skrub/tests/test_to_float.py +++ b/skrub/tests/test_to_float.py @@ -5,7 +5,7 @@ from skrub._single_column_transformer import RejectColumn from skrub._to_categorical import ToCategorical from skrub._to_datetime import ToDatetime -from skrub._to_float import ToFloat, _str_replace +from skrub._to_float import ToFloat from skrub.conftest import skip_polars_installed_without_pyarrow @@ -46,46 +46,52 @@ def test_rejected_columns(df_module): @pytest.mark.parametrize( - "input_str, expected_float, decimal", + "input_str, expected_float, decimal, thousand", [ - ("1,234.56", 1234.56, "."), - ("1.234,56", 1234.56, ","), - ("1 234,56", 1234.56, ","), - ("1234.56", 1234.56, "."), - ("1234,56", 1234.56, ","), - ("1,234,567.89", 1234567.89, "."), - ("1.234.567,89", 1234567.89, ","), - ("1 234 567,89", 1234567.89, ","), - ("1'234'567.89", 1234567.89, "."), - ("1.23e+4", 12300.0, "."), - ("1.23E+4", 12300.0, "."), - ("1,23e+4", 12300.0, ","), - ("1,23E+4", 12300.0, ","), - ("-1,234.56", -1234.56, "."), - ("-1.234,56", -1234.56, ","), - ("(1,234.56)", -1234.56, "."), - ("(1.234,56)", -1234.56, ","), - ("1,23,456.78", 123456.78, "."), - ("12,3456.78", 123456.78, "."), - (".56", 0.56, "."), - (",56", 0.56, ","), + # valid numbers + ("1,234.56", 1234.56, ".", ","), + ("1.234,56", 1234.56, ",", "."), + ("1 234,56", 1234.56, ",", " "), + ("1234.56", 1234.56, ".", None), + ("1234,56", 1234.56, ",", None), + ("1,234,567.89", 1234567.89, ".", ","), + ("1.234.567,89", 1234567.89, ",", "."), + ("1 234 567,89", 
1234567.89, ",", " "), + ("1'234'567.89", 1234567.89, ".", "'"), + ("1.23e+4", 12300.0, ".", None), + ("1.23E+4", 12300.0, ".", None), + ("-1,234.56", -1234.56, ".", ","), + ("(1,234.56)", -1234.56, ".", ","), + (".56", 0.56, ".", None), + (",56", 0.56, ",", None), + ("56", 56.0, ".", None), ], ) -def test_number_parsing(input_str, expected_float, decimal, df_module): +def test_number_parsing_valid(input_str, expected_float, decimal, thousand, df_module): column = df_module.make_column("col", [input_str]) - result = ToFloat(decimal=decimal).fit_transform(column) + result = ToFloat(decimal=decimal, thousand=thousand).fit_transform(column) + assert np.allclose(result[0], expected_float) - np.allclose(result[0], expected_float) - -def test_str_replace(df_module): - s = df_module.make_column("x", ["1,234.56", "7.890,12", "3 456,78", "9'012.34"]) - result_dot = _str_replace(s, decimal=".") - - expected_dot = df_module.make_column( - "x", ["1234.56", "7890.12", "3456.78", "9012.34"] - ) - np.all(sbd.to_list(result_dot) == sbd.to_list(expected_dot)) - - with pytest.raises(TypeError): - _str_replace([1, 2, 3], decimal=".") +@pytest.mark.parametrize( + "input_str, decimal, thousand", + [ + # invalid grouping + ("1,23,456.78", ".", ","), + ("1.2.3.4", ".", None), + ("1.2.3.4,0", ",", "."), + ("12,3456.78", ".", ","), + ("1 234,567.34", ".", ","), + ("1'234,567.34", ".", ","), + ("1'234'234,567.34", ",", "'"), + ("123.45.67", ".", None), + ("1,,234", ".", ","), + ("1.23,45", ".", ","), + # decimal == thousand + ("123,456,789", ",", ","), + ], +) +def test_number_parsing_invalid(input_str, decimal, thousand, df_module): + column = df_module.make_column("col", [input_str]) + with pytest.raises((RejectColumn, ValueError)): + ToFloat(decimal=decimal, thousand=thousand).fit_transform(column) From f8e63a63288219b87997d6c9bcb620e7237b4729 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Wed, 17 Dec 2025 13:14:22 +0100 Subject: [PATCH 13/23] Improving the docstrings and 
documentation --- .../feature_engineering_numerical.rst | 24 +++++++++++-------- skrub/_to_float.py | 10 +++++--- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst index 535883d93..14ef3d888 100644 --- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -32,9 +32,13 @@ The |ToFloat| transformer provides: - **Automatic conversion to 32-bit floating-point values (`float32`).** This dtype is lightweight and fully supported by scikit-learn estimators. -- **Automatic parsing of decimal separators**, regardless of locale: - - ``.`` or ``,`` can be used as decimal point - - thousands separators (``.``, ``,``, space, apostrophe) are removed automatically +- **Automatic parsing of decimal and thousands separators**, regardless of locale: + - The decimal separator must be specified explicitly and can be either ``.`` or ``,`` + - The thousands separator can be one of ``.``, ``,``, space (``" "``), apostrophe (``'``), + or None (no thousands separator) + - The transformer supports integers, decimals (including leading-decimal forms such as .56 or ,56), scientific notation + and negative numbers (including parentheses) + - Decimal and thousands separators must be different characters - **Parentheses interpreted as negative numbers**, a common format in financial datasets: - ``(1,234.56)`` → ``-1234.56`` @@ -54,9 +58,9 @@ How to use |ToFloat| -------------------- The |ToFloat| transformer must be applied to individual columns. It behaves like a standard scikit-learn transformer. -Each column is expected to use a single decimal separator, which is -specified through the ``decimal`` parameter. If this parameter is not provided, -the default decimal separator is ``'.'``. 
+Each column is expected to use a single decimals and thousands separator, which is +specified through the ``decimal`` and ``thousand`` parameter. If this parameter is not provided, +the default decimal separators are ``'.'`` and ``None``. During ``fit``, |ToFloat| attempts to convert all values in the column to numeric values after automatically removing other possible thousands separators @@ -81,11 +85,11 @@ Parsing numeric-formatted strings: Name: x, dtype: float32 Locale-dependent decimal separators can be handled by specifying the -``decimal`` parameter. Here we use comma as decimal separator, and -remove spaces and apostrophes as thousands separators: +``decimal`` and ``thousand`` parameter. Here we use comma as decimal separator, and +a spaces as thousands separators: ->>> s = pd.Series(["4 567,89", "4'567,89"], name="x") ->>> ToFloat(decimal=",").fit_transform(s) +>>> s = pd.Series(["4 567,89", "1 234 567,89"], name="x") +>>> ToFloat(decimal=",", thousand=" ").fit_transform(s) 0 4567.8... 1 4567.8... Name: x, dtype: float32 diff --git a/skrub/_to_float.py b/skrub/_to_float.py index cbf07fa77..a58f4046d 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -80,6 +80,10 @@ class ToFloat(SingleColumnTransformer): Character to recognize as the decimal separator when converting from strings to floats. Other possible decimal separators are removed from the strings before conversion. + thousand : str or None, default=None + Character used as thousands separator. Supported values are ``"."``, + ``,``, space (``" "``), apostrophe (``"'"``), or ``None`` (no thousands + separator). The decimal and thousands separators must differ. 
Examples -------- @@ -240,9 +244,9 @@ class ToFloat(SingleColumnTransformer): 1 12300.0 Name: x, dtype: float32 - Space or apostrophe as thousand separator - >>> s = pd.Series(["4 567,89", "4'567,89"], name="x") - >>> ToFloat(decimal=",").fit_transform(s) + Space as thousand separator + >>> s = pd.Series(["4 567,89", "1 234 567,89"], name="x") + >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) 0 4567.8... 1 4567.8... Name: x, dtype: float32 From 50d9b4746d78a9a6329d55288c1eb62693059531 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Wed, 17 Dec 2025 13:28:31 +0100 Subject: [PATCH 14/23] Improving documentation --- .../feature_engineering_numerical.rst | 4 ++-- skrub/_to_float.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst index 14ef3d888..0f13e0ee1 100644 --- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -88,10 +88,10 @@ Locale-dependent decimal separators can be handled by specifying the ``decimal`` and ``thousand`` parameter. Here we use comma as decimal separator, and a spaces as thousands separators: ->>> s = pd.Series(["4 567,89", "1 234 567,89"], name="x") +>>> s = pd.Series(["4 567,89", "12 567,89"], name="x") >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) 0 4567.8... -1 4567.8... +1 12567.8... 
Name: x, dtype: float32 Parentheses interpreted as negative numbers: diff --git a/skrub/_to_float.py b/skrub/_to_float.py index a58f4046d..a99be8a99 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -231,7 +231,7 @@ class ToFloat(SingleColumnTransformer): Handling parentheses around negative numbers >>> s = pd.Series(["-1,234.56", "1,234.56", "(1,234.56)"], name='parens') - >>> to_float.fit_transform(s) + >>> ToFloat(decimal=".", thousand=",").fit_transform(s) 0 -1234.5... 1 1234.5... 2 -1234.5... @@ -245,10 +245,10 @@ class ToFloat(SingleColumnTransformer): Name: x, dtype: float32 Space as thousand separator - >>> s = pd.Series(["4 567,89", "1 234 567,89"], name="x") - >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) + >>> s = pd.Series(["4 567,89", "12 567,89"], name="x") + >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) # doctest: +ELLIPSIS 0 4567.8... - 1 4567.8... + 1 12567.8... Name: x, dtype: float32 """ # noqa: E501 From 36d1d8f4e1565cdc60ab9e6fc830829554b8d6fb Mon Sep 17 00:00:00 2001 From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:19:13 +0100 Subject: [PATCH 15/23] Update doc/modules/column_level_featurizing/feature_engineering_numerical.rst Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> --- .../feature_engineering_numerical.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst index 0f13e0ee1..1274fa8f0 100644 --- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -37,12 +37,10 @@ The |ToFloat| transformer provides: - The thousands separator can be one of ``.``, ``,``, space (``" "``), apostrophe (``'``), or None (no thousands separator) - The transformer supports integers, decimals 
(including leading-decimal forms such as .56 or ,56), scientific notation - and negative numbers (including parentheses) + and negative numbers + - Numbers in parentheses are interpreted as negative numbers (``(1,234.56)`` → ``-1234.56``). This format is more common in financial datasets. - Decimal and thousands separators must be different characters -- **Parentheses interpreted as negative numbers**, a common format in financial datasets: - - ``(1,234.56)`` → ``-1234.56`` - - **Scientific notation parsing** (e.g. ``1.23e+4``) - **Graceful handling of invalid or non-numeric values during transform**: From 0f149f1e312af1f308ca9b8cdf8bbf940f9496e1 Mon Sep 17 00:00:00 2001 From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:19:31 +0100 Subject: [PATCH 16/23] Update doc/modules/column_level_featurizing/feature_engineering_numerical.rst Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> --- .../feature_engineering_numerical.rst | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst index 1274fa8f0..8abfc76f6 100644 --- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -54,11 +54,13 @@ scikit-learn transformer and is fully compatible with pipelines. How to use |ToFloat| -------------------- -The |ToFloat| transformer must be applied to individual columns. It behaves like -a standard scikit-learn transformer. -Each column is expected to use a single decimals and thousands separator, which is -specified through the ``decimal`` and ``thousand`` parameter. If this parameter is not provided, -the default decimal separators are ``'.'`` and ``None``. 
+The |ToFloat| transformer must be applied to individual columns, and it behaves +like a standard scikit-learn transformer. +|ToFloat| requires a ``decimal`` and a ``thousands`` separator, which are ``'.'`` and +``None`` (no thousands separator) by default. +Each column is expected to use a single separator for decimals, and one for thousands: +if any characters other than the provided selectors are encountered in the column, it will not +be converted. During ``fit``, |ToFloat| attempts to convert all values in the column to numeric values after automatically removing other possible thousands separators From 415aec18b6e1b9febe6bcef39a882b6549f0cfb1 Mon Sep 17 00:00:00 2001 From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:20:06 +0100 Subject: [PATCH 17/23] Update doc/modules/column_level_featurizing/feature_engineering_numerical.rst Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> --- .../column_level_featurizing/feature_engineering_numerical.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst index 8abfc76f6..1ffa4b19b 100644 --- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -86,7 +86,7 @@ Name: x, dtype: float32 Locale-dependent decimal separators can be handled by specifying the ``decimal`` and ``thousand`` parameter. 
Here we use comma as decimal separator, and -a spaces as thousands separators: +a space as thousands separator: >>> s = pd.Series(["4 567,89", "12 567,89"], name="x") >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) From a19e149a4a07e0108b16b48f9c8a2fba6ecd270e Mon Sep 17 00:00:00 2001 From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:23:21 +0100 Subject: [PATCH 18/23] Update skrub/_to_float.py Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> --- skrub/_to_float.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index a99be8a99..a6d95d704 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -81,9 +81,9 @@ class ToFloat(SingleColumnTransformer): strings to floats. Other possible decimal separators are removed from the strings before conversion. thousand : str or None, default=None - Character used as thousands separator. Supported values are ``"."``, - ``,``, space (``" "``), apostrophe (``"'"``), or ``None`` (no thousands - separator). The decimal and thousands separators must differ. + Character used as thousands separator. Supported values are ``"."``, + ``","``, space (``" "``), apostrophe (``"'"``), or ``None`` (no thousands + separator). The decimal and thousands separators must differ. 
Examples -------- From 1754d073600c867a1be5e4e44981296bea031446 Mon Sep 17 00:00:00 2001 From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:24:15 +0100 Subject: [PATCH 19/23] Update skrub/_to_float.py Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> --- skrub/_to_float.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index a6d95d704..965d2a3f9 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -229,7 +229,8 @@ class ToFloat(SingleColumnTransformer): >>> to_float.fit_transform(s) is s True - Handling parentheses around negative numbers + Negative numbers represented using parentheses are converted + so they use "-" instead. >>> s = pd.Series(["-1,234.56", "1,234.56", "(1,234.56)"], name='parens') >>> ToFloat(decimal=".", thousand=",").fit_transform(s) 0 -1234.5... From db44c3e0db8d7d624cfa08133b29b7b3319ab8cd Mon Sep 17 00:00:00 2001 From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:25:16 +0100 Subject: [PATCH 20/23] Update skrub/_to_float.py Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> --- skrub/_to_float.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 965d2a3f9..c9d45ea3f 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -238,7 +238,7 @@ class ToFloat(SingleColumnTransformer): 2 -1234.5... 
dtype: float32 - Scientific notation + Numbers that use scientific notation are converted: >>> s = pd.Series(["1.23e+4", "1.23E+4"], name="x") >>> ToFloat(decimal=".").fit_transform(s) 0 12300.0 From 0b71c2611bf05a28b10acdcd0e034b87a5ea1078 Mon Sep 17 00:00:00 2001 From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com> Date: Wed, 17 Dec 2025 16:26:03 +0100 Subject: [PATCH 21/23] Update skrub/_to_float.py Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com> --- skrub/_to_float.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index c9d45ea3f..b0f4ef4df 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -245,7 +245,7 @@ class ToFloat(SingleColumnTransformer): 1 12300.0 Name: x, dtype: float32 - Space as thousand separator + It is possible to specify the thousands separator, e.g., to use " " >>> s = pd.Series(["4 567,89", "12 567,89"], name="x") >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) # doctest: +ELLIPSIS 0 4567.8... From 94435da87c421b5a0b8405c9a9fb92f874d959a1 Mon Sep 17 00:00:00 2001 From: GOMEZ JIMENEZ Gabriela Date: Mon, 19 Jan 2026 14:52:00 +0100 Subject: [PATCH 22/23] WIP --- skrub/_to_float.py | 44 +++++++++++++++++++++++++++++++++--- skrub/tests/test_to_float.py | 31 +++++++++++++++++++++++-- 2 files changed, 70 insertions(+), 5 deletions(-) diff --git a/skrub/_to_float.py b/skrub/_to_float.py index b0f4ef4df..5d39d88af 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -8,13 +8,35 @@ def _build_number_regex(decimal, thousand): - d = re.escape(decimal) - t = re.escape(thousand) - + # Escape decimal and thousand separators to use in regex + d = re.escape(decimal) # e.g., '.' → '\.', ',' → '\,' + t = re.escape(thousand) # e.g., ',' → '\,', '.' → '\.' 
+ + # Matches integer parts: + # Either: + # - one or more digits without thousand separators: \d+ + # - or digits grouped by thousand separators: \d{1,3}(?:{t}\d{3})+ + # e.g., '1,234' or '12,345,678' integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)" + + # Matches decimal part after the decimal separator + # e.g., '.456' or ',456' depending on locale decimal_part = rf"{d}\d+" + + # Matches optional scientific notation + # e.g., 'e10', 'E-5', 'e+3' scientific = r"(?:[eE][+-]?\d+)?" + + # Full number can be: + # - integer with optional decimal part + # - or only decimal part (like '.5') number = rf"(?:{integer}(?:{decimal_part})?|{decimal_part})" + + # Final regex: + # - optional parentheses around the number: \( ... \)? + # - optional leading + or - sign: [+-]? + # - optional scientific notation follows `number` + # Anchored to start (^) and end ($) of string return rf"^\(?[+-]?(?:{number}{scientific})?\)?$" @@ -25,6 +47,10 @@ def _str_is_valid_number(col, number_re): @_str_is_valid_number.specialize("pandas", argument_type="Column") def _str_is_valid_number_pandas(col, number_re): + # Check if all values in the column match the number regex. + # - Fill NaN values with empty string to avoid match errors. + # - Use `str.match` with `na=False` to treat empty/missing values as non-matching. + # - If any value does not match, raise RejectColumn with a descriptive message. if not col.fillna("").str.match(number_re, na=False).all(): raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.") return True @@ -32,6 +58,10 @@ def _str_is_valid_number_pandas(col, number_re): @_str_is_valid_number.specialize("polars", argument_type="Column") def _str_is_valid_number_polars(col, number_re): + # Check if all values in the column match the number regex. + # - Fill null values with empty string, which the regex accepts, so missing values are not rejected. + # - Use `str.contains` with `literal=False` so the anchored pattern is applied as a regex. 
+ # - If any value does not match, raise RejectColumn with a descriptive message. if not col.fill_null("").str.contains(number_re.pattern, literal=False).all(): raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.") return True @@ -44,15 +74,23 @@ def _str_replace(col, strict=True): @_str_replace.specialize("pandas", argument_type="Column") def _str_replace_pandas(col, decimal, thousand): + # Replace parentheses around numbers with a leading minus sign + # e.g., "(123.45)" → "-123.45" col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True) + # Remove thousand separators col = col.str.replace(thousand, "", regex=False) + # Replace decimal separator with '.' return col.str.replace(decimal, ".", regex=False) @_str_replace.specialize("polars", argument_type="Column") def _str_replace_polars(col, decimal, thousand): + # Replace parentheses around numbers with a leading minus sign + # e.g., "(123.45)" → "-123.45" col = col.str.replace_all(r"^\((.*)\)$", r"-$1") + # Remove thousand separators col = col.str.replace_all(thousand, "", literal=True) + # Replace decimal separator with '.' 
return col.str.replace_all(f"[{decimal}]", ".") diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py index 5d014d313..a642ac5e7 100644 --- a/skrub/tests/test_to_float.py +++ b/skrub/tests/test_to_float.py @@ -87,11 +87,38 @@ def test_number_parsing_valid(input_str, expected_float, decimal, thousand, df_m ("123.45.67", ".", None), ("1,,234", ".", ","), ("1.23,45", ".", ","), - # decimal == thousand - ("123,456,789", ",", ","), ], ) def test_number_parsing_invalid(input_str, decimal, thousand, df_module): column = df_module.make_column("col", [input_str]) with pytest.raises((RejectColumn, ValueError)): ToFloat(decimal=decimal, thousand=thousand).fit_transform(column) + + +@pytest.mark.parametrize( + "decimal, thousand", + [ + # invalid because decimal and thousand are the same + (",", ","), + (".", "."), + # invalid because decimal is None + (None, ","), + (None, None), + ], +) +def test_invalid_parameters(decimal, thousand, df_module): + """ + Test that ToFloat raises an exception if the parameters are invalid: + - decimal is None → ValueError + - thousand == decimal → ValueError + """ + column = df_module.make_column("col", ["123", "456"]) + + if decimal is None: + with pytest.raises(ValueError, match="decimal separator cannot be None"): + ToFloat(decimal=decimal, thousand=thousand).fit_transform(column) + else: + with pytest.raises( + ValueError, match="thousand and decimal separators must differ" + ): + ToFloat(decimal=decimal, thousand=thousand).fit_transform(column) From 095f403803d1a99c8d6fcf908a8402a5da858756 Mon Sep 17 00:00:00 2001 From: Riccardo Cappuzzo Date: Tue, 20 Jan 2026 11:55:09 +0100 Subject: [PATCH 23/23] Reverting changes and cleaning up history --- CHANGES.rst | 6 +++--- .../feature_engineering_numerical.rst | 14 +++++++------- skrub/_table_vectorizer.py | 4 ++-- skrub/_to_float.py | 6 ++---- 4 files changed, 14 insertions(+), 16 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 8ad38c571..f7e00c83f 100644 --- 
a/CHANGES.rst +++ b/CHANGES.rst @@ -11,6 +11,9 @@ Ongoing Development New features ------------ +- :class:`ToFloat` has the parameters ``decimal`` and ``thousand`` to let the user specify the decimal separator (',' or '.') + and the thousands separator, and it also handles negative numbers indicated with parentheses. + :pr:`1772` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`. Changes ------- @@ -73,9 +76,6 @@ New features - :class:`TableReport` now includes the ``open_tab`` parameter, which lets the user select which tab should be opened when the ``TableReport`` is rendered. :pr:`1737` by :user:`Riccardo Cappuzzo`. -- :class:`ToFloat32` has the parameter decimal to let the user specify whether they use ',' or '.' - as decimal separator and it also handles negative numbers indicated with parentheses. - :pr:`1772` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`. Changes ------- diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst index 1ffa4b19b..a1aae9eda 100644 --- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst +++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst @@ -55,12 +55,12 @@ scikit-learn transformer and is fully compatible with pipelines. How to use |ToFloat| -------------------- The |ToFloat| transformer must be applied to individual columns, and it behaves -like a standard scikit-learn transformer. -|ToFloat| requires a ``decimal`` and a ``thousands`` separator, which are ``'.'`` and +like a standard scikit-learn transformer. +|ToFloat| requires a ``decimal`` and a ``thousand`` separator, which are ``'.'`` and ``None`` (no thousands separator) by default. -Each column is expected to use a single separator for decimals, and one for thousands: -if any characters other than the provided selectors are encountered in the column, it will not -be converted. 
+Each column is expected to use a single separator for decimals, and one for thousands: +if any characters other than the provided separators are encountered in the column, it will not +be converted. During ``fit``, |ToFloat| attempts to convert all values in the column to numeric values after automatically removing other possible thousands separators @@ -97,7 +97,7 @@ Name: x, dtype: float32 Parentheses interpreted as negative numbers: >>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg") ->>> ToFloat().fit_transform(s) +>>> ToFloat(thousand=",").fit_transform(s) 0 -1234.5... 1 -1234.5... Name: neg, dtype: float32 @@ -116,7 +116,7 @@ Columns that cannot be converted are rejected during ``fit``: >>> ToFloat(decimal=".").fit_transform(s) Traceback (most recent call last): ... -skrub._apply_to_cols.RejectColumn: Could not convert column 'x' to numbers. +skrub._single_column_transformer.RejectColumn: Could not convert column 'x' to numbers. During ``transform``, invalid entries become ``NaN`` instead of raising an error: diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 3e7ade92d..f3613a670 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -693,7 +693,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator): We can inspect all the processing steps that were applied to a given column: >>> vectorizer.all_processing_steps_['B'] - [CleanNullStrings(), DropUninformative(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat(), 'B_month': ToFloat(), ...}] + [CleanNullStrings(), DropUninformative(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat(thousand=''), 'B_month': ToFloat(thousand=''), ...}] Note that as the encoder (``DatetimeEncoder()`` above) produces multiple columns, the last processing step is not described by a single transformer @@ -768,7 +768,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator): ``ToDatetime()``: >>> vectorizer.all_processing_steps_ - {'A': [Drop()], 'B': [OrdinalEncoder()], 
'C': [CleanNullStrings(), DropUninformative(), ToFloat(), PassThrough(), {'C': ToFloat()}]} + {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), DropUninformative(), ToFloat(thousand=''), PassThrough(), {'C': ToFloat(thousand='')}]} Specifying several ``specific_transformers`` for the same column is not allowed. diff --git a/skrub/_to_float.py b/skrub/_to_float.py index 5d39d88af..b34463274 100644 --- a/skrub/_to_float.py +++ b/skrub/_to_float.py @@ -52,7 +52,7 @@ def _str_is_valid_number_pandas(col, number_re): # - Use `str.match` with `na=False` to treat empty/missing values as non-matching. # - If any value does not match, raise RejectColumn with a descriptive message. if not col.fillna("").str.match(number_re, na=False).all(): - raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.") + raise RejectColumn(f"Could not convert column {sbd.name(col)!r} to numbers.") return True @@ -294,7 +294,7 @@ class ToFloat(SingleColumnTransformer): def __init__(self, decimal=".", thousand=None): super().__init__() self.decimal = decimal - self.thousand = thousand + self.thousand = "" if thousand is None else thousand def fit_transform(self, column, y=None): """Fit the encoder and transform a column. @@ -314,8 +314,6 @@ def fit_transform(self, column, y=None): """ del y self.all_outputs_ = [sbd.name(column)] - if self.thousand is None: - self.thousand = "" # No thousand separator if self.decimal is None: raise ValueError("The decimal separator cannot be None.") if self.thousand == self.decimal: