From e5fe91f94e850391ac3d41c8d3a27f924d6f5873 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 24 Nov 2025 17:17:47 +0100
Subject: [PATCH 01/23] WIP: Adding decimal conversion and tests

---
 skrub/_to_float.py           | 28 ++++++++++++++++++++++++++++
 skrub/tests/test_to_float.py | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 6a167ebea..9c0a84219 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -1,8 +1,28 @@
 from . import _dataframe as sbd
+from ._dispatch import dispatch, raise_dispatch_unregistered_type
 from ._single_column_transformer import RejectColumn, SingleColumnTransformer
 
 __all__ = ["ToFloat"]
 
+POSSIBLE_SEPARATORS = [".", ",", "'", " "]
+
+
+@dispatch
+def _str_replace(col, pattern, strict=True):
+    raise_dispatch_unregistered_type(col, kind="Series")
+
+
+@_str_replace.specialize("pandas", argument_type="Column")
+def _str_replace_pandas(col, pattern):
+    col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True)
+    return col.str.replace("".join(pattern), "", regex=True)
+
+
+@_str_replace.specialize("polars", argument_type="Column")
+def _str_replace_polars(col, pattern):
+    col = col.str.replace(r"^\((.*)\)$", r"-\1")
+    return col.str.replace_all("".join(pattern), "")
+
 
 class ToFloat(SingleColumnTransformer):
     """
@@ -167,6 +187,10 @@ class ToFloat(SingleColumnTransformer):
     True
     """  # noqa: E501
 
+    def __init__(self, decimal="."):
+        super().__init__()
+        self.decimal = decimal
+
     def fit_transform(self, column, y=None):
         """Fit the encoder and transform a column.
 
@@ -191,6 +215,10 @@ def fit_transform(self, column, y=None):
                 f"with dtype '{sbd.dtype(column)}' to numbers."
             )
         try:
+            if sbd.is_string(column):
+                p = POSSIBLE_SEPARATORS.copy()
+                p.remove(self.decimal)
+                column = _str_replace(column, pattern=p)
             numeric = sbd.to_float32(column, strict=True)
             return numeric
         except Exception as e:
diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py
index 5f777f061..201891567 100644
--- a/skrub/tests/test_to_float.py
+++ b/skrub/tests/test_to_float.py
@@ -43,3 +43,36 @@ def test_rejected_columns(df_module):
             ToFloat().fit_transform(col)
         to_float = ToFloat().fit(df_module.make_column("c", [1.1]))
         assert is_float32(df_module, to_float.transform(col))
+
+
+@pytest.mark.parametrize(
+    "input_str, expected_float, decimal",
+    [
+        ("1,234.56", 1234.56, "."),
+        ("1.234,56", 1234.56, ","),
+        ("1 234,56", 1234.56, ","),
+        ("1234.56", 1234.56, "."),
+        ("1234,56", 1234.56, ","),
+        ("1,234,567.89", 1234567.89, "."),
+        ("1.234.567,89", 1234567.89, ","),
+        ("1 234 567,89", 1234567.89, ","),
+        ("1'234'567.89", 1234567.89, "."),
+        ("1.23e+4", 12300.0, "."),
+        ("1.23E+4", 12300.0, "."),
+        ("1,23e+4", 12300.0, ","),
+        ("1,23E+4", 12300.0, ","),
+        ("-1,234.56", -1234.56, "."),
+        ("-1.234,56", -1234.56, ","),
+        ("(1,234.56)", -1234.56, "."),
+        ("(1.234,56)", -1234.56, ","),
+        ("1,23,456.78", 123456.78, "."),
+        ("12,3456.78", 123456.78, "."),
+        (".56", 0.56, "."),
+        (",56", 0.56, ","),
+    ],
+)
+def test_number_parsing(input_str, expected_float, decimal, df_module):
+    column = df_module.make_column("col", [input_str])
+    result = ToFloat(decimal=decimal).fit_transform(column)
+
+    assert result == expected_float

From c1862010cd6c78b09f1d18cd615418c91c8a5cc2 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 24 Nov 2025 18:34:04 +0100
Subject: [PATCH 02/23] Added tests and examples

---
 CHANGES.rst                  |  3 +++
 skrub/_to_float.py           | 39 ++++++++++++++++++++++++++++++------
 skrub/tests/test_to_float.py |  2 +-
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 7918707fd..8ad38c571 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -73,6 +73,9 @@ New features
 - :class:`TableReport` now includes the ``open_tab`` parameter, which lets the
   user select which tab should be opened when the ``TableReport`` is
   rendered. :pr:`1737` by :user:`Riccardo Cappuzzo<rcap107>`.
+- :class:`ToFloat` has a new ``decimal`` parameter to let the user specify whether ``','`` or ``'.'``
+  is used as the decimal separator; it also handles negative numbers indicated with parentheses.
+  :pr:`1772` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`.
 
 Changes
 -------
diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 9c0a84219..3964969f3 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -13,15 +13,17 @@ def _str_replace(col, pattern, strict=True):
 
 
 @_str_replace.specialize("pandas", argument_type="Column")
-def _str_replace_pandas(col, pattern):
+def _str_replace_pandas(col, pattern, decimal):
     col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True)
-    return col.str.replace("".join(pattern), "", regex=True)
+    col = col.str.replace("[" + "".join(pattern) + "]", "", regex=True)
+    return col.str.replace(decimal, ".", regex=False)
 
 
 @_str_replace.specialize("polars", argument_type="Column")
-def _str_replace_polars(col, pattern):
-    col = col.str.replace(r"^\((.*)\)$", r"-\1")
-    return col.str.replace_all("".join(pattern), "")
+def _str_replace_polars(col, pattern, decimal):
+    col = col.str.replace_all(r"^\((.*)\)$", r"-$1")
+    col = col.str.replace_all("[" + "".join(pattern) + "]", "")
+    return col.str.replace_all(f"[{decimal}]", ".")
 
 
 class ToFloat(SingleColumnTransformer):
@@ -185,6 +187,31 @@ class ToFloat(SingleColumnTransformer):
     >>> s = pd.Series([1.1, None], dtype='float32')
     >>> to_float.fit_transform(s) is s
     True
+
+    Handling parentheses around negative numbers
+    >>> s = pd.Series(["-1,234.56", "1.234,56", "(1,234.56)"], name='parens')
+    >>> to_float.fit_transform(s) #doctest: +SKIP
+    0   -1234.56
+    1    1234.56
+    2   -1234.56
+    dtype: float32
+
+    Scientific notation
+    >>> s = pd.Series(["1.23e+4", "1.23E+4"], name="x")
+    >>> ToFloat(decimal=".").fit_transform(s)
+    0    12300.0
+    1    12300.0
+    Name: x, dtype: float32
+
+
+    Space or apostrophe as thousand separator
+    >>> s = pd.Series(["1 234 567,89", "1'234'567,89"], name="x")
+    >>> ToFloat(decimal=",").fit_transform(s)
+    0    1234567.89
+    1    1234567.89
+    Name: x, dtype: float32
+
+
     """  # noqa: E501
 
     def __init__(self, decimal="."):
@@ -218,7 +245,7 @@ def fit_transform(self, column, y=None):
             if sbd.is_string(column):
                 p = POSSIBLE_SEPARATORS.copy()
                 p.remove(self.decimal)
-                column = _str_replace(column, pattern=p)
+                column = _str_replace(column, pattern=p, decimal=self.decimal)
             numeric = sbd.to_float32(column, strict=True)
             return numeric
         except Exception as e:
diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py
index 201891567..326a2d084 100644
--- a/skrub/tests/test_to_float.py
+++ b/skrub/tests/test_to_float.py
@@ -75,4 +75,4 @@ def test_number_parsing(input_str, expected_float, decimal, df_module):
     column = df_module.make_column("col", [input_str])
     result = ToFloat(decimal=decimal).fit_transform(column)
 
-    assert result == expected_float
+    np.allclose(result[0], expected_float)

From 67f00c7783de9cf4ac53e5dec655f5b76cc36d80 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 1 Dec 2025 14:58:51 +0100
Subject: [PATCH 03/23] Added doctest skip

---
 skrub/_to_float.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 3964969f3..13e07201b 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -205,10 +205,10 @@ class ToFloat(SingleColumnTransformer):
 
 
     Space or apostrophe as thousand separator
-    >>> s = pd.Series(["1 234 567,89", "1'234'567,89"], name="x")
-    >>> ToFloat(decimal=",").fit_transform(s)
-    0    1234567.89
-    1    1234567.89
+    >>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
+    >>> ToFloat(decimal=",").fit_transform(s) #doctest: +SKIP
+    0    4567.89
+    1    4567.89
     Name: x, dtype: float32
 
 

From 47cc97d1479e9b77db07f4d110b4e4b8d5250ba2 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 1 Dec 2025 15:34:57 +0100
Subject: [PATCH 04/23] Added documentation

---
 .../feature_engineering_numerical.rst         | 123 ++++++++++++++++++
 skrub/_to_float.py                            |  10 +-
 2 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 doc/modules/column_level_featurizing/feature_engineering_numerical.rst

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
new file mode 100644
index 000000000..c6841a4d6
--- /dev/null
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -0,0 +1,123 @@
+.. |ToFloat| replace:: :class:`~skrub.ToFloat`
+.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer`
+.. |Cleaner| replace:: :class:`~skrub.Cleaner`
+
+.. _user_guide_feature_engineering_numeric_to_float:
+
+Converting heterogeneous numeric values to uniform float32
+==========================================================
+
+Many tabular datasets contain numeric information stored as strings, mixed
+representations, locale-specific formats, or other non-standard encodings.
+Common issues include:
+
+- Thousands separators (``1,234.56`` or ``1 234,56``)
+- Use of apostrophes as separators (``4'567.89``)
+- Negative numbers encoded inside parentheses (``(1,234.56)``)
+- String columns that contain mostly numeric values, but with occasional invalid entries
+
+To provide consistent numeric behavior, skrub includes the |ToFloat| transformer,
+which **standardizes all numeric-like columns to ``float32``** and handles a wide
+range of real-world formatting issues automatically.
+
+The |ToFloat| transformer is used internally by both the |Cleaner| class and the
+|TableVectorizer| to guarantee that downstream estimators receive clean and
+uniform numeric data.
+
+What |ToFloat| does
+-------------------
+
+The |ToFloat| transformer provides:
+
+- **Automatic conversion to 32-bit floating-point values** (``float32``).
+  This dtype is lightweight and fully supported by scikit-learn estimators.
+
+- **Parsing of locale-specific separators**: ``.`` or ``,`` can be used as the
+  decimal point (selected with the ``decimal`` parameter), and thousands
+  separators (``.``, ``,``, space, apostrophe) are removed automatically.
+
+- **Parentheses interpreted as negative numbers**, a common format in financial
+  datasets: ``(1,234.56)`` → ``-1234.56``.
+
+- **Scientific notation parsing** (e.g. ``1.23e+4``)
+
+- **Graceful handling of invalid or non-numeric values**: during ``fit``,
+  non-convertible values raise a ``RejectColumn`` exception; during
+  ``transform``, invalid entries become ``NaN`` instead of failing.
+
+- **Rejection of categorical and datetime columns**, which should not be cast to numeric.
+
+As with all skrub transformers, |ToFloat| behaves like a standard
+scikit-learn transformer and is fully compatible with pipelines.
+
+Examples
+--------
+
+Parsing numeric-formatted strings:
+
+>>> import pandas as pd
+>>> from skrub import ToFloat
+>>> s = pd.Series(['1.1', None, '3.3'], name='x')
+>>> ToFloat().fit_transform(s)
+0    1.1
+1    NaN
+2    3.3
+Name: x, dtype: float32
+
+Automatic handling of locale-dependent decimal separators:
+
+>>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
+>>> ToFloat(decimal=",").fit_transform(s)   # doctest: +SKIP
+0    4567.89
+1    4567.89
+Name: x, dtype: float32
+
+Parentheses interpreted as negative numbers:
+
+>>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg")
+>>> ToFloat().fit_transform(s)   # doctest: +SKIP
+0   -1234.56
+1   -1234.56
+Name: neg, dtype: float32
+
+Scientific notation:
+
+>>> s = pd.Series(["1.23e+4", "1.23E+4"])
+>>> ToFloat(decimal=".").fit_transform(s)
+0    12300.0
+1    12300.0
+dtype: float32
+
+Numeric, boolean, and extension dtypes are also standardized:
+
+>>> pd.Series([True, False])
+0    1.0
+1    0.0
+dtype: float32
+
+>>> pd.Series([1.1, 2.2], dtype="Float32")
+0    1.1
+1    2.2
+dtype: float32
+
+Columns that cannot be converted are rejected during ``fit``:
+
+>>> pd.Series(['1.1', 'hello'])
+Traceback (most recent call last):
+    ...
+skrub._apply_to_cols.RejectColumn: Could not convert column '...' to numbers.
+
+How |ToFloat| is used in skrub
+------------------------------
+
+The |ToFloat| transformer is used internally in:
+
+- the **Cleaner** (|Cleaner|), to normalize all numeric-like columns before modeling
+- the **|TableVectorizer|**, ensuring a consistent numeric dtype across all numeric features
+
+This makes |ToFloat| a core building block of skrub’s handling of heterogeneous
+tabular data.
+
+``ToFloat`` ensures that downstream machine-learning models receive numeric data
+that is clean, consistent, lightweight, and free of locale-specific quirks or
+string-encoded values.
diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 13e07201b..dca4cb339 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -44,6 +44,13 @@ class ToFloat(SingleColumnTransformer):
     During ``transform``, entries for which conversion fails are replaced by
     null values.
 
+    Parameters
+    ----------
+    decimal : str, default='.'
+        Character to recognize as the decimal separator when converting from
+        strings to floats. The other supported separators are treated as
+        thousands separators and removed from the strings before conversion.
+
     Examples
     --------
     >>> import pandas as pd
@@ -203,15 +210,12 @@ class ToFloat(SingleColumnTransformer):
     1    12300.0
     Name: x, dtype: float32
 
-
     Space or apostrophe as thousand separator
     >>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
     >>> ToFloat(decimal=",").fit_transform(s) #doctest: +SKIP
     0    4567.89
     1    4567.89
     Name: x, dtype: float32
-
-
     """  # noqa: E501
 
     def __init__(self, decimal="."):

From daa9557c4897e3e66897c62d7f6d86c62eb87c3d Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 1 Dec 2025 15:39:01 +0100
Subject: [PATCH 05/23] Added ellipsis on doctests

---
 skrub/_to_float.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index dca4cb339..dfe85b997 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -196,11 +196,11 @@ class ToFloat(SingleColumnTransformer):
     True
 
     Handling parentheses around negative numbers
-    >>> s = pd.Series(["-1,234.56", "1.234,56", "(1,234.56)"], name='parens')
-    >>> to_float.fit_transform(s) #doctest: +SKIP
-    0   -1234.56
-    1    1234.56
-    2   -1234.56
+    >>> s = pd.Series(["-1,234.56", "1,234.56", "(1,234.56)"], name='parens')
+    >>> to_float.fit_transform(s)
+    0   -1234.5...
+    1    1234.5...
+    2   -1234.5...
     dtype: float32
 
     Scientific notation
@@ -212,9 +212,9 @@ class ToFloat(SingleColumnTransformer):
 
     Space or apostrophe as thousand separator
     >>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
-    >>> ToFloat(decimal=",").fit_transform(s) #doctest: +SKIP
-    0    4567.89
-    1    4567.89
+    >>> ToFloat(decimal=",").fit_transform(s)
+    0    4567.8...
+    1    4567.8...
     Name: x, dtype: float32
     """  # noqa: E501
 

From 292a5c1746694b7c9babb6d19576b1cb41984f95 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 1 Dec 2025 16:08:15 +0100
Subject: [PATCH 06/23] Fixed example doc

---
 .../feature_engineering_numerical.rst           | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
index c6841a4d6..0ce2b0191 100644
--- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -88,24 +88,13 @@ Scientific notation:
 1    12300.0
 dtype: float32
 
-Numeric, boolean, and extension dtypes are also standardized:
-
->>> pd.Series([True, False])
-0    1.0
-1    0.0
-dtype: float32
-
->>> pd.Series([1.1, 2.2], dtype="Float32")
-0    1.1
-1    2.2
-dtype: float32
-
 Columns that cannot be converted are rejected during ``fit``:
 
->>> pd.Series(['1.1', 'hello'])
+>>> s = pd.Series(['1.1', 'hello'], name='x')
+>>> ToFloat(decimal=".").fit_transform(s)
 Traceback (most recent call last):
     ...
-skrub._apply_to_cols.RejectColumn: Could not convert column '...' to numbers.
+skrub._apply_to_cols.RejectColumn: Could not convert column 'x' to numbers.
 
 How |ToFloat| is used in skrub
 ------------------------------

From 6821b325f2c8c2672fe721cffcafea565e1d8964 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 1 Dec 2025 16:37:53 +0100
Subject: [PATCH 07/23] Improved users guide

---
 .../feature_engineering_numerical.rst         | 59 ++++++++++++-------
 1 file changed, 38 insertions(+), 21 deletions(-)

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
index 0ce2b0191..535883d93 100644
--- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -17,10 +17,10 @@ Common issues include:
 - String columns that contain mostly numeric values, but with occasional invalid entries
 
 To provide consistent numeric behavior, skrub includes the |ToFloat| transformer,
-which **standardizes all numeric-like columns to ``float32``** and handles a wide
+which standardizes all numeric-like columns to ``float32`` and handles a wide
 range of real-world formatting issues automatically.
 
-The |ToFloat| transformer is used internally by both the |Cleaner| class and the
+The |ToFloat| transformer is used internally by both the |Cleaner| and the
 |TableVectorizer| to guarantee that downstream estimators receive clean and
 uniform numeric data.
 
@@ -50,6 +50,22 @@ The |ToFloat| transformer provides:
 As with all skrub transformers, |ToFloat| behaves like a standard
 scikit-learn transformer and is fully compatible with pipelines.
 
+How to use |ToFloat|
+--------------------
+The |ToFloat| transformer must be applied to individual columns. It behaves like
+a standard scikit-learn transformer.
+Each column is expected to use a single decimal separator, which is
+specified through the ``decimal`` parameter. If this parameter is not provided,
+the default decimal separator is ``'.'``.
+
+During ``fit``, |ToFloat| attempts to convert all values in the column to
+numeric values after automatically removing other possible thousands separators
+(``,``, ``.``, space, apostrophe). If any value cannot be converted, the column
+is rejected with a ``RejectColumn`` exception.
+
+During ``transform``, invalid or non-convertible values are replaced by ``NaN``
+instead of raising an error.
+
 Examples
 --------
 
@@ -64,20 +80,22 @@ Parsing numeric-formatted strings:
 2    3.3
 Name: x, dtype: float32
 
-Automatic handling of locale-dependent decimal separators:
+Locale-dependent decimal separators can be handled by specifying the
+``decimal`` parameter. Here we use comma as decimal separator, and
+remove spaces and apostrophes as thousands separators:
 
 >>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
->>> ToFloat(decimal=",").fit_transform(s)   # doctest: +SKIP
-0    4567.89
-1    4567.89
+>>> ToFloat(decimal=",").fit_transform(s)
+0    4567.8...
+1    4567.8...
 Name: x, dtype: float32
 
 Parentheses interpreted as negative numbers:
 
 >>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg")
->>> ToFloat().fit_transform(s)   # doctest: +SKIP
-0   -1234.56
-1   -1234.56
+>>> ToFloat().fit_transform(s)
+0   -1234.5...
+1   -1234.5...
 Name: neg, dtype: float32
 
 Scientific notation:
@@ -96,17 +114,16 @@ Traceback (most recent call last):
     ...
 skrub._apply_to_cols.RejectColumn: Could not convert column 'x' to numbers.
 
-How |ToFloat| is used in skrub
-------------------------------
-
-The |ToFloat| transformer is used internally in:
 
-- the **Cleaner** (|Cleaner|), to normalize all numeric-like columns before modeling
-- the **|TableVectorizer|**, ensuring a consistent numeric dtype across all numeric features
-
-This makes |ToFloat| a core building block of skrub’s handling of heterogeneous
-tabular data.
+During ``transform``, invalid entries become ``NaN`` instead of raising an error:
+>>> s = pd.Series(['1.1', '2.2'], name='x')
+>>> to_float = ToFloat(decimal=".")
+>>> to_float.fit_transform(s)
+0    1.1
+1    2.2
+Name: x, dtype: float32
 
-``ToFloat`` ensures that downstream machine-learning models receive numeric data
-that is clean, consistent, lightweight, and free of locale-specific quirks or
-string-encoded values.
+>>> to_float.transform(pd.Series(['3.3', 'invalid'], name='x'))
+0    3.3
+1    NaN
+Name: x, dtype: float32

From 9df1ba24f12ebb4ba4a49bd4e85efa212aa3b9ce Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 1 Dec 2025 17:14:06 +0100
Subject: [PATCH 08/23] Fixed tests

---
 skrub/_to_float.py           | 14 ++++++++------
 skrub/tests/test_to_float.py | 15 ++++++++++++++-
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index dfe85b997..21dd327a9 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -8,19 +8,23 @@
 
 
 @dispatch
-def _str_replace(col, pattern, strict=True):
+def _str_replace(col, strict=True):
     raise_dispatch_unregistered_type(col, kind="Series")
 
 
 @_str_replace.specialize("pandas", argument_type="Column")
-def _str_replace_pandas(col, pattern, decimal):
+def _str_replace_pandas(col, decimal):
+    pattern = POSSIBLE_SEPARATORS.copy()
+    pattern.remove(decimal)
     col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True)
     col = col.str.replace("[" + "".join(pattern) + "]", "", regex=True)
     return col.str.replace(decimal, ".", regex=False)
 
 
 @_str_replace.specialize("polars", argument_type="Column")
-def _str_replace_polars(col, pattern, decimal):
+def _str_replace_polars(col, decimal):
+    pattern = POSSIBLE_SEPARATORS.copy()
+    pattern.remove(decimal)
     col = col.str.replace_all(r"^\((.*)\)$", r"-$1")
     col = col.str.replace_all("[" + "".join(pattern) + "]", "")
     return col.str.replace_all(f"[{decimal}]", ".")
@@ -247,9 +251,7 @@ def fit_transform(self, column, y=None):
             )
         try:
             if sbd.is_string(column):
-                p = POSSIBLE_SEPARATORS.copy()
-                p.remove(self.decimal)
-                column = _str_replace(column, pattern=p, decimal=self.decimal)
+                column = _str_replace(column, decimal=self.decimal)
             numeric = sbd.to_float32(column, strict=True)
             return numeric
         except Exception as e:
diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py
index 326a2d084..a7498e0a6 100644
--- a/skrub/tests/test_to_float.py
+++ b/skrub/tests/test_to_float.py
@@ -5,7 +5,7 @@
 from skrub._single_column_transformer import RejectColumn
 from skrub._to_categorical import ToCategorical
 from skrub._to_datetime import ToDatetime
-from skrub._to_float import ToFloat
+from skrub._to_float import ToFloat, _str_replace
 from skrub.conftest import skip_polars_installed_without_pyarrow
 
 
@@ -76,3 +76,16 @@ def test_number_parsing(input_str, expected_float, decimal, df_module):
     result = ToFloat(decimal=decimal).fit_transform(column)
 
     np.allclose(result[0], expected_float)
+
+
+def test_str_replace(df_module):
+    s = df_module.make_column("x", ["1,234.56", "7.890,12", "3 456,78", "9'012.34"])
+    result_dot = _str_replace(s, decimal=".")
+
+    expected_dot = df_module.make_column(
+        "x", ["1234.56", "7890.12", "3456.78", "9012.34"]
+    )
+    np.all(sbd.to_list(result_dot) == sbd.to_list(expected_dot))
+
+    with pytest.raises(TypeError):
+        _str_replace([1, 2, 3], decimal=".")

From 620bd12470f02d9b1ad76ea47eaec8330695ca68 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 15 Dec 2025 16:36:34 +0100
Subject: [PATCH 09/23] WIP: Improved column verification

---
 skrub/_to_float.py | 68 +++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 10 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 21dd327a9..5f54696d3 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -1,3 +1,5 @@
+import re
+
 from . import _dataframe as sbd
 from ._dispatch import dispatch, raise_dispatch_unregistered_type
 from ._single_column_transformer import RejectColumn, SingleColumnTransformer
@@ -7,26 +9,61 @@
 POSSIBLE_SEPARATORS = [".", ",", "'", " "]
 
 
+def _build_number_regex(decimal, thousand):
+    d = re.escape(decimal)
+    t = re.escape(thousand)
+
+    integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)"
+    decimal_part = rf"(?:{d}\d+)?"
+    scientific = r"(?:[eE][+-]?\d+)?"
+
+    return rf"""
+        ^
+        \(?
+        [+-]?
+        {integer}
+        {decimal_part}
+        {scientific}
+        \)?
+        $
+    """
+
+
+@dispatch
+def _str_is_valid_number(col, number_re):
+    raise_dispatch_unregistered_type(col, kind="Series")
+
+
+@_str_is_valid_number.specialize("pandas", argument_type="Column")
+def _str_is_valid_number_pandas(col, number_re):
+    if not col.str.match(number_re, na=False).all():
+        raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.")
+    return True
+
+
+@_str_is_valid_number.specialize("polars", argument_type="Column")
+def _str_is_valid_number_polars(col, number_re):
+    if not col.str.contains(number_re.pattern).all():
+        raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.")
+    return True
+
+
 @dispatch
 def _str_replace(col, strict=True):
     raise_dispatch_unregistered_type(col, kind="Series")
 
 
 @_str_replace.specialize("pandas", argument_type="Column")
-def _str_replace_pandas(col, decimal):
-    pattern = POSSIBLE_SEPARATORS.copy()
-    pattern.remove(decimal)
+def _str_replace_pandas(col, decimal, thousand):
     col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True)
-    col = col.str.replace("[" + "".join(pattern) + "]", "", regex=True)
+    col = col.str.replace(thousand, "", regex=False)
     return col.str.replace(decimal, ".", regex=False)
 
 
 @_str_replace.specialize("polars", argument_type="Column")
-def _str_replace_polars(col, decimal):
-    pattern = POSSIBLE_SEPARATORS.copy()
-    pattern.remove(decimal)
+def _str_replace_polars(col, decimal, thousand):
     col = col.str.replace_all(r"^\((.*)\)$", r"-$1")
-    col = col.str.replace_all("[" + "".join(pattern) + "]", "")
+    col = col.str.replace_all(thousand, "")
     return col.str.replace_all(f"[{decimal}]", ".")
 
 
@@ -222,9 +259,10 @@ class ToFloat(SingleColumnTransformer):
     Name: x, dtype: float32
     """  # noqa: E501
 
-    def __init__(self, decimal="."):
+    def __init__(self, decimal=".", thousand=","):
         super().__init__()
         self.decimal = decimal
+        self.thousand = thousand
 
     def fit_transform(self, column, y=None):
         """Fit the encoder and transform a column.
@@ -244,6 +282,9 @@ def fit_transform(self, column, y=None):
         """
         del y
         self.all_outputs_ = [sbd.name(column)]
+        if self.thousand == self.decimal:
+            raise ValueError("The thousand and decimal separators must differ.")
+
         if sbd.is_any_date(column) or sbd.is_categorical(column):
             raise RejectColumn(
                 f"Refusing to cast column {sbd.name(column)!r} "
@@ -251,7 +292,14 @@ def fit_transform(self, column, y=None):
             )
         try:
             if sbd.is_string(column):
-                column = _str_replace(column, decimal=self.decimal)
+                self._number_re_ = re.compile(
+                    _build_number_regex(self.decimal, self.thousand),
+                    re.VERBOSE,
+                )
+                _str_is_valid_number(column, self._number_re_)
+                column = _str_replace(
+                    column, decimal=self.decimal, thousand=self.thousand
+                )
             numeric = sbd.to_float32(column, strict=True)
             return numeric
         except Exception as e:

From 0be30f3a064b3fe863d8e84af8c104138276a863 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 15 Dec 2025 16:49:56 +0100
Subject: [PATCH 10/23] WIP: Removed pattern and include thousand separator

---
 skrub/_to_float.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 5f54696d3..dd66a40d3 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -6,8 +6,6 @@
 
 __all__ = ["ToFloat"]
 
-POSSIBLE_SEPARATORS = [".", ",", "'", " "]
-
 
 def _build_number_regex(decimal, thousand):
     d = re.escape(decimal)
@@ -259,10 +257,10 @@ class ToFloat(SingleColumnTransformer):
     Name: x, dtype: float32
     """  # noqa: E501
 
-    def __init__(self, decimal=".", thousand=","):
+    def __init__(self, decimal=".", thousand=None):
         super().__init__()
         self.decimal = decimal
-        self.thousand = thousand
+        self.thousand = thousand if thousand is not None else ""
 
     def fit_transform(self, column, y=None):
         """Fit the encoder and transform a column.

From ec7d687e686223cea13e6ddb8baee845ed4ee2ca Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 15 Dec 2025 18:00:52 +0100
Subject: [PATCH 11/23] WIP: Regex modification for polars

---
 skrub/_to_float.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index dd66a40d3..6c6010b01 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -14,17 +14,15 @@ def _build_number_regex(decimal, thousand):
     integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)"
     decimal_part = rf"(?:{d}\d+)?"
     scientific = r"(?:[eE][+-]?\d+)?"
-
-    return rf"""
-        ^
-        \(?
-        [+-]?
-        {integer}
-        {decimal_part}
-        {scientific}
-        \)?
-        $
-    """
+    return rf"^\(?[+-]?(?:{integer}{decimal_part}{scientific})?\)?$"
+    # return rf"""
+    #     ^
+    #     \(?
+    #     [+-]?
+    #     (?:{integer}{decimal_part}{scientific})?
+    #     \)?
+    #     $
+    # """
 
 
 @dispatch
@@ -34,14 +32,15 @@ def _str_is_valid_number(col, number_re):
 
 @_str_is_valid_number.specialize("pandas", argument_type="Column")
 def _str_is_valid_number_pandas(col, number_re):
-    if not col.str.match(number_re, na=False).all():
+    if not col.fillna("").str.match(number_re, na=False).all():
         raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.")
     return True
 
 
 @_str_is_valid_number.specialize("polars", argument_type="Column")
 def _str_is_valid_number_polars(col, number_re):
-    if not col.str.contains(number_re.pattern).all():
+    # pattern = re.sub(r'\s+', '', number_re.pattern)
+    if not col.fill_null("").str.contains(number_re.pattern, literal=False).all():
         raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.")
     return True
 

From 3e6dea1b6bd159826da9351ef453a72981cbbd0e Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Wed, 17 Dec 2025 12:13:39 +0100
Subject: [PATCH 12/23] Improved tests

---
 skrub/_to_float.py           | 22 ++++------
 skrub/tests/test_to_float.py | 82 +++++++++++++++++++-----------------
 2 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 6c6010b01..cbf07fa77 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -12,17 +12,10 @@ def _build_number_regex(decimal, thousand):
     t = re.escape(thousand)
 
     integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)"
-    decimal_part = rf"(?:{d}\d+)?"
+    decimal_part = rf"{d}\d+"
     scientific = r"(?:[eE][+-]?\d+)?"
-    return rf"^\(?[+-]?(?:{integer}{decimal_part}{scientific})?\)?$"
-    # return rf"""
-    #     ^
-    #     \(?
-    #     [+-]?
-    #     (?:{integer}{decimal_part}{scientific})?
-    #     \)?
-    #     $
-    # """
+    number = rf"(?:{integer}(?:{decimal_part})?|{decimal_part})"
+    return rf"^\(?[+-]?(?:{number}{scientific})?\)?$"
 
 
 @dispatch
@@ -39,7 +32,6 @@ def _str_is_valid_number_pandas(col, number_re):
 
 @_str_is_valid_number.specialize("polars", argument_type="Column")
 def _str_is_valid_number_polars(col, number_re):
-    # pattern = re.sub(r'\s+', '', number_re.pattern)
     if not col.fill_null("").str.contains(number_re.pattern, literal=False).all():
         raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.")
     return True
@@ -60,7 +52,7 @@ def _str_replace_pandas(col, decimal, thousand):
 @_str_replace.specialize("polars", argument_type="Column")
 def _str_replace_polars(col, decimal, thousand):
     col = col.str.replace_all(r"^\((.*)\)$", r"-$1")
-    col = col.str.replace_all(thousand, "")
+    col = col.str.replace_all(thousand, "", literal=True)
     return col.str.replace_all(f"[{decimal}]", ".")
 
 
@@ -259,7 +251,7 @@ class ToFloat(SingleColumnTransformer):
     def __init__(self, decimal=".", thousand=None):
         super().__init__()
         self.decimal = decimal
-        self.thousand = thousand if thousand is not None else ""
+        self.thousand = thousand
 
     def fit_transform(self, column, y=None):
         """Fit the encoder and transform a column.
@@ -279,6 +271,10 @@ def fit_transform(self, column, y=None):
         """
         del y
         self.all_outputs_ = [sbd.name(column)]
+        if self.thousand is None:
+            self.thousand = ""  # No thousand separator
+        if self.decimal is None:
+            raise ValueError("The decimal separator cannot be None.")
         if self.thousand == self.decimal:
             raise ValueError("The thousand and decimal separators must differ.")
 
diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py
index a7498e0a6..5d014d313 100644
--- a/skrub/tests/test_to_float.py
+++ b/skrub/tests/test_to_float.py
@@ -5,7 +5,7 @@
 from skrub._single_column_transformer import RejectColumn
 from skrub._to_categorical import ToCategorical
 from skrub._to_datetime import ToDatetime
-from skrub._to_float import ToFloat, _str_replace
+from skrub._to_float import ToFloat
 from skrub.conftest import skip_polars_installed_without_pyarrow
 
 
@@ -46,46 +46,52 @@ def test_rejected_columns(df_module):
 
 
 @pytest.mark.parametrize(
-    "input_str, expected_float, decimal",
+    "input_str, expected_float, decimal, thousand",
     [
-        ("1,234.56", 1234.56, "."),
-        ("1.234,56", 1234.56, ","),
-        ("1 234,56", 1234.56, ","),
-        ("1234.56", 1234.56, "."),
-        ("1234,56", 1234.56, ","),
-        ("1,234,567.89", 1234567.89, "."),
-        ("1.234.567,89", 1234567.89, ","),
-        ("1 234 567,89", 1234567.89, ","),
-        ("1'234'567.89", 1234567.89, "."),
-        ("1.23e+4", 12300.0, "."),
-        ("1.23E+4", 12300.0, "."),
-        ("1,23e+4", 12300.0, ","),
-        ("1,23E+4", 12300.0, ","),
-        ("-1,234.56", -1234.56, "."),
-        ("-1.234,56", -1234.56, ","),
-        ("(1,234.56)", -1234.56, "."),
-        ("(1.234,56)", -1234.56, ","),
-        ("1,23,456.78", 123456.78, "."),
-        ("12,3456.78", 123456.78, "."),
-        (".56", 0.56, "."),
-        (",56", 0.56, ","),
+        # valid numbers
+        ("1,234.56", 1234.56, ".", ","),
+        ("1.234,56", 1234.56, ",", "."),
+        ("1 234,56", 1234.56, ",", " "),
+        ("1234.56", 1234.56, ".", None),
+        ("1234,56", 1234.56, ",", None),
+        ("1,234,567.89", 1234567.89, ".", ","),
+        ("1.234.567,89", 1234567.89, ",", "."),
+        ("1 234 567,89", 1234567.89, ",", " "),
+        ("1'234'567.89", 1234567.89, ".", "'"),
+        ("1.23e+4", 12300.0, ".", None),
+        ("1.23E+4", 12300.0, ".", None),
+        ("-1,234.56", -1234.56, ".", ","),
+        ("(1,234.56)", -1234.56, ".", ","),
+        (".56", 0.56, ".", None),
+        (",56", 0.56, ",", None),
+        ("56", 56.0, ".", None),
     ],
 )
-def test_number_parsing(input_str, expected_float, decimal, df_module):
+def test_number_parsing_valid(input_str, expected_float, decimal, thousand, df_module):
     column = df_module.make_column("col", [input_str])
-    result = ToFloat(decimal=decimal).fit_transform(column)
+    result = ToFloat(decimal=decimal, thousand=thousand).fit_transform(column)
+    assert np.allclose(result[0], expected_float)
 
-    np.allclose(result[0], expected_float)
 
-
-def test_str_replace(df_module):
-    s = df_module.make_column("x", ["1,234.56", "7.890,12", "3 456,78", "9'012.34"])
-    result_dot = _str_replace(s, decimal=".")
-
-    expected_dot = df_module.make_column(
-        "x", ["1234.56", "7890.12", "3456.78", "9012.34"]
-    )
-    np.all(sbd.to_list(result_dot) == sbd.to_list(expected_dot))
-
-    with pytest.raises(TypeError):
-        _str_replace([1, 2, 3], decimal=".")
+@pytest.mark.parametrize(
+    "input_str, decimal, thousand",
+    [
+        # invalid grouping
+        ("1,23,456.78", ".", ","),
+        ("1.2.3.4", ".", None),
+        ("1.2.3.4,0", ",", "."),
+        ("12,3456.78", ".", ","),
+        ("1 234,567.34", ".", ","),
+        ("1'234,567.34", ".", ","),
+        ("1'234'234,567.34", ",", "'"),
+        ("123.45.67", ".", None),
+        ("1,,234", ".", ","),
+        ("1.23,45", ".", ","),
+        # decimal == thousand
+        ("123,456,789", ",", ","),
+    ],
+)
+def test_number_parsing_invalid(input_str, decimal, thousand, df_module):
+    column = df_module.make_column("col", [input_str])
+    with pytest.raises((RejectColumn, ValueError)):
+        ToFloat(decimal=decimal, thousand=thousand).fit_transform(column)

From f8e63a63288219b87997d6c9bcb620e7237b4729 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Wed, 17 Dec 2025 13:14:22 +0100
Subject: [PATCH 13/23] Improving the docstrings and documentation

---
 .../feature_engineering_numerical.rst         | 24 +++++++++++--------
 skrub/_to_float.py                            | 10 +++++---
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
index 535883d93..14ef3d888 100644
--- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -32,9 +32,13 @@ The |ToFloat| transformer provides:
 - **Automatic conversion to 32-bit floating-point values (`float32`).**
   This dtype is lightweight and fully supported by scikit-learn estimators.
 
-- **Automatic parsing of decimal separators**, regardless of locale:
-  - ``.`` or ``,`` can be used as decimal point
-  - thousands separators (``.``, ``,``, space, apostrophe) are removed automatically
+- **Automatic parsing of decimal and thousands separators**, regardless of locale:
+  - The decimal separator must be specified explicitly and can be either ``.`` or ``,``
+  - The thousands separator can be one of ``.``, ``,``, space (``" "``), apostrophe (``'``),
+  or None (no thousands separator)
+  - The transformer supports integers, decimals (including leading-decimal forms such as .56 or ,56), scientific notation
+  and negative numbers (including parentheses)
+  - Decimal and thousands separators must be different characters
 
 - **Parentheses interpreted as negative numbers**, a common format in financial datasets:
   - ``(1,234.56)`` → ``-1234.56``
@@ -54,9 +58,9 @@ How to use |ToFloat|
 --------------------
 The |ToFloat| transformer must be applied to individual columns. It behaves like
 a standard scikit-learn transformer.
-Each column is expected to use a single decimal separator, which is
-specified through the ``decimal`` parameter. If this parameter is not provided,
-the default decimal separator is ``'.'``.
+Each column is expected to use a single decimals and thousands separator, which is
+specified through the ``decimal`` and ``thousand`` parameter. If this parameter is not provided,
+the default decimal separators are ``'.'`` and ``None``.
 
 During ``fit``, |ToFloat| attempts to convert all values in the column to
 numeric values after automatically removing other possible thousands separators
@@ -81,11 +85,11 @@ Parsing numeric-formatted strings:
 Name: x, dtype: float32
 
 Locale-dependent decimal separators can be handled by specifying the
-``decimal`` parameter. Here we use comma as decimal separator, and
-remove spaces and apostrophes as thousands separators:
+``decimal`` and ``thousand`` parameter. Here we use comma as decimal separator, and
+a spaces as thousands separators:
 
->>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
->>> ToFloat(decimal=",").fit_transform(s)
+>>> s = pd.Series(["4 567,89", "1 234 567,89"], name="x")
+>>> ToFloat(decimal=",", thousand=" ").fit_transform(s)
 0    4567.8...
 1    4567.8...
 Name: x, dtype: float32
diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index cbf07fa77..a58f4046d 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -80,6 +80,10 @@ class ToFloat(SingleColumnTransformer):
         Character to recognize as the decimal separator when converting from
         strings to floats. Other possible decimal separators are removed from
         the strings before conversion.
+    thousand : str or None, default=None
+    Character used as thousands separator. Supported values are ``"."``,
+    ``,``, space (``" "``), apostrophe (``"'"``), or ``None`` (no thousands
+    separator). The decimal and thousands separators must differ.
 
     Examples
     --------
@@ -240,9 +244,9 @@ class ToFloat(SingleColumnTransformer):
     1    12300.0
     Name: x, dtype: float32
 
-    Space or apostrophe as thousand separator
-    >>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
-    >>> ToFloat(decimal=",").fit_transform(s)
+    Space as thousand separator
+    >>> s = pd.Series(["4 567,89", "1 234 567,89"], name="x")
+    >>> ToFloat(decimal=",", thousand=" ").fit_transform(s)
     0    4567.8...
     1    4567.8...
     Name: x, dtype: float32

From 50d9b4746d78a9a6329d55288c1eb62693059531 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Wed, 17 Dec 2025 13:28:31 +0100
Subject: [PATCH 14/23] Improving documentation

---
 .../feature_engineering_numerical.rst                     | 4 ++--
 skrub/_to_float.py                                        | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
index 14ef3d888..0f13e0ee1 100644
--- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -88,10 +88,10 @@ Locale-dependent decimal separators can be handled by specifying the
 ``decimal`` and ``thousand`` parameter. Here we use comma as decimal separator, and
 a spaces as thousands separators:
 
->>> s = pd.Series(["4 567,89", "1 234 567,89"], name="x")
+>>> s = pd.Series(["4 567,89", "12 567,89"], name="x")
 >>> ToFloat(decimal=",", thousand=" ").fit_transform(s)
 0    4567.8...
-1    4567.8...
+1    12567.8...
 Name: x, dtype: float32
 
 Parentheses interpreted as negative numbers:
diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index a58f4046d..a99be8a99 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -231,7 +231,7 @@ class ToFloat(SingleColumnTransformer):
 
     Handling parentheses around negative numbers
     >>> s = pd.Series(["-1,234.56", "1,234.56", "(1,234.56)"], name='parens')
-    >>> to_float.fit_transform(s)
+    >>> ToFloat(decimal=".", thousand=",").fit_transform(s)
     0   -1234.5...
     1    1234.5...
     2   -1234.5...
@@ -245,10 +245,10 @@ class ToFloat(SingleColumnTransformer):
     Name: x, dtype: float32
 
     Space as thousand separator
-    >>> s = pd.Series(["4 567,89", "1 234 567,89"], name="x")
-    >>> ToFloat(decimal=",", thousand=" ").fit_transform(s)
+    >>> s = pd.Series(["4 567,89", "12 567,89"], name="x")
+    >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) # doctest: +ELLIPSIS
     0    4567.8...
-    1    4567.8...
+    1    12567.8...
     Name: x, dtype: float32
     """  # noqa: E501
 

From 36d1d8f4e1565cdc60ab9e6fc830829554b8d6fb Mon Sep 17 00:00:00 2001
From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:19:13 +0100
Subject: [PATCH 15/23] Update
 doc/modules/column_level_featurizing/feature_engineering_numerical.rst

Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
---
 .../feature_engineering_numerical.rst                       | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
index 0f13e0ee1..1274fa8f0 100644
--- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -37,12 +37,10 @@ The |ToFloat| transformer provides:
   - The thousands separator can be one of ``.``, ``,``, space (``" "``), apostrophe (``'``),
   or None (no thousands separator)
   - The transformer supports integers, decimals (including leading-decimal forms such as .56 or ,56), scientific notation
-  and negative numbers (including parentheses)
+  and negative numbers
+  - Numbers in parentheses are interpreted as negative numbers (``(1,234.56)`` → ``-1234.56``). This format is more common in financial datasets.
   - Decimal and thousands separators must be different characters
 
-- **Parentheses interpreted as negative numbers**, a common format in financial datasets:
-  - ``(1,234.56)`` → ``-1234.56``
-
 - **Scientific notation parsing** (e.g. ``1.23e+4``)
 
 - **Graceful handling of invalid or non-numeric values during transform**:

From 0f149f1e312af1f308ca9b8cdf8bbf940f9496e1 Mon Sep 17 00:00:00 2001
From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:19:31 +0100
Subject: [PATCH 16/23] Update
 doc/modules/column_level_featurizing/feature_engineering_numerical.rst

Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
---
 .../feature_engineering_numerical.rst                | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
index 1274fa8f0..8abfc76f6 100644
--- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -54,11 +54,13 @@ scikit-learn transformer and is fully compatible with pipelines.
 
 How to use |ToFloat|
 --------------------
-The |ToFloat| transformer must be applied to individual columns. It behaves like
-a standard scikit-learn transformer.
-Each column is expected to use a single decimals and thousands separator, which is
-specified through the ``decimal`` and ``thousand`` parameter. If this parameter is not provided,
-the default decimal separators are ``'.'`` and ``None``.
+The |ToFloat| transformer must be applied to individual columns, and it behaves
+like a standard scikit-learn transformer. 
+|ToFloat| requires a ``decimal`` and a ``thousands`` separator, which are ``'.'`` and 
+``None`` (no thousands separator) by default.
+Each column is expected to use a single separator for decimals, and one for thousands: 
+if any characters other than the provided selectors are encountered in the column, it will not 
+be converted. 
 
 During ``fit``, |ToFloat| attempts to convert all values in the column to
 numeric values after automatically removing other possible thousands separators

From 415aec18b6e1b9febe6bcef39a882b6549f0cfb1 Mon Sep 17 00:00:00 2001
From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:20:06 +0100
Subject: [PATCH 17/23] Update
 doc/modules/column_level_featurizing/feature_engineering_numerical.rst

Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
---
 .../column_level_featurizing/feature_engineering_numerical.rst  | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
index 8abfc76f6..1ffa4b19b 100644
--- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -86,7 +86,7 @@ Name: x, dtype: float32
 
 Locale-dependent decimal separators can be handled by specifying the
 ``decimal`` and ``thousand`` parameter. Here we use comma as decimal separator, and
-a spaces as thousands separators:
+a space as thousands separator:
 
 >>> s = pd.Series(["4 567,89", "12 567,89"], name="x")
 >>> ToFloat(decimal=",", thousand=" ").fit_transform(s)

From a19e149a4a07e0108b16b48f9c8a2fba6ecd270e Mon Sep 17 00:00:00 2001
From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:23:21 +0100
Subject: [PATCH 18/23] Update skrub/_to_float.py

Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
---
 skrub/_to_float.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index a99be8a99..a6d95d704 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -81,9 +81,9 @@ class ToFloat(SingleColumnTransformer):
         strings to floats. Other possible decimal separators are removed from
         the strings before conversion.
     thousand : str or None, default=None
-    Character used as thousands separator. Supported values are ``"."``,
-    ``,``, space (``" "``), apostrophe (``"'"``), or ``None`` (no thousands
-    separator). The decimal and thousands separators must differ.
+        Character used as thousands separator. Supported values are ``"."``,
+        ``,``, space (``" "``), apostrophe (``"'"``), or ``None`` (no thousands
+        separator). The decimal and thousands separators must differ.
 
     Examples
     --------

From 1754d073600c867a1be5e4e44981296bea031446 Mon Sep 17 00:00:00 2001
From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:24:15 +0100
Subject: [PATCH 19/23] Update skrub/_to_float.py

Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
---
 skrub/_to_float.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index a6d95d704..965d2a3f9 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -229,7 +229,8 @@ class ToFloat(SingleColumnTransformer):
     >>> to_float.fit_transform(s) is s
     True
 
-    Handling parentheses around negative numbers
+    Negative numbers represented using parentheses are converted
+    so they use "-" instead.
     >>> s = pd.Series(["-1,234.56", "1,234.56", "(1,234.56)"], name='parens')
     >>> ToFloat(decimal=".", thousand=",").fit_transform(s)
     0   -1234.5...

From db44c3e0db8d7d624cfa08133b29b7b3319ab8cd Mon Sep 17 00:00:00 2001
From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:25:16 +0100
Subject: [PATCH 20/23] Update skrub/_to_float.py

Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
---
 skrub/_to_float.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 965d2a3f9..c9d45ea3f 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -238,7 +238,7 @@ class ToFloat(SingleColumnTransformer):
     2   -1234.5...
     dtype: float32
 
-    Scientific notation
+    Numbers that use scientific notation are converted:
     >>> s = pd.Series(["1.23e+4", "1.23E+4"], name="x")
     >>> ToFloat(decimal=".").fit_transform(s)
     0    12300.0

From 0b71c2611bf05a28b10acdcd0e034b87a5ea1078 Mon Sep 17 00:00:00 2001
From: gabrielapgomezji <147144881+gabrielapgomezji@users.noreply.github.com>
Date: Wed, 17 Dec 2025 16:26:03 +0100
Subject: [PATCH 21/23] Update skrub/_to_float.py

Co-authored-by: Riccardo Cappuzzo <7548232+rcap107@users.noreply.github.com>
---
 skrub/_to_float.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index c9d45ea3f..b0f4ef4df 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -245,7 +245,7 @@ class ToFloat(SingleColumnTransformer):
     1    12300.0
     Name: x, dtype: float32
 
-    Space as thousand separator
+    It is possible to specify the thousands separator, e.g., to use " "
     >>> s = pd.Series(["4 567,89", "12 567,89"], name="x")
     >>> ToFloat(decimal=",", thousand=" ").fit_transform(s) # doctest: +ELLIPSIS
     0    4567.8...

From 94435da87c421b5a0b8405c9a9fb92f874d959a1 Mon Sep 17 00:00:00 2001
From: GOMEZ JIMENEZ Gabriela <gabriela.gomez-jimenez@inria.fr>
Date: Mon, 19 Jan 2026 14:52:00 +0100
Subject: [PATCH 22/23] WIP

---
 skrub/_to_float.py           | 44 +++++++++++++++++++++++++++++++++---
 skrub/tests/test_to_float.py | 31 +++++++++++++++++++++++--
 2 files changed, 70 insertions(+), 5 deletions(-)

diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index b0f4ef4df..5d39d88af 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -8,13 +8,35 @@
 
 
 def _build_number_regex(decimal, thousand):
-    d = re.escape(decimal)
-    t = re.escape(thousand)
-
+    # Escape decimal and thousand separators to use in regex
+    d = re.escape(decimal)  # e.g., '.' → '\.'; ',' is returned unchanged
+    t = re.escape(thousand)  # e.g., '.' → '\.', ' ' → '\ '; ',' is unchanged
+
+    # Matches integer parts:
+    # Either:
+    #   - one or more digits without thousand separators: \d+
+    #   - or digits grouped by thousand separators: \d{1,3}(?:{t}\d{3})+
+    #     e.g., '1,234' or '12,345,678'
     integer = rf"(?:\d+|\d{{1,3}}(?:{t}\d{{3}})+)"
+
+    # Matches decimal part after the decimal separator
+    # e.g., '.456' or ',456' depending on locale
     decimal_part = rf"{d}\d+"
+
+    # Matches optional scientific notation
+    # e.g., 'e10', 'E-5', 'e+3'
     scientific = r"(?:[eE][+-]?\d+)?"
+
+    # Full number can be:
+    #   - integer with optional decimal part
+    #   - or only decimal part (like '.5')
     number = rf"(?:{integer}(?:{decimal_part})?|{decimal_part})"
+
+    # Final regex:
+    #   - optional parentheses around the number: \(? ... \)?
+    #   - optional leading + or - sign: [+-]?
+    #   - `number` then optional `scientific`; this group is itself optional
+    # Anchored to start (^) and end ($) of string; the empty string also matches
     return rf"^\(?[+-]?(?:{number}{scientific})?\)?$"
 
 
@@ -25,6 +47,10 @@ def _str_is_valid_number(col, number_re):
 
 @_str_is_valid_number.specialize("pandas", argument_type="Column")
 def _str_is_valid_number_pandas(col, number_re):
+    # Check if all values in the column match the number regex.
+    # - Fill NaN values with empty string to avoid match errors.
+    # - Use `str.match` with `na=False` to treat empty/missing values as non-matching.
+    # - If any value does not match, raise RejectColumn with a descriptive message.
     if not col.fillna("").str.match(number_re, na=False).all():
         raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.")
     return True
@@ -32,6 +58,10 @@ def _str_is_valid_number_pandas(col, number_re):
 
 @_str_is_valid_number.specialize("polars", argument_type="Column")
 def _str_is_valid_number_polars(col, number_re):
+    # Check if all values in the column match the number regex.
+    # - Fill NaN values with empty string to avoid match errors.
+    # - Use `str.match` with `na=False` to treat empty/missing values as non-matching.
+    # - If any value does not match, raise RejectColumn with a descriptive message.
     if not col.fill_null("").str.contains(number_re.pattern, literal=False).all():
         raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.")
     return True
@@ -44,15 +74,23 @@ def _str_replace(col, strict=True):
 
 @_str_replace.specialize("pandas", argument_type="Column")
 def _str_replace_pandas(col, decimal, thousand):
+    # Replace parentheses around numbers with a leading minus sign
+    # e.g., "(123.45)" → "-123.45"
     col = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True)
+    # Remove thousand separators
     col = col.str.replace(thousand, "", regex=False)
+    # Replace decimal separator with '.'
     return col.str.replace(decimal, ".", regex=False)
 
 
 @_str_replace.specialize("polars", argument_type="Column")
 def _str_replace_polars(col, decimal, thousand):
+    # Replace parentheses around numbers with a leading minus sign
+    # e.g., "(123.45)" → "-123.45"
     col = col.str.replace_all(r"^\((.*)\)$", r"-$1")
+    # Remove thousand separators
     col = col.str.replace_all(thousand, "", literal=True)
+    # Replace decimal separator with '.'
     return col.str.replace_all(f"[{decimal}]", ".")
 
 
diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py
index 5d014d313..a642ac5e7 100644
--- a/skrub/tests/test_to_float.py
+++ b/skrub/tests/test_to_float.py
@@ -87,11 +87,38 @@ def test_number_parsing_valid(input_str, expected_float, decimal, thousand, df_m
         ("123.45.67", ".", None),
         ("1,,234", ".", ","),
         ("1.23,45", ".", ","),
-        # decimal == thousand
-        ("123,456,789", ",", ","),
     ],
 )
 def test_number_parsing_invalid(input_str, decimal, thousand, df_module):
     column = df_module.make_column("col", [input_str])
     with pytest.raises((RejectColumn, ValueError)):
         ToFloat(decimal=decimal, thousand=thousand).fit_transform(column)
+
+
+@pytest.mark.parametrize(
+    "decimal, thousand",
+    [
+        # invalid because decimal and thousand are the same
+        (",", ","),
+        (".", "."),
+        # invalid because decimal is None
+        (None, ","),
+        (None, None),
+    ],
+)
+def test_invalid_parameters(decimal, thousand, df_module):
+    """
+    Test that ToFloat raises an exception if the parameters are invalid:
+    - decimal is None → ValueError
+    - thousand == decimal → ValueError
+    """
+    column = df_module.make_column("col", ["123", "456"])
+
+    if decimal is None:
+        with pytest.raises(ValueError, match="decimal separator cannot be None"):
+            ToFloat(decimal=decimal, thousand=thousand).fit_transform(column)
+    else:
+        with pytest.raises(
+            ValueError, match="thousand and decimal separators must differ"
+        ):
+            ToFloat(decimal=decimal, thousand=thousand).fit_transform(column)

From 095f403803d1a99c8d6fcf908a8402a5da858756 Mon Sep 17 00:00:00 2001
From: Riccardo Cappuzzo <riccardo.cappuzzo@gmail.com>
Date: Tue, 20 Jan 2026 11:55:09 +0100
Subject: [PATCH 23/23] Reverting changes and cleaning up history

---
 CHANGES.rst                                        |  6 +++---
 .../feature_engineering_numerical.rst              | 14 +++++++-------
 skrub/_table_vectorizer.py                         |  4 ++--
 skrub/_to_float.py                                 |  6 ++----
 4 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 8ad38c571..f7e00c83f 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -11,6 +11,9 @@ Ongoing Development
 
 New features
 ------------
+- :class:`ToFloat` has the parameters ``decimal`` and ``thousand`` to let the user specify the
+  decimal and thousands separators, and it also handles negative numbers indicated with parentheses.
+  :pr:`1772` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`.
 
 Changes
 -------
@@ -73,9 +76,6 @@ New features
 - :class:`TableReport` now includes the ``open_tab`` parameter, which lets the
   user select which tab should be opened when the ``TableReport`` is
   rendered. :pr:`1737` by :user:`Riccardo Cappuzzo<rcap107>`.
-- :class:`ToFloat32` has the parameter decimal to let the user specify whether they use ',' or '.'
-  as decimal separator and it also handles negative numbers indicated with parentheses.
-  :pr:`1772` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`.
 
 Changes
 -------
diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
index 1ffa4b19b..a1aae9eda 100644
--- a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
+++ b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -55,12 +55,12 @@ scikit-learn transformer and is fully compatible with pipelines.
 How to use |ToFloat|
 --------------------
 The |ToFloat| transformer must be applied to individual columns, and it behaves
-like a standard scikit-learn transformer. 
-|ToFloat| requires a ``decimal`` and a ``thousands`` separator, which are ``'.'`` and 
+like a standard scikit-learn transformer.
+|ToFloat| requires a ``decimal`` and a ``thousand`` separator, which are ``'.'`` and
 ``None`` (no thousands separator) by default.
-Each column is expected to use a single separator for decimals, and one for thousands: 
-if any characters other than the provided selectors are encountered in the column, it will not 
-be converted. 
+Each column is expected to use a single separator for decimals, and one for thousands:
+if any characters other than the provided separators are encountered in the column, it will not
+be converted.
 
 During ``fit``, |ToFloat| attempts to convert all values in the column to
 numeric values after automatically removing other possible thousands separators
@@ -97,7 +97,7 @@ Name: x, dtype: float32
 Parentheses interpreted as negative numbers:
 
 >>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg")
->>> ToFloat().fit_transform(s)
+>>> ToFloat(thousand=",").fit_transform(s)
 0   -1234.5...
 1   -1234.5...
 Name: neg, dtype: float32
@@ -116,7 +116,7 @@ Columns that cannot be converted are rejected during ``fit``:
 >>> ToFloat(decimal=".").fit_transform(s)
 Traceback (most recent call last):
     ...
-skrub._apply_to_cols.RejectColumn: Could not convert column 'x' to numbers.
+skrub._single_column_transformer.RejectColumn: Could not convert column 'x' to numbers.
 
 
 During ``transform``, invalid entries become ``NaN`` instead of raising an error:
diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 3e7ade92d..f3613a670 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -693,7 +693,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     We can inspect all the processing steps that were applied to a given column:
 
     >>> vectorizer.all_processing_steps_['B']
-    [CleanNullStrings(), DropUninformative(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat(), 'B_month': ToFloat(), ...}]
+    [CleanNullStrings(), DropUninformative(), ToDatetime(), DatetimeEncoder(), {'B_day': ToFloat(thousand=''), 'B_month': ToFloat(thousand=''), ...}]
 
     Note that as the encoder (``DatetimeEncoder()`` above) produces multiple
     columns, the last processing step is not described by a single transformer
@@ -768,7 +768,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     ``ToDatetime()``:
 
     >>> vectorizer.all_processing_steps_
-    {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), DropUninformative(), ToFloat(), PassThrough(), {'C': ToFloat()}]}
+    {'A': [Drop()], 'B': [OrdinalEncoder()], 'C': [CleanNullStrings(), DropUninformative(), ToFloat(thousand=''), PassThrough(), {'C': ToFloat(thousand='')}]}
 
     Specifying several ``specific_transformers`` for the same column is not allowed.
 
diff --git a/skrub/_to_float.py b/skrub/_to_float.py
index 5d39d88af..b34463274 100644
--- a/skrub/_to_float.py
+++ b/skrub/_to_float.py
@@ -52,7 +52,7 @@ def _str_is_valid_number_pandas(col, number_re):
     # - Use `str.match` with `na=False` to treat empty/missing values as non-matching.
     # - If any value does not match, raise RejectColumn with a descriptive message.
     if not col.fillna("").str.match(number_re, na=False).all():
-        raise RejectColumn(f"The pattern could not match the column {sbd.name(col)!r}.")
+        raise RejectColumn(f"Could not convert column {sbd.name(col)!r} to numbers.")
     return True
 
 
@@ -294,7 +294,7 @@ class ToFloat(SingleColumnTransformer):
     def __init__(self, decimal=".", thousand=None):
         super().__init__()
         self.decimal = decimal
-        self.thousand = thousand
+        self.thousand = "" if thousand is None else thousand
 
     def fit_transform(self, column, y=None):
         """Fit the encoder and transform a column.
@@ -314,8 +314,6 @@ def fit_transform(self, column, y=None):
         """
         del y
         self.all_outputs_ = [sbd.name(column)]
-        if self.thousand is None:
-            self.thousand = ""  # No thousand separator
         if self.decimal is None:
             raise ValueError("The decimal separator cannot be None.")
         if self.thousand == self.decimal: