skrub-data · gabrielapgomezji · Nov 24, 2025 · Nov 24, 2025 · Dec 1, 2025 · Dec 1, 2025
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -34,6 +34,9 @@ New features
   user select which tab should be opened when the ``TableReport`` is
   rendered. :pr:`1737` by :user:`Riccardo Cappuzzo<rcap107>`.
 
+- :class:`ToFloat` has a new ``decimal`` parameter that lets the user specify
+  whether ``','`` or ``'.'`` is used as the decimal separator. It also handles
+  negative numbers written in parentheses.
+  :pr:`1772` by :user:`Gabriela Gómez Jiménez <gabrielapgomezji>`.
 
 Changes
 -------

diff --git a/doc/modules/column_level_featurizing/feature_engineering_numerical.rst b/doc/modules/column_level_featurizing/feature_engineering_numerical.rst
@@ -0,0 +1,112 @@
+.. |ToFloat| replace:: :class:`~skrub.ToFloat`
+.. |TableVectorizer| replace:: :class:`~skrub.TableVectorizer`
+.. |Cleaner| replace:: :class:`~skrub.Cleaner`
+
+.. _user_guide_feature_engineering_numeric_to_float:
+
+Converting heterogeneous numeric values to uniform float32
+==========================================================
+
+Many tabular datasets contain numeric information stored as strings, mixed
+representations, locale-specific formats, or other non-standard encodings.
+Common issues include:
+
+- Thousands separators (``1,234.56`` or ``1 234,56``)
+- Use of apostrophes as separators (``4'567.89``)
+- Negative numbers encoded inside parentheses (``(1,234.56)``)
+- String columns that contain mostly numeric values, but with occasional invalid entries
+
+To provide consistent numeric behavior, skrub includes the |ToFloat| transformer,
+which **standardizes all numeric-like columns** to ``float32`` and handles a wide
+range of real-world formatting issues automatically.
+
+The |ToFloat| transformer is used internally by both the |Cleaner| class and the
+|TableVectorizer| to guarantee that downstream estimators receive clean and
+uniform numeric data.
+
+What |ToFloat| does
+-------------------
+
+The |ToFloat| transformer provides:
+
+- **Automatic conversion to 32-bit floating-point values** (``float32``).
+  This dtype is lightweight and fully supported by scikit-learn estimators.
+
+- **Configurable decimal separator**, set with the ``decimal`` parameter:
+
+  - ``.`` (the default) or ``,`` can be used as the decimal point
+  - the remaining separators (``.``, ``,``, space, apostrophe) are treated as
+    thousands separators and removed automatically
+
+- **Parentheses interpreted as negative numbers**, a common format in financial datasets:
+
+  - ``(1,234.56)`` → ``-1234.56``
+
+- **Scientific notation parsing** (e.g. ``1.23e+4``)
+
+- **Graceful handling of invalid or non-numeric values**:
+
+  - During ``fit``: non-convertible values raise a ``RejectColumn`` exception
+  - During ``transform``: invalid entries become ``NaN`` instead of failing
+
+- **Rejection of categorical and datetime columns**, which should not be cast to numeric.
+
+As with all skrub transformers, |ToFloat| behaves like a standard
+scikit-learn transformer and is fully compatible with pipelines.
+
+Examples
+--------
+
+Parsing numeric-formatted strings:
+
+>>> import pandas as pd
+>>> from skrub import ToFloat
+>>> s = pd.Series(['1.1', None, '3.3'], name='x')
+>>> ToFloat().fit_transform(s)
+0    1.1
+1    NaN
+2    3.3
+Name: x, dtype: float32
+
+Handling locale-dependent decimal separators with the ``decimal`` parameter:
+
+>>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
+>>> ToFloat(decimal=",").fit_transform(s)   # doctest: +SKIP
+0    4567.89
+1    4567.89
+Name: x, dtype: float32
+
+Parentheses interpreted as negative numbers:
+
+>>> s = pd.Series(["-1,234.56", "(1,234.56)"], name="neg")
+>>> ToFloat().fit_transform(s)   # doctest: +SKIP
+0   -1234.56
+1   -1234.56
+Name: neg, dtype: float32
+
+Scientific notation:
+
+>>> s = pd.Series(["1.23e+4", "1.23E+4"])
+>>> ToFloat(decimal=".").fit_transform(s)
+0    12300.0
+1    12300.0
+dtype: float32
+
+Columns that cannot be converted are rejected during ``fit``:
+
+>>> s = pd.Series(['1.1', 'hello'], name='x')
+>>> ToFloat(decimal=".").fit_transform(s)
+Traceback (most recent call last):
+    ...
+skrub._apply_to_cols.RejectColumn: Could not convert column 'x' to numbers.
+
+How |ToFloat| is used in skrub
+------------------------------
+
+The |ToFloat| transformer is used internally in:
+
+- the |Cleaner|, to normalize all numeric-like columns before modeling
+- the |TableVectorizer|, ensuring a consistent numeric dtype across all numeric features
+
+This makes |ToFloat| a core building block of skrub’s handling of heterogeneous
+tabular data.
+
+``ToFloat`` ensures that downstream machine-learning models receive numeric data
+that is clean, consistent, lightweight, and free of locale-specific quirks or
+string-encoded values.
diff --git a/skrub/_to_float.py b/skrub/_to_float.py
@@ -1,8 +1,30 @@
 from . import _dataframe as sbd
 from ._apply_to_cols import RejectColumn, SingleColumnTransformer
+from ._dispatch import dispatch, raise_dispatch_unregistered_type
 
 __all__ = ["ToFloat"]
 
+# Characters treated as separators in numeric strings. ``ToFloat.fit_transform``
+# copies this list, removes the configured decimal separator from the copy and
+# strips the remaining characters from the strings before conversion.
+POSSIBLE_SEPARATORS = [".", ",", "'", " "]
+
+
+@dispatch
+def _str_replace(col, pattern, decimal):
+    """Normalize numeric strings in ``col`` so they can be cast to float.
+
+    This is the fallback used when no dataframe-specific implementation is
+    registered for the type of ``col``; it only reports the unsupported type.
+    Its signature matches the pandas and polars specializations so that a call
+    made with ``decimal=...`` reaches the dispatch error instead of failing on
+    an unexpected keyword argument.
+
+    Parameters
+    ----------
+    col : Series
+        String column to normalize.
+    pattern : list of str
+        Separator characters to strip from every string.
+    decimal : str
+        Character to replace with ``'.'``.
+    """
+    raise_dispatch_unregistered_type(col, kind="Series")
+
+
+@_str_replace.specialize("pandas", argument_type="Column")
+def _str_replace_pandas(col, pattern, decimal):
+    """Rewrite the strings of a pandas column into a float-parsable form.
+
+    Parameters
+    ----------
+    col : pandas Series
+        String column to normalize.
+    pattern : list of str
+        Separator characters to strip from every string.
+    decimal : str
+        Character to replace with ``'.'``.
+
+    Returns
+    -------
+    pandas Series
+        The column after the three replacements below.
+    """
+    # A value fully wrapped in parentheses becomes the same value prefixed
+    # with a minus sign: "(1,234.56)" -> "-1,234.56".
+    unwrapped = col.str.replace(r"^\((.*)\)$", r"-\1", regex=True)
+    # Regex character class matching any of the separators to drop.
+    separator_class = f"[{''.join(pattern)}]"
+    stripped = unwrapped.str.replace(separator_class, "", regex=True)
+    # Literal (non-regex) replacement of the decimal separator.
+    return stripped.str.replace(decimal, ".", regex=False)
+
+
+@_str_replace.specialize("polars", argument_type="Column")
+def _str_replace_polars(col, pattern, decimal):
+    """Rewrite the strings of a polars column into a float-parsable form.
+
+    Parameters
+    ----------
+    col : polars Series
+        String column to normalize.
+    pattern : list of str
+        Separator characters to strip from every string.
+    decimal : str
+        Character to replace with ``'.'``.
+
+    Returns
+    -------
+    polars Series
+        The column after the three replacements below.
+    """
+    # A value fully wrapped in parentheses becomes the same value prefixed
+    # with a minus sign: "(1,234.56)" -> "-1,234.56".
+    unwrapped = col.str.replace_all(r"^\((.*)\)$", r"-$1")
+    # Regex character class matching any of the separators to drop.
+    separator_class = f"[{''.join(pattern)}]"
+    stripped = unwrapped.str.replace_all(separator_class, "")
+    # The decimal separator is matched through a one-character class.
+    return stripped.str.replace_all(f"[{decimal}]", ".")
+
 
 class ToFloat(SingleColumnTransformer):
     """
@@ -22,6 +44,13 @@ class ToFloat(SingleColumnTransformer):
     During ``transform``, entries for which conversion fails are replaced by
     null values.
 
+    Parameters
+    ----------
+    decimal : str, default='.'
+        Character to recognize as the decimal separator when converting from
+        strings to floats. The other supported separators (``.``, ``,``,
+        ``'`` and space) are treated as thousands separators and removed from
+        the strings before conversion.
+
     Examples
     --------
     >>> import pandas as pd
@@ -165,8 +194,34 @@ class ToFloat(SingleColumnTransformer):
     >>> s = pd.Series([1.1, None], dtype='float32')
     >>> to_float.fit_transform(s) is s
     True
+
+    Handling parentheses around negative numbers:
+
+    >>> s = pd.Series(["-1,234.56", "1,234.56", "(1,234.56)"], name='parens')
+    >>> to_float.fit_transform(s)
+    0   -1234.5...
+    1    1234.5...
+    2   -1234.5...
+    Name: parens, dtype: float32
+
+    Scientific notation
+    >>> s = pd.Series(["1.23e+4", "1.23E+4"], name="x")
+    >>> ToFloat(decimal=".").fit_transform(s)
+    0    12300.0
+    1    12300.0
+    Name: x, dtype: float32
+
+    Space or apostrophe as thousand separator
+    >>> s = pd.Series(["4 567,89", "4'567,89"], name="x")
+    >>> ToFloat(decimal=",").fit_transform(s)
+    0    4567.8...
+    1    4567.8...
+    Name: x, dtype: float32
     """  # noqa: E501
 
+    def __init__(self, decimal="."):
+        super().__init__()
+        # Stored as given; ``fit_transform`` reads it to decide which separator
+        # characters to strip and which one to turn into ".".
+        self.decimal = decimal
+
     def fit_transform(self, column, y=None):
         """Fit the encoder and transform a column.
 
@@ -191,6 +246,10 @@ def fit_transform(self, column, y=None):
                 f"with dtype '{sbd.dtype(column)}' to numbers."
             )
         try:
+            if sbd.is_string(column):
+                p = POSSIBLE_SEPARATORS.copy()
+                p.remove(self.decimal)
+                column = _str_replace(column, pattern=p, decimal=self.decimal)
             numeric = sbd.to_float32(column, strict=True)
             return numeric
         except Exception as e:

diff --git a/skrub/tests/test_to_float.py b/skrub/tests/test_to_float.py
@@ -43,3 +43,36 @@ def test_rejected_columns(df_module):
             ToFloat().fit_transform(col)
         to_float = ToFloat().fit(df_module.make_column("c", [1.1]))
         assert is_float32(df_module, to_float.transform(col))
+
+
+@pytest.mark.parametrize(
+    "input_str, expected_float, decimal",
+    [
+        ("1,234.56", 1234.56, "."),
+        ("1.234,56", 1234.56, ","),
+        ("1 234,56", 1234.56, ","),
+        ("1234.56", 1234.56, "."),
+        ("1234,56", 1234.56, ","),
+        ("1,234,567.89", 1234567.89, "."),
+        ("1.234.567,89", 1234567.89, ","),
+        ("1 234 567,89", 1234567.89, ","),
+        ("1'234'567.89", 1234567.89, "."),
+        ("1.23e+4", 12300.0, "."),
+        ("1.23E+4", 12300.0, "."),
+        ("1,23e+4", 12300.0, ","),
+        ("1,23E+4", 12300.0, ","),
+        ("-1,234.56", -1234.56, "."),
+        ("-1.234,56", -1234.56, ","),
+        ("(1,234.56)", -1234.56, "."),
+        ("(1.234,56)", -1234.56, ","),
+        ("1,23,456.78", 123456.78, "."),
+        ("12,3456.78", 123456.78, "."),
+        (".56", 0.56, "."),
+        (",56", 0.56, ","),
+    ],
+)
+def test_number_parsing(input_str, expected_float, decimal, df_module):
+    """Check that a formatted numeric string is parsed to the expected float."""
+    column = df_module.make_column("col", [input_str])
+    result = ToFloat(decimal=decimal).fit_transform(column)
+
+    # ``np.allclose`` only returns a bool; without ``assert`` a wrongly parsed
+    # value would never make this test fail.
+    assert np.allclose(result[0], expected_float)