Enhanced numeric.py to process hexadecimal, octal, and binary formats like 0x, 0o, 0b

Lzforevr · Lzforevr · commit e618573c9891 · 2024-10-02T23:28:40.000+08:00
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -62,7 +62,116 @@ def to_numeric(
 ):
     """
     Convert argument to a numeric type.
-    ...
+
+    The default return dtype is `float64` or `int64`
+    depending on the data supplied. Use the `downcast` parameter
+    to obtain other dtypes.
+
+    Please note that precision loss may occur if really large numbers
+    are passed in. Due to the internal limitations of `ndarray`, if
+    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
+    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
+    passed in, it is very likely they will be converted to float so that
+    they can be stored in an `ndarray`. These warnings apply similarly to
+    `Series` since it internally leverages `ndarray`.
+
+    Parameters
+    ----------
+    arg : scalar, list, tuple, 1-d array, or Series
+        Argument to be converted.
+
+    errors : {'raise', 'coerce'}, default 'raise'
+        - If 'raise', then invalid parsing will raise an exception.
+        - If 'coerce', then invalid parsing will be set as NaN.
+
+    downcast : str, default None
+        Can be 'integer', 'signed', 'unsigned', or 'float'.
+        If not None, and if the data has been successfully cast to a
+        numerical dtype (or if the data was numeric to begin with),
+        downcast that resulting data to the smallest numerical dtype
+        possible according to the following rules:
+
+        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
+        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
+        - 'float': smallest float dtype (min.: np.float32)
+
+        As this behaviour is separate from the core conversion to
+        numeric values, any errors raised during the downcasting
+        will be surfaced regardless of the value of the 'errors' input.
+
+        In addition, downcasting will only occur if the size
+        of the resulting data's dtype is strictly larger than
+        the dtype it is to be cast to, so if none of the dtypes
+        checked satisfy that specification, no downcasting will be
+        performed on the data.
+
+    dtype_backend : {'numpy_nullable', 'pyarrow'}
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). If not specified, the default behavior
+        is to not use nullable data types. If specified, the behavior
+        is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed object
+        * ``"pyarrow"``: returns with pyarrow-backed nullable object
+
+        .. versionadded:: 2.0
+
+    Returns
+    -------
+    ret
+        Numeric if parsing succeeded.
+        Return type depends on input.  Series if Series, otherwise ndarray.
+
+    See Also
+    --------
+    DataFrame.astype : Cast argument to a specified dtype.
+    to_datetime : Convert argument to datetime.
+    to_timedelta : Convert argument to timedelta.
+    numpy.ndarray.astype : Cast a numpy array to a specified type.
+    DataFrame.convert_dtypes : Convert dtypes.
+
+    Examples
+    --------
+    Take separate series and convert to numeric, coercing when told to
+
+    >>> s = pd.Series(["1.0", "2", -3])
+    >>> pd.to_numeric(s)
+    0    1.0
+    1    2.0
+    2   -3.0
+    dtype: float64
+    >>> pd.to_numeric(s, downcast="float")
+    0    1.0
+    1    2.0
+    2   -3.0
+    dtype: float32
+    >>> pd.to_numeric(s, downcast="signed")
+    0    1
+    1    2
+    2   -3
+    dtype: int8
+    >>> s = pd.Series(["apple", "1.0", "2", -3])
+    >>> pd.to_numeric(s, errors="coerce")
+    0    NaN
+    1    1.0
+    2    2.0
+    3   -3.0
+    dtype: float64
+
+    Downcasting of nullable integer and floating dtypes is supported:
+
+    >>> s = pd.Series([1, 2, 3], dtype="Int64")
+    >>> pd.to_numeric(s, downcast="integer")
+    0    1
+    1    2
+    2    3
+    dtype: Int8
+    >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
+    >>> pd.to_numeric(s, downcast="float")
+    0    1.0
+    1    2.1
+    2    3.0
+    dtype: Float32
     """
     if downcast not in (None, "integer", "signed", "unsigned", "float"):
         raise ValueError("invalid downcasting method provided")
@@ -99,6 +208,8 @@ def to_numeric(
     else:
         values = arg
 
+    # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
+    # save mask to reconstruct the full array after casting
     mask: npt.NDArray[np.bool_] | None = None
     if isinstance(values, BaseMaskedArray):
         mask = values._mask
@@ -109,7 +220,6 @@ def to_numeric(
         mask = values.isna()
         values = values.dropna().to_numpy()
     new_mask: np.ndarray | None = None
-
     if is_numeric_dtype(values_dtype):
         pass
     elif lib.is_np_dtype(values_dtype, "mM"):
@@ -122,7 +232,7 @@ def to_numeric(
             parsed_value = parse_numeric(x)
             if libmissing.checknull(parsed_values):
                 if errors == "raise":
-                    raise ValueError(f"Unable to parse string '{x}' at position {idx}")
+                    raise ValueError(f"Unable to parse string '{x}' at position{idx}")
                 elif errors == "coerce":
                     parsed_values.append(libmissing.NA)
                     new_mask.append(True)
@@ -135,6 +245,8 @@ def to_numeric(
         new_mask = np.array(new_mask, dtype=bool)
 
     if new_mask is not None:
+        # Remove the unnecessary values; this is expected later anyway and
+        # enables downcasting
         values = values[~new_mask]
     elif (
             dtype_backend is not lib.no_default
@@ -144,6 +256,8 @@ def to_numeric(
     ):
         new_mask = np.zeros(values.shape, dtype=np.bool_)
 
+    # attempt downcast only if the data has been successfully converted
+    # to a numerical dtype and if a downcast method has been specified
     if downcast is not None and is_numeric_dtype(values.dtype):
         typecodes: str | None = None
 
@@ -153,23 +267,30 @@ def to_numeric(
             typecodes = np.typecodes["UnsignedInteger"]
         elif downcast == "float":
             typecodes = np.typecodes["Float"]
+
+            # pandas support goes only to np.float32,
+            # as float dtypes smaller than that are
+            # extremely rare and not well supported
             float_32_char = np.dtype(np.float32).char
             float_32_ind = typecodes.index(float_32_char)
             typecodes = typecodes[float_32_ind:]
 
         if typecodes is not None:
+            # from smallest to largest
             for typecode in typecodes:
                 dtype = np.dtype(typecode)
                 if dtype.itemsize <= values.dtype.itemsize:
-                    # Only downcast if values are all integers
-                    if downcast in ("integer", "signed", "unsigned") and not np.isin(np.mod(values, 1), 0).all():
-                        continue  # Skip downcasting if there are any float values
                     values = maybe_downcast_numeric(values, dtype)
+
+                    # successful conversion
                     if values.dtype == dtype:
                         break
 
+    # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
+    # masked array
     if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
         if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
+            # GH 52588
             mask = new_mask
         else:
             mask = mask.copy()
@@ -199,18 +320,12 @@ def to_numeric(
     if is_series:
         return arg._constructor(values, index=arg.index, name=arg.name)
     elif is_index:
+        # because we want to coerce to numeric if possible,
+        # do not use _shallow_copy
         from pandas import Index
+
         return Index(values, name=arg.name)
     elif is_scalars:
         return values[0]
     else:
         return values
-
-
-if __name__ == "__main__":
-    import numpy as np
-
-    test_data = ["0x1A", "0b1010", "0o17", "25", "3.14", "invalid"]
-    result = to_numeric(test_data, errors="coerce")
-    print("Inputs:", test_data)
-    print("ParseResult:", result)