Enhanced numeric.py to process hexadecimal,decimal,binary formats like 0x,0o,0b

Lzforevr · Lzforevr · commit ec07f931a103 · 2024-10-02T23:23:07.000+08:00
diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py
@@ -62,116 +62,7 @@ def to_numeric(
 ):
     """
     Convert argument to a numeric type.
-
-    The default return dtype is `float64` or `int64`
-    depending on the data supplied. Use the `downcast` parameter
-    to obtain other dtypes.
-
-    Please note that precision loss may occur if really large numbers
-    are passed in. Due to the internal limitations of `ndarray`, if
-    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
-    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
-    passed in, it is very likely they will be converted to float so that
-    they can be stored in an `ndarray`. These warnings apply similarly to
-    `Series` since it internally leverages `ndarray`.
-
-    Parameters
-    ----------
-    arg : scalar, list, tuple, 1-d array, or Series
-        Argument to be converted.
-
-    errors : {'raise', 'coerce'}, default 'raise'
-        - If 'raise', then invalid parsing will raise an exception.
-        - If 'coerce', then invalid parsing will be set as NaN.
-
-    downcast : str, default None
-        Can be 'integer', 'signed', 'unsigned', or 'float'.
-        If not None, and if the data has been successfully cast to a
-        numerical dtype (or if the data was numeric to begin with),
-        downcast that resulting data to the smallest numerical dtype
-        possible according to the following rules:
-
-        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
-        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
-        - 'float': smallest float dtype (min.: np.float32)
-
-        As this behaviour is separate from the core conversion to
-        numeric values, any errors raised during the downcasting
-        will be surfaced regardless of the value of the 'errors' input.
-
-        In addition, downcasting will only occur if the size
-        of the resulting data's dtype is strictly larger than
-        the dtype it is to be cast to, so if none of the dtypes
-        checked satisfy that specification, no downcasting will be
-        performed on the data.
-
-    dtype_backend : {'numpy_nullable', 'pyarrow'}
-        Back-end data type applied to the resultant :class:`DataFrame`
-        (still experimental). If not specified, the default behavior
-        is to not use nullable data types. If specified, the behavior
-        is as follows:
-
-        * ``"numpy_nullable"``: returns nullable-dtype-backed object
-        * ``"pyarrow"``: returns with pyarrow-backed nullable object
-
-        .. versionadded:: 2.0
-
-    Returns
-    -------
-    ret
-        Numeric if parsing succeeded.
-        Return type depends on input.  Series if Series, otherwise ndarray.
-
-    See Also
-    --------
-    DataFrame.astype : Cast argument to a specified dtype.
-    to_datetime : Convert argument to datetime.
-    to_timedelta : Convert argument to timedelta.
-    numpy.ndarray.astype : Cast a numpy array to a specified type.
-    DataFrame.convert_dtypes : Convert dtypes.
-
-    Examples
-    --------
-    Take separate series and convert to numeric, coercing when told to
-
-    >>> s = pd.Series(["1.0", "2", -3])
-    >>> pd.to_numeric(s)
-    0    1.0
-    1    2.0
-    2   -3.0
-    dtype: float64
-    >>> pd.to_numeric(s, downcast="float")
-    0    1.0
-    1    2.0
-    2   -3.0
-    dtype: float32
-    >>> pd.to_numeric(s, downcast="signed")
-    0    1
-    1    2
-    2   -3
-    dtype: int8
-    >>> s = pd.Series(["apple", "1.0", "2", -3])
-    >>> pd.to_numeric(s, errors="coerce")
-    0    NaN
-    1    1.0
-    2    2.0
-    3   -3.0
-    dtype: float64
-
-    Downcasting of nullable integer and floating dtypes is supported:
-
-    >>> s = pd.Series([1, 2, 3], dtype="Int64")
-    >>> pd.to_numeric(s, downcast="integer")
-    0    1
-    1    2
-    2    3
-    dtype: Int8
-    >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
-    >>> pd.to_numeric(s, downcast="float")
-    0    1.0
-    1    2.1
-    2    3.0
-    dtype: Float32
+    ...
     """
     if downcast not in (None, "integer", "signed", "unsigned", "float"):
         raise ValueError("invalid downcasting method provided")
@@ -208,8 +99,6 @@ def to_numeric(
     else:
         values = arg
 
-    # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
-    # save mask to reconstruct the full array after casting
     mask: npt.NDArray[np.bool_] | None = None
     if isinstance(values, BaseMaskedArray):
         mask = values._mask
@@ -220,6 +109,7 @@ def to_numeric(
         mask = values.isna()
         values = values.dropna().to_numpy()
     new_mask: np.ndarray | None = None
+
     if is_numeric_dtype(values_dtype):
         pass
     elif lib.is_np_dtype(values_dtype, "mM"):
@@ -231,9 +121,9 @@ def to_numeric(
         for idx, x in enumerate(values):
             parsed_value = parse_numeric(x)
             if libmissing.checknull(parsed_values):
-                if errors == 'raise':
-                    raise ValueError(f"Unable to parse string '{x}' at position{idx}")
-                elif errors == 'coerce':
+                if errors == "raise":
+                    raise ValueError(f"Unable to parse string '{x}' at position {idx}")
+                elif errors == "coerce":
                     parsed_values.append(libmissing.NA)
                     new_mask.append(True)
                     continue
@@ -245,8 +135,6 @@ def to_numeric(
         new_mask = np.array(new_mask, dtype=bool)
 
     if new_mask is not None:
-        # Remove unnecessary values, is expected later anyway and enables
-        # downcasting
         values = values[~new_mask]
     elif (
             dtype_backend is not lib.no_default
@@ -256,8 +144,6 @@ def to_numeric(
     ):
         new_mask = np.zeros(values.shape, dtype=np.bool_)
 
-    # attempt downcast only if the data has been successfully converted
-    # to a numerical dtype and if a downcast method has been specified
     if downcast is not None and is_numeric_dtype(values.dtype):
         typecodes: str | None = None
 
@@ -267,30 +153,23 @@ def to_numeric(
             typecodes = np.typecodes["UnsignedInteger"]
         elif downcast == "float":
             typecodes = np.typecodes["Float"]
-
-            # pandas support goes only to np.float32,
-            # as float dtypes smaller than that are
-            # extremely rare and not well supported
             float_32_char = np.dtype(np.float32).char
             float_32_ind = typecodes.index(float_32_char)
             typecodes = typecodes[float_32_ind:]
 
         if typecodes is not None:
-            # from smallest to largest
             for typecode in typecodes:
                 dtype = np.dtype(typecode)
                 if dtype.itemsize <= values.dtype.itemsize:
+                    # Only downcast if values are all integers
+                    if downcast in ("integer", "signed", "unsigned") and not np.isin(np.mod(values, 1), 0).all():
+                        continue  # Skip downcasting if there are any float values
                     values = maybe_downcast_numeric(values, dtype)
-
-                    # successful conversion
                     if values.dtype == dtype:
                         break
 
-    # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
-    # masked array
     if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
         if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
-            # GH 52588
             mask = new_mask
         else:
             mask = mask.copy()
@@ -320,10 +199,7 @@ def to_numeric(
     if is_series:
         return arg._constructor(values, index=arg.index, name=arg.name)
     elif is_index:
-        # because we want to coerce to numeric if possible,
-        # do not use _shallow_copy
         from pandas import Index
-
         return Index(values, name=arg.name)
     elif is_scalars:
         return values[0]
@@ -334,7 +210,7 @@ def to_numeric(
 if __name__ == "__main__":
     import numpy as np
 
-    test_data = ['0x1A', '0b1010', '0o17', '25', '3.14', 'invalid']
-    result = to_numeric(test_data, errors='coerce')
+    test_data = ["0x1A", "0b1010", "0o17", "25", "3.14", "invalid"]
+    result = to_numeric(test_data, errors="coerce")
     print("Inputs:", test_data)
     print("ParseResult:", result)