Skip to content

Commit e618573

Browse files
author
Lzforevr
committed
Enhanced numeric.py to process hexadecimal, octal, and binary formats (prefixes 0x, 0o, 0b)
1 parent ec07f93 commit e618573

File tree

1 file changed

+130
-15
lines changed

1 file changed

+130
-15
lines changed

pandas/core/tools/numeric.py

Lines changed: 130 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,116 @@ def to_numeric(
6262
):
6363
"""
6464
Convert argument to a numeric type.
65-
...
65+
66+
The default return dtype is `float64` or `int64`
67+
depending on the data supplied. Use the `downcast` parameter
68+
to obtain other dtypes.
69+
70+
Please note that precision loss may occur if really large numbers
71+
are passed in. Due to the internal limitations of `ndarray`, if
72+
numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
73+
or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
74+
passed in, it is very likely they will be converted to float so that
75+
they can be stored in an `ndarray`. These warnings apply similarly to
76+
`Series` since it internally leverages `ndarray`.
77+
78+
Parameters
79+
----------
80+
arg : scalar, list, tuple, 1-d array, or Series
81+
Argument to be converted.
82+
83+
errors : {'raise', 'coerce'}, default 'raise'
84+
- If 'raise', then invalid parsing will raise an exception.
85+
- If 'coerce', then invalid parsing will be set as NaN.
86+
87+
downcast : str, default None
88+
Can be 'integer', 'signed', 'unsigned', or 'float'.
89+
If not None, and if the data has been successfully cast to a
90+
numerical dtype (or if the data was numeric to begin with),
91+
downcast that resulting data to the smallest numerical dtype
92+
possible according to the following rules:
93+
94+
- 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
95+
- 'unsigned': smallest unsigned int dtype (min.: np.uint8)
96+
- 'float': smallest float dtype (min.: np.float32)
97+
98+
As this behaviour is separate from the core conversion to
99+
numeric values, any errors raised during the downcasting
100+
will be surfaced regardless of the value of the 'errors' input.
101+
102+
In addition, downcasting will only occur if the size
103+
of the resulting data's dtype is strictly larger than
104+
the dtype it is to be cast to, so if none of the dtypes
105+
checked satisfy that specification, no downcasting will be
106+
performed on the data.
107+
108+
dtype_backend : {'numpy_nullable', 'pyarrow'}
109+
Back-end data type applied to the resultant :class:`DataFrame`
110+
(still experimental). If not specified, the default behavior
111+
is to not use nullable data types. If specified, the behavior
112+
is as follows:
113+
114+
* ``"numpy_nullable"``: returns nullable-dtype-backed object
115+
* ``"pyarrow"``: returns with pyarrow-backed nullable object
116+
117+
.. versionadded:: 2.0
118+
119+
Returns
120+
-------
121+
ret
122+
Numeric if parsing succeeded.
123+
Return type depends on input. Series if Series, otherwise ndarray.
124+
125+
See Also
126+
--------
127+
DataFrame.astype : Cast argument to a specified dtype.
128+
to_datetime : Convert argument to datetime.
129+
to_timedelta : Convert argument to timedelta.
130+
numpy.ndarray.astype : Cast a numpy array to a specified type.
131+
DataFrame.convert_dtypes : Convert dtypes.
132+
133+
Examples
134+
--------
135+
Take separate series and convert to numeric, coercing when told to
136+
137+
>>> s = pd.Series(["1.0", "2", -3])
138+
>>> pd.to_numeric(s)
139+
0 1.0
140+
1 2.0
141+
2 -3.0
142+
dtype: float64
143+
>>> pd.to_numeric(s, downcast="float")
144+
0 1.0
145+
1 2.0
146+
2 -3.0
147+
dtype: float32
148+
>>> pd.to_numeric(s, downcast="signed")
149+
0 1
150+
1 2
151+
2 -3
152+
dtype: int8
153+
>>> s = pd.Series(["apple", "1.0", "2", -3])
154+
>>> pd.to_numeric(s, errors="coerce")
155+
0 NaN
156+
1 1.0
157+
2 2.0
158+
3 -3.0
159+
dtype: float64
160+
161+
Downcasting of nullable integer and floating dtypes is supported:
162+
163+
>>> s = pd.Series([1, 2, 3], dtype="Int64")
164+
>>> pd.to_numeric(s, downcast="integer")
165+
0 1
166+
1 2
167+
2 3
168+
dtype: Int8
169+
>>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
170+
>>> pd.to_numeric(s, downcast="float")
171+
0 1.0
172+
1 2.1
173+
2 3.0
174+
dtype: Float32
66175
"""
67176
if downcast not in (None, "integer", "signed", "unsigned", "float"):
68177
raise ValueError("invalid downcasting method provided")
@@ -99,6 +208,8 @@ def to_numeric(
99208
else:
100209
values = arg
101210

211+
# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
212+
# save mask to reconstruct the full array after casting
102213
mask: npt.NDArray[np.bool_] | None = None
103214
if isinstance(values, BaseMaskedArray):
104215
mask = values._mask
@@ -109,7 +220,6 @@ def to_numeric(
109220
mask = values.isna()
110221
values = values.dropna().to_numpy()
111222
new_mask: np.ndarray | None = None
112-
113223
if is_numeric_dtype(values_dtype):
114224
pass
115225
elif lib.is_np_dtype(values_dtype, "mM"):
@@ -122,7 +232,7 @@ def to_numeric(
122232
parsed_value = parse_numeric(x)
123233
if libmissing.checknull(parsed_values):
124234
if errors == "raise":
125-
raise ValueError(f"Unable to parse string '{x}' at position {idx}")
235+
raise ValueError(f"Unable to parse string '{x}' at position{idx}")
126236
elif errors == "coerce":
127237
parsed_values.append(libmissing.NA)
128238
new_mask.append(True)
@@ -135,6 +245,8 @@ def to_numeric(
135245
new_mask = np.array(new_mask, dtype=bool)
136246

137247
if new_mask is not None:
248+
# Remove unnecessary values, is expected later anyway and enables
249+
# downcasting
138250
values = values[~new_mask]
139251
elif (
140252
dtype_backend is not lib.no_default
@@ -144,6 +256,8 @@ def to_numeric(
144256
):
145257
new_mask = np.zeros(values.shape, dtype=np.bool_)
146258

259+
# attempt downcast only if the data has been successfully converted
260+
# to a numerical dtype and if a downcast method has been specified
147261
if downcast is not None and is_numeric_dtype(values.dtype):
148262
typecodes: str | None = None
149263

@@ -153,23 +267,30 @@ def to_numeric(
153267
typecodes = np.typecodes["UnsignedInteger"]
154268
elif downcast == "float":
155269
typecodes = np.typecodes["Float"]
270+
271+
# pandas support goes only to np.float32,
272+
# as float dtypes smaller than that are
273+
# extremely rare and not well supported
156274
float_32_char = np.dtype(np.float32).char
157275
float_32_ind = typecodes.index(float_32_char)
158276
typecodes = typecodes[float_32_ind:]
159277

160278
if typecodes is not None:
279+
# from smallest to largest
161280
for typecode in typecodes:
162281
dtype = np.dtype(typecode)
163282
if dtype.itemsize <= values.dtype.itemsize:
164-
# Only downcast if values are all integers
165-
if downcast in ("integer", "signed", "unsigned") and not np.isin(np.mod(values, 1), 0).all():
166-
continue # Skip downcasting if there are any float values
167283
values = maybe_downcast_numeric(values, dtype)
284+
285+
# successful conversion
168286
if values.dtype == dtype:
169287
break
170288

289+
# GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
290+
# masked array
171291
if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
172292
if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
293+
# GH 52588
173294
mask = new_mask
174295
else:
175296
mask = mask.copy()
@@ -199,18 +320,12 @@ def to_numeric(
199320
if is_series:
200321
return arg._constructor(values, index=arg.index, name=arg.name)
201322
elif is_index:
323+
# because we want to coerce to numeric if possible,
324+
# do not use _shallow_copy
202325
from pandas import Index
326+
203327
return Index(values, name=arg.name)
204328
elif is_scalars:
205329
return values[0]
206330
else:
207331
return values
208-
209-
210-
if __name__ == "__main__":
211-
import numpy as np
212-
213-
test_data = ["0x1A", "0b1010", "0o17", "25", "3.14", "invalid"]
214-
result = to_numeric(test_data, errors="coerce")
215-
print("Inputs:", test_data)
216-
print("ParseResult:", result)

0 commit comments

Comments
 (0)