Skip to content

Commit ec07f93

Browse files
author
Lzforevr
committed
Enhanced numeric.py to parse hexadecimal, octal, and binary string formats (prefixes 0x, 0o, 0b)
1 parent 7c3aa3b commit ec07f93

File tree

1 file changed

+10
-134
lines changed

1 file changed

+10
-134
lines changed

pandas/core/tools/numeric.py

Lines changed: 10 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -62,116 +62,7 @@ def to_numeric(
6262
):
6363
"""
6464
Convert argument to a numeric type.
65-
66-
The default return dtype is `float64` or `int64`
67-
depending on the data supplied. Use the `downcast` parameter
68-
to obtain other dtypes.
69-
70-
Please note that precision loss may occur if really large numbers
71-
are passed in. Due to the internal limitations of `ndarray`, if
72-
numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
73-
or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
74-
passed in, it is very likely they will be converted to float so that
75-
they can be stored in an `ndarray`. These warnings apply similarly to
76-
`Series` since it internally leverages `ndarray`.
77-
78-
Parameters
79-
----------
80-
arg : scalar, list, tuple, 1-d array, or Series
81-
Argument to be converted.
82-
83-
errors : {'raise', 'coerce'}, default 'raise'
84-
- If 'raise', then invalid parsing will raise an exception.
85-
- If 'coerce', then invalid parsing will be set as NaN.
86-
87-
downcast : str, default None
88-
Can be 'integer', 'signed', 'unsigned', or 'float'.
89-
If not None, and if the data has been successfully cast to a
90-
numerical dtype (or if the data was numeric to begin with),
91-
downcast that resulting data to the smallest numerical dtype
92-
possible according to the following rules:
93-
94-
- 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
95-
- 'unsigned': smallest unsigned int dtype (min.: np.uint8)
96-
- 'float': smallest float dtype (min.: np.float32)
97-
98-
As this behaviour is separate from the core conversion to
99-
numeric values, any errors raised during the downcasting
100-
will be surfaced regardless of the value of the 'errors' input.
101-
102-
In addition, downcasting will only occur if the size
103-
of the resulting data's dtype is strictly larger than
104-
the dtype it is to be cast to, so if none of the dtypes
105-
checked satisfy that specification, no downcasting will be
106-
performed on the data.
107-
108-
dtype_backend : {'numpy_nullable', 'pyarrow'}
109-
Back-end data type applied to the resultant :class:`DataFrame`
110-
(still experimental). If not specified, the default behavior
111-
is to not use nullable data types. If specified, the behavior
112-
is as follows:
113-
114-
* ``"numpy_nullable"``: returns nullable-dtype-backed object
115-
* ``"pyarrow"``: returns with pyarrow-backed nullable object
116-
117-
.. versionadded:: 2.0
118-
119-
Returns
120-
-------
121-
ret
122-
Numeric if parsing succeeded.
123-
Return type depends on input. Series if Series, otherwise ndarray.
124-
125-
See Also
126-
--------
127-
DataFrame.astype : Cast argument to a specified dtype.
128-
to_datetime : Convert argument to datetime.
129-
to_timedelta : Convert argument to timedelta.
130-
numpy.ndarray.astype : Cast a numpy array to a specified type.
131-
DataFrame.convert_dtypes : Convert dtypes.
132-
133-
Examples
134-
--------
135-
Take separate series and convert to numeric, coercing when told to
136-
137-
>>> s = pd.Series(["1.0", "2", -3])
138-
>>> pd.to_numeric(s)
139-
0 1.0
140-
1 2.0
141-
2 -3.0
142-
dtype: float64
143-
>>> pd.to_numeric(s, downcast="float")
144-
0 1.0
145-
1 2.0
146-
2 -3.0
147-
dtype: float32
148-
>>> pd.to_numeric(s, downcast="signed")
149-
0 1
150-
1 2
151-
2 -3
152-
dtype: int8
153-
>>> s = pd.Series(["apple", "1.0", "2", -3])
154-
>>> pd.to_numeric(s, errors="coerce")
155-
0 NaN
156-
1 1.0
157-
2 2.0
158-
3 -3.0
159-
dtype: float64
160-
161-
Downcasting of nullable integer and floating dtypes is supported:
162-
163-
>>> s = pd.Series([1, 2, 3], dtype="Int64")
164-
>>> pd.to_numeric(s, downcast="integer")
165-
0 1
166-
1 2
167-
2 3
168-
dtype: Int8
169-
>>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
170-
>>> pd.to_numeric(s, downcast="float")
171-
0 1.0
172-
1 2.1
173-
2 3.0
174-
dtype: Float32
65+
...
17566
"""
17667
if downcast not in (None, "integer", "signed", "unsigned", "float"):
17768
raise ValueError("invalid downcasting method provided")
@@ -208,8 +99,6 @@ def to_numeric(
20899
else:
209100
values = arg
210101

211-
# GH33013: for IntegerArray & FloatingArray extract non-null values for casting
212-
# save mask to reconstruct the full array after casting
213102
mask: npt.NDArray[np.bool_] | None = None
214103
if isinstance(values, BaseMaskedArray):
215104
mask = values._mask
@@ -220,6 +109,7 @@ def to_numeric(
220109
mask = values.isna()
221110
values = values.dropna().to_numpy()
222111
new_mask: np.ndarray | None = None
112+
223113
if is_numeric_dtype(values_dtype):
224114
pass
225115
elif lib.is_np_dtype(values_dtype, "mM"):
@@ -231,9 +121,9 @@ def to_numeric(
231121
for idx, x in enumerate(values):
232122
parsed_value = parse_numeric(x)
233123
if libmissing.checknull(parsed_values):
234-
if errors == 'raise':
235-
raise ValueError(f"Unable to parse string '{x}' at position{idx}")
236-
elif errors == 'coerce':
124+
if errors == "raise":
125+
raise ValueError(f"Unable to parse string '{x}' at position {idx}")
126+
elif errors == "coerce":
237127
parsed_values.append(libmissing.NA)
238128
new_mask.append(True)
239129
continue
@@ -245,8 +135,6 @@ def to_numeric(
245135
new_mask = np.array(new_mask, dtype=bool)
246136

247137
if new_mask is not None:
248-
# Remove unnecessary values, is expected later anyway and enables
249-
# downcasting
250138
values = values[~new_mask]
251139
elif (
252140
dtype_backend is not lib.no_default
@@ -256,8 +144,6 @@ def to_numeric(
256144
):
257145
new_mask = np.zeros(values.shape, dtype=np.bool_)
258146

259-
# attempt downcast only if the data has been successfully converted
260-
# to a numerical dtype and if a downcast method has been specified
261147
if downcast is not None and is_numeric_dtype(values.dtype):
262148
typecodes: str | None = None
263149

@@ -267,30 +153,23 @@ def to_numeric(
267153
typecodes = np.typecodes["UnsignedInteger"]
268154
elif downcast == "float":
269155
typecodes = np.typecodes["Float"]
270-
271-
# pandas support goes only to np.float32,
272-
# as float dtypes smaller than that are
273-
# extremely rare and not well supported
274156
float_32_char = np.dtype(np.float32).char
275157
float_32_ind = typecodes.index(float_32_char)
276158
typecodes = typecodes[float_32_ind:]
277159

278160
if typecodes is not None:
279-
# from smallest to largest
280161
for typecode in typecodes:
281162
dtype = np.dtype(typecode)
282163
if dtype.itemsize <= values.dtype.itemsize:
164+
# Only downcast if values are all integers
165+
if downcast in ("integer", "signed", "unsigned") and not np.isin(np.mod(values, 1), 0).all():
166+
continue # Skip downcasting if there are any float values
283167
values = maybe_downcast_numeric(values, dtype)
284-
285-
# successful conversion
286168
if values.dtype == dtype:
287169
break
288170

289-
# GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
290-
# masked array
291171
if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
292172
if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
293-
# GH 52588
294173
mask = new_mask
295174
else:
296175
mask = mask.copy()
@@ -320,10 +199,7 @@ def to_numeric(
320199
if is_series:
321200
return arg._constructor(values, index=arg.index, name=arg.name)
322201
elif is_index:
323-
# because we want to coerce to numeric if possible,
324-
# do not use _shallow_copy
325202
from pandas import Index
326-
327203
return Index(values, name=arg.name)
328204
elif is_scalars:
329205
return values[0]
@@ -334,7 +210,7 @@ def to_numeric(
334210
if __name__ == "__main__":
335211
import numpy as np
336212

337-
test_data = ['0x1A', '0b1010', '0o17', '25', '3.14', 'invalid']
338-
result = to_numeric(test_data, errors='coerce')
213+
test_data = ["0x1A", "0b1010", "0o17", "25", "3.14", "invalid"]
214+
result = to_numeric(test_data, errors="coerce")
339215
print("Inputs:", test_data)
340216
print("ParseResult:", result)

0 commit comments

Comments
 (0)