@@ -62,7 +62,116 @@ def to_numeric(
62
62
):
63
63
"""
64
64
Convert argument to a numeric type.
65
- ...
65
+
66
+ The default return dtype is `float64` or `int64`
67
+ depending on the data supplied. Use the `downcast` parameter
68
+ to obtain other dtypes.
69
+
70
+ Please note that precision loss may occur if really large numbers
71
+ are passed in. Due to the internal limitations of `ndarray`, if
72
+ numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
73
+ or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
74
+ passed in, it is very likely they will be converted to float so that
75
+ they can be stored in an `ndarray`. These warnings apply similarly to
76
+ `Series` since it internally leverages `ndarray`.
77
+
78
+ Parameters
79
+ ----------
80
+ arg : scalar, list, tuple, 1-d array, or Series
81
+ Argument to be converted.
82
+
83
+ errors : {'raise', 'coerce'}, default 'raise'
84
+ - If 'raise', then invalid parsing will raise an exception.
85
+ - If 'coerce', then invalid parsing will be set as NaN.
86
+
87
+ downcast : str, default None
88
+ Can be 'integer', 'signed', 'unsigned', or 'float'.
89
+ If not None, and if the data has been successfully cast to a
90
+ numerical dtype (or if the data was numeric to begin with),
91
+ downcast that resulting data to the smallest numerical dtype
92
+ possible according to the following rules:
93
+
94
+ - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
95
+ - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
96
+ - 'float': smallest float dtype (min.: np.float32)
97
+
98
+ As this behaviour is separate from the core conversion to
99
+ numeric values, any errors raised during the downcasting
100
+ will be surfaced regardless of the value of the 'errors' input.
101
+
102
+ In addition, downcasting will only occur if the size
103
+ of the resulting data's dtype is strictly larger than
104
+ the dtype it is to be cast to, so if none of the dtypes
105
+ checked satisfy that specification, no downcasting will be
106
+ performed on the data.
107
+
108
+ dtype_backend : {'numpy_nullable', 'pyarrow'}
109
+ Back-end data type applied to the resultant :class:`DataFrame`
110
+ (still experimental). If not specified, the default behavior
111
+ is to not use nullable data types. If specified, the behavior
112
+ is as follows:
113
+
114
+ * ``"numpy_nullable"``: returns nullable-dtype-backed object
115
+ * ``"pyarrow"``: returns with pyarrow-backed nullable object
116
+
117
+ .. versionadded:: 2.0
118
+
119
+ Returns
120
+ -------
121
+ ret
122
+ Numeric if parsing succeeded.
123
+ Return type depends on input. Series if Series, otherwise ndarray.
124
+
125
+ See Also
126
+ --------
127
+ DataFrame.astype : Cast argument to a specified dtype.
128
+ to_datetime : Convert argument to datetime.
129
+ to_timedelta : Convert argument to timedelta.
130
+ numpy.ndarray.astype : Cast a numpy array to a specified type.
131
+ DataFrame.convert_dtypes : Convert dtypes.
132
+
133
+ Examples
134
+ --------
135
+ Take separate series and convert to numeric, coercing when told to
136
+
137
+ >>> s = pd.Series(["1.0", "2", -3])
138
+ >>> pd.to_numeric(s)
139
+ 0 1.0
140
+ 1 2.0
141
+ 2 -3.0
142
+ dtype: float64
143
+ >>> pd.to_numeric(s, downcast="float")
144
+ 0 1.0
145
+ 1 2.0
146
+ 2 -3.0
147
+ dtype: float32
148
+ >>> pd.to_numeric(s, downcast="signed")
149
+ 0 1
150
+ 1 2
151
+ 2 -3
152
+ dtype: int8
153
+ >>> s = pd.Series(["apple", "1.0", "2", -3])
154
+ >>> pd.to_numeric(s, errors="coerce")
155
+ 0 NaN
156
+ 1 1.0
157
+ 2 2.0
158
+ 3 -3.0
159
+ dtype: float64
160
+
161
+ Downcasting of nullable integer and floating dtypes is supported:
162
+
163
+ >>> s = pd.Series([1, 2, 3], dtype="Int64")
164
+ >>> pd.to_numeric(s, downcast="integer")
165
+ 0 1
166
+ 1 2
167
+ 2 3
168
+ dtype: Int8
169
+ >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
170
+ >>> pd.to_numeric(s, downcast="float")
171
+ 0 1.0
172
+ 1 2.1
173
+ 2 3.0
174
+ dtype: Float32
66
175
"""
67
176
if downcast not in (None , "integer" , "signed" , "unsigned" , "float" ):
68
177
raise ValueError ("invalid downcasting method provided" )
@@ -99,6 +208,8 @@ def to_numeric(
99
208
else :
100
209
values = arg
101
210
211
+ # GH33013: for IntegerArray & FloatingArray, extract non-null values for casting
212
+ # and save the mask to reconstruct the full array after casting
102
213
mask : npt .NDArray [np .bool_ ] | None = None
103
214
if isinstance (values , BaseMaskedArray ):
104
215
mask = values ._mask
@@ -109,7 +220,6 @@ def to_numeric(
109
220
mask = values .isna ()
110
221
values = values .dropna ().to_numpy ()
111
222
new_mask : np .ndarray | None = None
112
-
113
223
if is_numeric_dtype (values_dtype ):
114
224
pass
115
225
elif lib .is_np_dtype (values_dtype , "mM" ):
@@ -122,7 +232,7 @@ def to_numeric(
122
232
parsed_value = parse_numeric (x )
123
233
if libmissing .checknull (parsed_values ):
124
234
if errors == "raise" :
125
- raise ValueError (f"Unable to parse string '{ x } ' at position { idx } " )
235
+ raise ValueError (f"Unable to parse string '{ x } ' at position{ idx } " )
126
236
elif errors == "coerce" :
127
237
parsed_values .append (libmissing .NA )
128
238
new_mask .append (True )
@@ -135,6 +245,8 @@ def to_numeric(
135
245
new_mask = np .array (new_mask , dtype = bool )
136
246
137
247
if new_mask is not None :
248
+ # Remove the unnecessary values; this is expected later anyway and enables
249
+ # downcasting
138
250
values = values [~ new_mask ]
139
251
elif (
140
252
dtype_backend is not lib .no_default
@@ -144,6 +256,8 @@ def to_numeric(
144
256
):
145
257
new_mask = np .zeros (values .shape , dtype = np .bool_ )
146
258
259
+ # attempt downcast only if the data has been successfully converted
260
+ # to a numerical dtype and if a downcast method has been specified
147
261
if downcast is not None and is_numeric_dtype (values .dtype ):
148
262
typecodes : str | None = None
149
263
@@ -153,23 +267,30 @@ def to_numeric(
153
267
typecodes = np .typecodes ["UnsignedInteger" ]
154
268
elif downcast == "float" :
155
269
typecodes = np .typecodes ["Float" ]
270
+
271
+ # pandas support goes only to np.float32,
272
+ # as float dtypes smaller than that are
273
+ # extremely rare and not well supported
156
274
float_32_char = np .dtype (np .float32 ).char
157
275
float_32_ind = typecodes .index (float_32_char )
158
276
typecodes = typecodes [float_32_ind :]
159
277
160
278
if typecodes is not None :
279
+ # from smallest to largest
161
280
for typecode in typecodes :
162
281
dtype = np .dtype (typecode )
163
282
if dtype .itemsize <= values .dtype .itemsize :
164
- # Only downcast if values are all integers
165
- if downcast in ("integer" , "signed" , "unsigned" ) and not np .isin (np .mod (values , 1 ), 0 ).all ():
166
- continue # Skip downcasting if there are any float values
167
283
values = maybe_downcast_numeric (values , dtype )
284
+
285
+ # successful conversion
168
286
if values .dtype == dtype :
169
287
break
170
288
289
+ # GH33013: for IntegerArray, BooleanArray & FloatingArray we need to reconstruct
290
+ # the masked array
171
291
if (mask is not None or new_mask is not None ) and not is_string_dtype (values .dtype ):
172
292
if mask is None or (new_mask is not None and new_mask .shape == mask .shape ):
293
+ # GH 52588
173
294
mask = new_mask
174
295
else :
175
296
mask = mask .copy ()
@@ -199,18 +320,12 @@ def to_numeric(
199
320
if is_series :
200
321
return arg ._constructor (values , index = arg .index , name = arg .name )
201
322
elif is_index :
323
+ # because we want to coerce to numeric if possible,
324
+ # do not use _shallow_copy
202
325
from pandas import Index
326
+
203
327
return Index (values , name = arg .name )
204
328
elif is_scalars :
205
329
return values [0 ]
206
330
else :
207
331
return values
208
-
209
-
210
- if __name__ == "__main__" :
211
- import numpy as np
212
-
213
- test_data = ["0x1A" , "0b1010" , "0o17" , "25" , "3.14" , "invalid" ]
214
- result = to_numeric (test_data , errors = "coerce" )
215
- print ("Inputs:" , test_data )
216
- print ("ParseResult:" , result )
0 commit comments