Skip to content

Commit 724c688

Browse files
committed
ENH: improved inference of timedelta64[ns] and datetime64[ns] in the presence of NaT/None/NaN
from object dtypes (GH5689) CLN: refactor of timedelta routines to Cython for inclusion in maybe_convert_objects
1 parent e86e99c commit 724c688

File tree

11 files changed

+146
-53
lines changed

11 files changed

+146
-53
lines changed

pandas/core/common.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1514,7 +1514,8 @@ def _values_from_object(o):
15141514

15151515

15161516
def _possibly_convert_objects(values, convert_dates=True,
1517-
convert_numeric=True):
1517+
convert_numeric=True,
1518+
convert_timedeltas=True):
15181519
""" if we have an object dtype, try to coerce dates and/or numbers """
15191520

15201521
# if we have passed in a list or scalar
@@ -1539,6 +1540,22 @@ def _possibly_convert_objects(values, convert_dates=True,
15391540
values = lib.maybe_convert_objects(
15401541
values, convert_datetime=convert_dates)
15411542

1543+
# convert timedeltas
1544+
if convert_timedeltas and values.dtype == np.object_:
1545+
1546+
if convert_timedeltas == 'coerce':
1547+
from pandas.tseries.timedeltas import \
1548+
_possibly_cast_to_timedelta
1549+
values = _possibly_cast_to_timedelta(values)
1550+
1551+
# if we are all nans then leave me alone
1552+
if not isnull(new_values).all():
1553+
values = new_values
1554+
1555+
else:
1556+
values = lib.maybe_convert_objects(
1557+
values, convert_timedelta=convert_timedeltas)
1558+
15421559
# convert to numeric
15431560
if values.dtype == np.object_:
15441561
if convert_numeric:

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3626,7 +3626,7 @@ def append(self, other, ignore_index=False, verify_integrity=False):
36263626
index = None if other.name is None else [other.name]
36273627
other = other.reindex(self.columns, copy=False)
36283628
other = DataFrame(other.values.reshape((1, len(other))),
3629-
index=index, columns=self.columns)
3629+
index=index, columns=self.columns).convert_objects()
36303630
elif isinstance(other, list) and not isinstance(other[0], DataFrame):
36313631
other = DataFrame(other)
36323632
if (self.columns.get_indexer(other.columns) >= 0).all():

pandas/core/generic.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1844,16 +1844,18 @@ def copy(self, deep=True):
18441844
return self._constructor(data).__finalize__(self)
18451845

18461846
def convert_objects(self, convert_dates=True, convert_numeric=False,
1847-
copy=True):
1847+
convert_timedeltas=True, copy=True):
18481848
"""
18491849
Attempt to infer better dtype for object columns
18501850
18511851
Parameters
18521852
----------
1853-
convert_dates : if True, attempt to soft convert_dates, if 'coerce',
1853+
convert_dates : if True, attempt to soft convert dates, if 'coerce',
18541854
force conversion (and non-convertibles get NaT)
18551855
convert_numeric : if True attempt to coerce to numbers (including
18561856
strings), non-convertibles get NaN
1857+
convert_timedeltas : if True, attempt to soft convert timedeltas, if 'coerce',
1858+
force conversion (and non-convertibles get NaT)
18571859
copy : Boolean, if True, return copy, default is True
18581860
18591861
Returns
@@ -1863,6 +1865,7 @@ def convert_objects(self, convert_dates=True, convert_numeric=False,
18631865
return self._constructor(
18641866
self._data.convert(convert_dates=convert_dates,
18651867
convert_numeric=convert_numeric,
1868+
convert_timedeltas=convert_timedeltas,
18661869
copy=copy)).__finalize__(self)
18671870

18681871
#----------------------------------------------------------------------

pandas/core/internals.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,8 +1315,8 @@ def is_bool(self):
13151315
"""
13161316
return lib.is_bool_array(self.values.ravel())
13171317

1318-
def convert(self, convert_dates=True, convert_numeric=True, copy=True,
1319-
by_item=True):
1318+
def convert(self, convert_dates=True, convert_numeric=True, convert_timedeltas=True,
1319+
copy=True, by_item=True):
13201320
""" attempt to coerce any object types to better types
13211321
return a copy of the block (if copy = True)
13221322
by definition we ARE an ObjectBlock!!!!!
@@ -1334,7 +1334,8 @@ def convert(self, convert_dates=True, convert_numeric=True, copy=True,
13341334

13351335
values = com._possibly_convert_objects(
13361336
values.ravel(), convert_dates=convert_dates,
1337-
convert_numeric=convert_numeric
1337+
convert_numeric=convert_numeric,
1338+
convert_timedeltas=convert_timedeltas,
13381339
).reshape(values.shape)
13391340
values = _block_shape(values, ndim=self.ndim)
13401341
items = self.items.take([i])

pandas/lib.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ from datetime import datetime as pydatetime
3131
# this is our tseries.pxd
3232
from datetime cimport *
3333

34-
from tslib cimport convert_to_tsobject
34+
from tslib cimport convert_to_tsobject, convert_to_timedelta64
3535
import tslib
3636
from tslib import NaT, Timestamp, repr_timedelta64
3737

pandas/src/inference.pyx

Lines changed: 38 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
cimport util
2+
from tslib import NaT
23

34
_TYPE_MAP = {
45
np.int8: 'integer',
@@ -55,14 +56,18 @@ def infer_dtype(object _values):
5556

5657
val = util.get_value_1d(values, 0)
5758

58-
if util.is_datetime64_object(val):
59+
if util.is_datetime64_object(val) or val is NaT:
5960
if is_datetime64_array(values):
6061
return 'datetime64'
62+
elif is_timedelta_or_timedelta64_array(values):
63+
return 'timedelta'
6164
elif util.is_integer_object(val):
6265
if is_integer_array(values):
6366
return 'integer'
6467
elif is_integer_float_array(values):
6568
return 'mixed-integer-float'
69+
elif is_timedelta_or_timedelta64_array(values):
70+
return 'timedelta'
6671
return 'mixed-integer'
6772
elif is_datetime(val):
6873
if is_datetime_array(values):
@@ -258,20 +263,24 @@ def is_unicode_array(ndarray values):
258263

259264
def is_datetime_array(ndarray[object] values):
260265
cdef int i, n = len(values)
266+
cdef object v
261267
if n == 0:
262268
return False
263269
for i in range(n):
264-
if not is_datetime(values[i]):
270+
v = values[i]
271+
if not (is_datetime(v) or util._checknull(v) or v is NaT):
265272
return False
266273
return True
267274

268275

269276
def is_datetime64_array(ndarray values):
270277
cdef int i, n = len(values)
278+
cdef object v
271279
if n == 0:
272280
return False
273281
for i in range(n):
274-
if not util.is_datetime64_object(values[i]):
282+
v = values[i]
283+
if not (util.is_datetime64_object(v) or util._checknull(v) or v is NaT):
275284
return False
276285
return True
277286

@@ -299,12 +308,15 @@ def is_timedelta64_array(ndarray values):
299308
return True
300309

301310
def is_timedelta_or_timedelta64_array(ndarray values):
311+
""" infer with timedeltas and/or nat/none """
302312
import datetime
303313
cdef int i, n = len(values)
314+
cdef object v
304315
if n == 0:
305316
return False
306317
for i in range(n):
307-
if not (isinstance(values[i],datetime.timedelta) or isinstance(values[i],np.timedelta64)):
318+
v = values[i]
319+
if not (isinstance(v,datetime.timedelta) or isinstance(v,np.timedelta64) or util._checknull(v) or v is NaT):
308320
return False
309321
return True
310322

@@ -427,7 +439,7 @@ def maybe_convert_numeric(ndarray[object] values, set na_values,
427439
return ints
428440

429441
def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
430-
bint safe=0, bint convert_datetime=0):
442+
bint safe=0, bint convert_datetime=0, bint convert_timedelta=0):
431443
'''
432444
Type inference function-- convert object array to proper dtype
433445
'''
@@ -438,9 +450,11 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
438450
ndarray[int64_t] ints
439451
ndarray[uint8_t] bools
440452
ndarray[int64_t] idatetimes
453+
ndarray[int64_t] itimedeltas
441454
bint seen_float = 0
442455
bint seen_complex = 0
443456
bint seen_datetime = 0
457+
bint seen_timedelta = 0
444458
bint seen_int = 0
445459
bint seen_bool = 0
446460
bint seen_object = 0
@@ -457,6 +471,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
457471
bools = np.empty(n, dtype=np.uint8)
458472
datetimes = np.empty(n, dtype='M8[ns]')
459473
idatetimes = datetimes.view(np.int64)
474+
timedeltas = np.empty(n, dtype='m8[ns]')
475+
itimedeltas = timedeltas.view(np.int64)
460476

461477
onan = np.nan
462478
fnan = np.nan
@@ -481,9 +497,13 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
481497
seen_object = 1
482498
# objects[i] = val.astype('O')
483499
break
484-
elif util.is_timedelta64_object(val):
485-
seen_object = 1
486-
break
500+
elif is_timedelta(val):
501+
if convert_timedelta:
502+
itimedeltas[i] = convert_to_timedelta64(val, 'ns')
503+
seen_timedelta = 1
504+
else:
505+
seen_object = 1
506+
break
487507
elif util.is_integer_object(val):
488508
seen_int = 1
489509
floats[i] = <float64_t> val
@@ -523,7 +543,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
523543

524544
if not safe:
525545
if seen_null:
526-
if not seen_bool and not seen_datetime:
546+
if not seen_bool and not seen_datetime and not seen_timedelta:
527547
if seen_complex:
528548
return complexes
529549
elif seen_float or seen_int:
@@ -533,20 +553,23 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
533553
if seen_datetime:
534554
if not seen_numeric:
535555
return datetimes
556+
elif seen_timedelta:
557+
if not seen_numeric:
558+
return timedeltas
536559
else:
537560
if seen_complex:
538561
return complexes
539562
elif seen_float:
540563
return floats
541564
elif seen_int:
542565
return ints
543-
elif not seen_datetime and not seen_numeric:
566+
elif not seen_datetime and not seen_numeric and not seen_timedelta:
544567
return bools.view(np.bool_)
545568

546569
else:
547570
# don't cast int to float, etc.
548571
if seen_null:
549-
if not seen_bool and not seen_datetime:
572+
if not seen_bool and not seen_datetime and not seen_timedelta:
550573
if seen_complex:
551574
if not seen_int:
552575
return complexes
@@ -558,6 +581,9 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
558581
if seen_datetime:
559582
if not seen_numeric:
560583
return datetimes
584+
elif seen_timedelta:
585+
if not seen_numeric:
586+
return timedeltas
561587
else:
562588
if seen_complex:
563589
if not seen_int:
@@ -567,7 +593,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
567593
return floats
568594
elif seen_int:
569595
return ints
570-
elif not seen_datetime and not seen_numeric:
596+
elif not seen_datetime and not seen_numeric and not seen_timedelta:
571597
return bools.view(np.bool_)
572598

573599
return objects

pandas/tests/test_series.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2263,6 +2263,16 @@ def test_constructor_dtype_timedelta64(self):
22632263
for i in range(3)] + [np.nan ], dtype='m8[ns]' )
22642264
self.assert_(td.dtype == 'timedelta64[ns]')
22652265

2266+
td = Series([np.timedelta64(300000000), pd.NaT],dtype='m8[ns]')
2267+
self.assert_(td.dtype == 'timedelta64[ns]')
2268+
2269+
# improved inference
2270+
td = Series([np.timedelta64(300000000), pd.NaT])
2271+
self.assert_(td.dtype == 'timedelta64[ns]')
2272+
2273+
td = Series([pd.NaT, np.timedelta64(300000000)])
2274+
self.assert_(td.dtype == 'timedelta64[ns]')
2275+
22662276
# these are frequency conversion astypes
22672277
#for t in ['s', 'D', 'us', 'ms']:
22682278
# self.assertRaises(TypeError, td.astype, 'm8[%s]' % t)

pandas/tseries/tests/test_timedeltas.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -165,11 +165,13 @@ def conv(v):
165165
# single element conversion
166166
v = timedelta(seconds=1)
167167
result = to_timedelta(v,box=False)
168-
expected = to_timedelta([v])
168+
expected = np.timedelta64(timedelta(seconds=1))
169+
self.assert_(result == expected)
169170

170171
v = np.timedelta64(timedelta(seconds=1))
171172
result = to_timedelta(v,box=False)
172-
expected = to_timedelta([v])
173+
expected = np.timedelta64(timedelta(seconds=1))
174+
self.assert_(result == expected)
173175

174176
def test_timedelta_ops(self):
175177
_skip_if_numpy_not_friendly()

pandas/tseries/timedeltas.py

Lines changed: 5 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -70,42 +70,16 @@ def _convert_listlike(arg, box):
7070
_whitespace = re.compile('^\s*$')
7171

7272
def _coerce_scalar_to_timedelta_type(r, unit='ns'):
73-
# kludgy here until we have a timedelta scalar
74-
# handle the numpy < 1.7 case
75-
76-
def conv(v):
77-
if _np_version_under1p7:
78-
return timedelta(microseconds=v/1000.0)
79-
return np.timedelta64(v)
73+
""" convert strings to timedelta; coerce to np.timedelta64"""
8074

8175
if isinstance(r, compat.string_types):
76+
77+
# we are already converting to nanoseconds
8278
converter = _get_string_converter(r, unit=unit)
8379
r = converter()
84-
r = conv(r)
85-
elif r == tslib.iNaT:
86-
return r
87-
elif isnull(r):
88-
return np.timedelta64('NaT')
89-
elif isinstance(r, np.timedelta64):
90-
r = r.astype("m8[{0}]".format(unit.lower()))
91-
elif is_integer(r):
92-
r = tslib.cast_from_unit(r, unit)
93-
r = conv(r)
80+
unit='ns'
9481

95-
if _np_version_under1p7:
96-
if not isinstance(r, timedelta):
97-
raise AssertionError("Invalid type for timedelta scalar: %s" % type(r))
98-
if compat.PY3:
99-
# convert to microseconds in timedelta64
100-
r = np.timedelta64(int(r.total_seconds()*1e9 + r.microseconds*1000))
101-
else:
102-
return r
103-
104-
if isinstance(r, timedelta):
105-
r = np.timedelta64(r)
106-
elif not isinstance(r, np.timedelta64):
107-
raise AssertionError("Invalid type for timedelta scalar: %s" % type(r))
108-
return r.astype('timedelta64[ns]')
82+
return tslib.convert_to_timedelta(r,unit)
10983

11084
def _get_string_converter(r, unit='ns'):
11185
""" return a string converter for r to process the timedelta format """

pandas/tslib.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from numpy cimport ndarray, int64_t
22

33
cdef convert_to_tsobject(object, object, object)
4+
cdef convert_to_timedelta64(object, object)

0 commit comments

Comments
 (0)