diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 3c5854602df53..63a64ffcf893c 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -355,7 +355,7 @@ cpdef array_to_datetime( iresult[i] = parse_pydatetime(val, &dts, creso=creso) elif PyDate_Check(val): - item_reso = NPY_DATETIMEUNIT.NPY_FR_s + item_reso = NPY_DATETIMEUNIT.NPY_FR_us state.update_creso(item_reso) if infer_reso: creso = state.creso diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 2a080bcb19ae9..a16d14040d83a 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -33,6 +33,7 @@ from pandas._libs.missing cimport checknull_with_nat_and_na from pandas._libs.tslibs.dtypes cimport ( abbrev_to_npy_unit, get_supported_reso, + get_supported_reso_for_dts, npy_unit_to_attrname, periods_per_second, ) @@ -507,6 +508,9 @@ cdef _TSObject convert_datetime_to_tsobject( if nanos: obj.dts.ps = nanos * 1000 + reso = get_supported_reso_for_dts(reso, &obj.dts) + obj.creso = reso + try: obj.value = npy_datetimestruct_to_datetime(reso, &obj.dts) except OverflowError as err: @@ -622,7 +626,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, &out_tzoffset, False ) if not string_to_dts_failed: - reso = get_supported_reso(out_bestunit) + reso = get_supported_reso_for_dts(out_bestunit, &dts) check_dts_bounds(&dts, reso) obj = _TSObject() obj.dts = dts diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index d8c536a34bc04..e8e8c6a477773 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -1,6 +1,9 @@ from numpy cimport int64_t -from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT +from pandas._libs.tslibs.np_datetime cimport ( + NPY_DATETIMEUNIT, + npy_datetimestruct, +) cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit) @@ -9,6 +12,9 @@ cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) noexcept nogil cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1 cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1 cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso) +cdef NPY_DATETIMEUNIT get_supported_reso_for_dts( + NPY_DATETIMEUNIT reso, npy_datetimestruct* dts +) cdef bint is_supported_unit(NPY_DATETIMEUNIT reso) cdef dict c_OFFSET_TO_PERIOD_FREQSTR diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 4100f3d90e817..8913ff32720e8 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -2,11 +2,21 @@ # originals from enum import Enum +import numpy as np + +from cpython.object cimport ( + Py_GE, + Py_LE, +) + from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS from pandas._libs.tslibs.np_datetime cimport ( NPY_DATETIMEUNIT, + cmp_dtstructs, get_conversion_factor, import_pandas_datetime, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, ) import_pandas_datetime() @@ -504,6 +514,36 @@ cdef NPY_DATETIMEUNIT get_supported_reso(NPY_DATETIMEUNIT reso): return reso +cdef npy_datetimestruct dts_us_min, dts_us_max +pandas_datetime_to_datetimestruct( + np.iinfo(np.int64).min + 1, NPY_DATETIMEUNIT.NPY_FR_us, &dts_us_min +) +pandas_datetime_to_datetimestruct( + np.iinfo(np.int64).max, NPY_DATETIMEUNIT.NPY_FR_us, &dts_us_max +) + + +cdef NPY_DATETIMEUNIT get_supported_reso_for_dts( + NPY_DATETIMEUNIT reso, npy_datetimestruct* dts +): + # Similar to the above, but taking the actual datetime value into account, + # defaulting to 'us' if possible.
+ if reso == NPY_DATETIMEUNIT.NPY_FR_GENERIC: + return NPY_DATETIMEUNIT.NPY_FR_ns + if reso < NPY_DATETIMEUNIT.NPY_FR_us: + if ( + cmp_dtstructs(dts, &dts_us_min, Py_GE) + and cmp_dtstructs(dts, &dts_us_max, Py_LE) + ): + return NPY_DATETIMEUNIT.NPY_FR_us + else: + # TODO still distinguish between ms or s? + return NPY_DATETIMEUNIT.NPY_FR_s + elif reso > NPY_DATETIMEUNIT.NPY_FR_ns: + return NPY_DATETIMEUNIT.NPY_FR_ns + return reso + + cdef bint is_supported_unit(NPY_DATETIMEUNIT reso): return ( reso == NPY_DATETIMEUNIT.NPY_FR_ns diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index b443aa7bede22..84c2a449cdb06 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -53,6 +53,7 @@ from pandas._libs.tslibs.conversion cimport ( ) from pandas._libs.tslibs.dtypes cimport ( get_supported_reso, + get_supported_reso_for_dts, npy_unit_to_abbrev, npy_unit_to_attrname, ) @@ -421,7 +422,7 @@ def array_strptime( continue elif PyDate_Check(val): state.found_other = True - item_reso = NPY_DATETIMEUNIT.NPY_FR_s + item_reso = NPY_DATETIMEUNIT.NPY_FR_us state.update_creso(item_reso) if infer_reso: creso = state.creso @@ -460,7 +461,7 @@ def array_strptime( if string_to_dts_succeeded: # No error reported by string_to_dts, pick back up # where we left off - item_reso = get_supported_reso(out_bestunit) + item_reso = get_supported_reso_for_dts(out_bestunit, &dts) state.update_creso(item_reso) if infer_reso: creso = state.creso @@ -622,7 +623,7 @@ cdef tzinfo _parse_with_format( f"time data \"{val}\" doesn't match format \"{fmt}\"" ) - item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_s + item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us iso_year = -1 year = 1900 @@ -710,11 +711,7 @@ cdef tzinfo _parse_with_format( elif parse_code == 10: # e.g. 
val='10:10:10.100'; fmt='%H:%M:%S.%f' s = found_dict["f"] - if len(s) <= 3: - item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_ms - elif len(s) <= 6: - item_reso[0] = NPY_DATETIMEUNIT.NPY_FR_us - else: + if len(s) > 6: item_reso[0] = NPY_FR_ns # Pad to always return nanoseconds s += "0" * (9 - len(s)) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 3680273f5e98a..1765c4c589e8a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -65,7 +65,7 @@ def test_read_csv_local(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], - dtype="M8[s]", + dtype="M8[us]" if all_parsers.engine != "pyarrow" else "M8[s]", name="index", ), ) @@ -167,7 +167,7 @@ def test_read_csv_dataframe(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], - dtype="M8[s]", + dtype="M8[us]" if all_parsers.engine != "pyarrow" else "M8[s]", name="index", ), ) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index cfa8785b24bde..7570a4df1e89f 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -263,7 +263,7 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): datetime(2000, 1, 6), datetime(2000, 1, 7), ], - dtype="M8[s]", + dtype="M8[us]", ), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 348c19ac0f0c6..66207e94aa910 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -155,5 +155,5 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks) expected = df[:] - expected["date"] = expected["date"].astype("M8[s]") + expected["date"] = expected["date"].astype("M8[us]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9a15d9bc84a2e..75d295e431f0f 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -61,7 +61,6 @@ def test_date_col_as_index_col(all_parsers): datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 22, 0), ], - dtype="M8[s]", name="X1", ) expected = DataFrame( @@ -126,7 +125,7 @@ def test_parse_dates_string(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = date_range("1/1/2009", periods=3, name="date", unit="s")._with_freq(None) + index = date_range("1/1/2009", periods=3, name="date", unit="us")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -143,8 +142,6 @@ def test_parse_dates_column_list(all_parsers, parse_dates): expected = DataFrame( {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} ) - expected["a"] = expected["a"].astype("M8[s]") - expected["c"] = expected["c"].astype("M8[s]") expected = expected.set_index(["a", "b"]) result = parser.read_csv( @@ -168,7 +165,7 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers - dti = date_range("2009-01-01", periods=3, freq="D", unit="s") + dti = date_range("2009-01-01", periods=3, freq="D", unit="us") index = MultiIndex.from_product( [ dti, @@ -218,6 +215,7 @@ def 
test_parse_tz_aware(all_parsers): if parser.engine == "pyarrow": pytz = pytest.importorskip("pytz") expected_tz = pytz.utc + expected.index = expected.index.as_unit("s") else: expected_tz = timezone.utc tm.assert_frame_equal(result, expected) @@ -303,7 +301,7 @@ def test_parse_dates_empty_string(all_parsers): expected = DataFrame( [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] ) - expected["Date"] = expected["Date"].astype("M8[s]") + expected["Date"] = expected["Date"].astype("M8[us]") tm.assert_frame_equal(result, expected) @@ -314,22 +312,18 @@ def test_parse_dates_empty_string(all_parsers): ( "a\n04.15.2016", {"parse_dates": ["a"]}, - DataFrame([datetime(2016, 4, 15)], columns=["a"], dtype="M8[s]"), + DataFrame([datetime(2016, 4, 15)], columns=["a"]), ), ( "a\n04.15.2016", {"parse_dates": True, "index_col": 0}, - DataFrame( - index=DatetimeIndex(["2016-04-15"], dtype="M8[s]", name="a"), columns=[] - ), + DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": ["a", "b"]}, DataFrame( - [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], - dtype="M8[s]", - columns=["a", "b"], + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] ), ), ( @@ -339,8 +333,8 @@ def test_parse_dates_empty_string(all_parsers): index=MultiIndex.from_tuples( [ ( - Timestamp(2016, 4, 15).as_unit("s"), - Timestamp(2013, 9, 16).as_unit("s"), + Timestamp(2016, 4, 15), + Timestamp(2013, 9, 16), ) ], names=["a", "b"], @@ -411,7 +405,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=timezone(timedelta(minutes=540)), - unit="s", + unit="us", )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} @@ -450,7 +444,7 @@ def test_parse_delimited_date_swap_no_warning( all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[s]") + expected = DataFrame({0: [expected]}, dtype="datetime64[us]") if parser.engine == "pyarrow": if not dayfirst: # "CSV parse error: Empty CSV file or block" @@ -483,7 +477,7 @@ def test_parse_delimited_date_swap_with_warning( all_parsers, date_string, dayfirst, expected ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[s]") + expected = DataFrame({0: [expected]}, dtype="datetime64[us]") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." @@ -602,7 +596,6 @@ def test_date_parser_usecols_thousands(all_parsers): thousands="-", ) expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) - expected["C"] = expected["C"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -611,9 +604,7 @@ def test_dayfirst_warnings(): # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" - expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None, name="date" - ) + expected = DatetimeIndex(["2014-12-31", "2011-03-10"], freq=None, name="date") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." 
@@ -672,9 +663,7 @@ def test_dayfirst_warnings(): def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): # GH47880 initial_value = f"date\n{date_string}" - expected = DatetimeIndex( - ["2014-01-31"], dtype="datetime64[s]", freq=None, name="date" - ) + expected = DatetimeIndex(["2014-01-31"], freq=None, name="date") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." @@ -729,7 +718,7 @@ def test_replace_nans_before_parsing_dates(all_parsers): Timestamp("2017-09-09"), ] }, - dtype="M8[s]", + dtype="M8[us]", ) tm.assert_frame_equal(result, expected) @@ -744,7 +733,6 @@ def test_parse_dates_and_string_dtype(all_parsers): result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) expected["a"] = expected["a"].astype("string") - expected["b"] = expected["b"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -763,9 +751,7 @@ def test_parse_dot_separated_dates(all_parsers): warn = None else: expected_index = DatetimeIndex( - ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], - dtype="datetime64[ms]", - name="a", + ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], name="a" ) warn = UserWarning msg = r"when dayfirst=False \(the default\) was specified" @@ -798,7 +784,7 @@ def test_parse_dates_dict_format(all_parsers): "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], }, - dtype="M8[s]", + dtype="M8[us]", ) tm.assert_frame_equal(result, expected) @@ -840,6 +826,8 @@ def test_parse_dates_arrow_engine(all_parsers): "b": 1, } ) + if parser.engine == "pyarrow": + expected["a"] = expected["a"].dt.as_unit("s") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 6243185294894..792e6f8249fec 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -308,7 +308,6 @@ def test_fwf_regression(): parse_dates=True, date_format="%Y%j%H%M%S", ) - expected.index = expected.index.astype("M8[s]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 99642ee4befc6..619b53abec8c7 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -43,7 +43,7 @@ def test_skip_rows_bug(all_parsers, skiprows): ) index = Index( [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - dtype="M8[s]", + dtype="M8[us]", name=0, ) @@ -88,7 +88,7 @@ def test_skip_rows_blank(all_parsers): ) index = Index( [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], - dtype="M8[s]", + dtype="M8[us]", name=0, ) diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 479f2468a86ab..308fae487f842 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -826,7 +826,7 @@ def test_append_raise(setup_path, using_infer_string): msg = re.escape( "Cannot serialize the column [foo] " "because its data contents are not [string] " - "but [datetime64[s]] object dtype" + "but [datetime64[us]] object dtype" ) with pytest.raises(ValueError, match=msg): store.append("df", df) diff --git a/pandas/tests/io/pytables/test_round_trip.py b/pandas/tests/io/pytables/test_round_trip.py index 409b92d2ddde1..7ef1af0a3f516 100644 --- 
a/pandas/tests/io/pytables/test_round_trip.py +++ b/pandas/tests/io/pytables/test_round_trip.py @@ -252,8 +252,7 @@ def test_table_values_dtypes_roundtrip(setup_path, using_infer_string): "int8": 1, "int64": 1, str_dtype: 1, - "datetime64[s]": 2, - "datetime64[ms]": 1, + "datetime64[us]": 3, "datetime64[ns]": 1, }, name="count", diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 78c612472cc59..8a650a804af17 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -117,8 +117,6 @@ def from_uri(path): df2 = df1 expected = df1[:] - if format in ["csv", "excel"]: - expected["dt"] = expected["dt"].dt.as_unit("s") tm.assert_frame_equal(df2, expected) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1b9ae5d8e7209..5fcb4cbec7579 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1858,10 +1858,8 @@ def test_api_custom_dateparsing_error( if conn_name == "postgresql_adbc_types" and pa_version_under14p1: expected["DateCol"] = expected["DateCol"].astype("datetime64[ns]") - elif "postgres" in conn_name or "mysql" in conn_name: - expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") else: - expected["DateCol"] = expected["DateCol"].astype("datetime64[s]") + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 9bc88a7e0a824..eb966c55858fa 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -119,7 +119,7 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): expected = Series( [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5, - dtype="M8[s]", + dtype="M8[us]", ) expected[2] = np.nan ser[2] = np.nan @@ -146,7 +146,7 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache): expected = Series( [Timestamp("19801201"), Timestamp("19801201")] + [Timestamp("19810101")] * 5, - dtype="M8[s]", + dtype="M8[us]", ) expected[2] = np.nan ser[2] = np.nan @@ -160,7 +160,7 @@ def test_to_datetime_format_YYYYMMDD_oob_for_ns(self, cache): result = to_datetime(ser, format="%Y%m%d", errors="raise", cache=cache) expected = Series( np.array(["2012-12-31", "2014-12-31", "9999-12-31"], dtype="M8[s]"), - dtype="M8[s]", + dtype="M8[us]", ) tm.assert_series_equal(result, expected) @@ -169,7 +169,7 @@ def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # GH 7930 ser = Series([20121231, 20141231, 999999999999999999999999999991231]) result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache) - expected = Series(["20121231", "20141231", "NaT"], dtype="M8[s]") + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[us]") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -579,7 +579,7 @@ def test_to_datetime_mixed_date_and_string(self, format): # https://github.com/pandas-dev/pandas/issues/50108 d1 = date(2020, 1, 2) res = to_datetime(["2020-01-01", d1], format=format) - expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[s]") + expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[us]") tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -641,8 +641,6 @@ def test_to_datetime_mixed_datetime_and_string_with_format( ts1 = constructor(args[0]) ts2 = args[1] result = to_datetime([ts1, ts2], format=fmt, utc=utc) - if constructor is Timestamp: - expected = expected.as_unit("s") tm.assert_index_equal(result, 
expected) @pytest.mark.parametrize( @@ -714,7 +712,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%m-%d %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], - dtype="datetime64[s, UTC]", + dtype="datetime64[us, UTC]", ), id="ISO8601, UTC", ), @@ -722,7 +720,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%d-%m %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], - dtype="datetime64[s, UTC]", + dtype="datetime64[us, UTC]", ), id="non-ISO8601, UTC", ), @@ -1157,7 +1155,7 @@ def test_to_datetime_tz(self, cache): result = to_datetime(arr, cache=cache) expected = DatetimeIndex( ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" - ).as_unit("s") + ).as_unit("us") tm.assert_index_equal(result, expected) def test_to_datetime_tz_mixed(self, cache): @@ -1176,7 +1174,7 @@ def test_to_datetime_tz_mixed(self, cache): result = to_datetime(arr, cache=cache, errors="coerce") expected = DatetimeIndex( - ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[s, US/Pacific]" + ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[us, US/Pacific]" ) tm.assert_index_equal(result, expected) @@ -1469,17 +1467,15 @@ def test_to_datetime_cache_scalar(self): assert result == expected @pytest.mark.parametrize( - "datetimelikes,expected_values,exp_unit", + "datetimelikes,expected_values", ( ( (None, np.nan) + (NaT,) * start_caching_at, (NaT,) * (start_caching_at + 2), - "s", ), ( (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, - "s", ), ( (None,) @@ -1487,12 +1483,11 @@ def test_to_datetime_cache_scalar(self): + ("2012 July 26", Timestamp("2012-07-26")), (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), - "s", ), ), ) def test_convert_object_to_datetime_with_cache( - self, datetimelikes, expected_values, exp_unit + self, datetimelikes, expected_values ): # GH#39882 ser = Series( @@ -1500,10 +1495,10 @@ def test_convert_object_to_datetime_with_cache( dtype="object", ) result_series = to_datetime(ser, errors="coerce") - expected_series = Series( - expected_values, - dtype=f"datetime64[{exp_unit}]", - ) + expected_series = Series(expected_values, dtype="datetime64[us]") + if expected_series.isna().all(): + # TODO should this also be `us`? 
+ expected_series = expected_series.astype("datetime64[s]") tm.assert_series_equal(result_series, expected_series) @pytest.mark.parametrize( @@ -1578,13 +1573,15 @@ def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): format.startswith("%B") ^ outofbounds.startswith("J") ): # the strings don't match the given format, so they raise and we coerce - expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[s]") + expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[us]") elif isinstance(outofbounds, datetime): expected = DatetimeIndex( [datetime(2018, 3, 1), outofbounds], dtype="M8[us]" ) else: - expected = DatetimeIndex([datetime(2018, 3, 1), outofbounds], dtype="M8[s]") + expected = DatetimeIndex( + [datetime(2018, 3, 1), outofbounds], dtype="M8[us]" + ) tm.assert_index_equal(result, expected) def test_to_datetime_malformed_no_raise(self): @@ -1644,7 +1641,7 @@ def test_iso_8601_strings_with_different_offsets_utc(self): result = to_datetime(ts_strings, utc=True) expected = DatetimeIndex( [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" - ).as_unit("s") + ).as_unit("us") tm.assert_index_equal(result, expected) def test_mixed_offsets_with_native_datetime_utc_false_raises(self): @@ -1670,7 +1667,7 @@ def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) expected = DatetimeIndex( [datetime(2018, 3, 1, 12, tzinfo=timezone(timedelta(minutes=240)))] * 2 - ).as_unit("s") + ).as_unit("us") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1693,7 +1690,7 @@ def test_to_datetime_with_format_out_of_bounds(self, dt_str): # GH 9107 res = to_datetime(dt_str, format="%Y%m%d") dtobj = datetime.strptime(dt_str, "%Y%m%d") - expected = Timestamp(dtobj).as_unit("s") + expected = Timestamp(dtobj).as_unit("us") assert res == expected assert res.unit == expected.unit @@ -2214,7 +2211,7 @@ def test_dataframe_utc_true(self): df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = to_datetime(df, utc=True) expected = Series( - np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[s]") + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[us]") ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -2420,7 +2417,7 @@ def test_to_datetime_with_space_in_series(self, cache): result_coerce = to_datetime(ser, errors="coerce", cache=cache) expected_coerce = Series( [datetime(2006, 10, 18), datetime(2008, 10, 18), NaT] - ).dt.as_unit("s") + ).dt.as_unit("us") tm.assert_series_equal(result_coerce, expected_coerce) @td.skip_if_not_us_locale @@ -2541,7 +2538,7 @@ def test_string_na_nat_conversion(self, cache): strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) - expected = np.empty(4, dtype="M8[s]") + expected = np.empty(4, dtype="M8[us]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -2586,7 +2583,7 @@ def test_string_na_nat_conversion_with_name(self, cache): result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype="M8[s]"), index=idx) + expected = Series(np.empty(5, dtype="M8[us]"), index=idx) for i in range(5): x = series.iloc[i] if isna(x): @@ -2626,7 +2623,7 @@ def test_dayfirst(self, cache): arr = ["10/02/2014", "11/02/2014", "12/02/2014"] expected = DatetimeIndex( [datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)] - ).as_unit("s") + ).as_unit("us") idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = 
DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -2650,7 +2647,7 @@ def test_dayfirst_warnings_valid_input(self): # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None + ["2014-12-31", "2011-03-10"], dtype="datetime64[us]", freq=None ) # A. dayfirst arg correct, no warning @@ -2755,7 +2752,7 @@ def test_to_datetime_consistent_format(self, cache): ser = Series(np.array(data)) result = to_datetime(ser, cache=cache) expected = Series( - ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[s]" + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[us]" ) tm.assert_series_equal(result, expected) @@ -2767,7 +2764,9 @@ def test_to_datetime_series_with_nans(self, cache): ) ) result = to_datetime(ser, cache=cache) - expected = Series(["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[s]") + expected = Series( + ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[us]" + ) tm.assert_series_equal(result, expected) def test_to_datetime_series_start_with_nans(self, cache): @@ -2786,7 +2785,7 @@ def test_to_datetime_series_start_with_nans(self, cache): result = to_datetime(ser, cache=cache) expected = Series( - [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[s]" + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[us]" ) tm.assert_series_equal(result, expected) @@ -2800,7 +2799,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) - expected = expected.dt.as_unit("s") + expected = expected.dt.as_unit("us") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -2964,9 +2963,9 @@ def test_parsers(self, date_str, expected, cache): reso = { "nanosecond": "ns", "microsecond": "us", - "millisecond": "ms", - "second": "s", - }.get(reso_attrname, "s") + "millisecond": "us", + "second": "us", + }.get(reso_attrname, "us") result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below @@ -3411,7 +3410,7 @@ def test_empty_string_datetime(errors, args, format): # coerce empty string to pd.NaT result = to_datetime(td, format=format, errors=errors) - expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[s]") + expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[us]") tm.assert_series_equal(expected, result) @@ -3596,7 +3595,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 vals = ["2020-01-01 00:00+00:00", ""] result = to_datetime(vals, format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[s, UTC]") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[us, UTC]") tm.assert_index_equal(result, expected) # Check that a couple of other similar paths work the same way @@ -3769,3 +3768,77 @@ def test_to_datetime_wrapped_datetime64_ps(): ["1970-01-01 00:00:01.901901901"], dtype="datetime64[ns]", freq=None ) tm.assert_index_equal(result, expected) + + +class TestToDatetimeInferUnit: + @pytest.mark.parametrize( + "hour,unit", + [ + ("", "us"), + ("T09:00", "us"), + ("T09:00:00", "us"), + ("T09:00:00.123", "us"), + ("T09:00:00.123456", "us"), + ("T09:00:00.123456789", "ns"), + ("T09:00:00.123456789123", "ns"), + ], + ) + def test_strings(self, hour, unit): + 
result = to_datetime(["2020-01-01" + hour, "2020-01-02" + hour]) + assert result.dtype == f"datetime64[{unit}]" + + # parsing from out of bounds date does not actually work + # def test_strings_out_of_bounds(self): + # pd.to_datetime(["-290301-01-01"], format="ISO8601") + + @pytest.mark.parametrize( + "dt", + [ + datetime(2020, 1, 1), + datetime(2020, 1, 1, 9, 0, 30), + datetime(2020, 1, 1, 9, 0, 30, 123), + datetime(2020, 1, 1, 9, 0, 30, 123456), + ], + ) + def test_datetime_datetime(self, dt): + # range of datetime.datetime/date objects are always supported for us + result = to_datetime([dt, dt]) + assert result.dtype == "datetime64[us]" + + @pytest.mark.parametrize("year", [2012, 9999]) + def test_datetime_date(self, year): + dt = date(year, 1, 1) + result = to_datetime([dt, dt]) + assert result.dtype == "datetime64[us]" + + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_np_datetime64_array(self, unit): + # numpy datetime64 already has a unit -> preserve that in this case + arr = np.array(["2020-01-01T09:00:30.123456"], dtype=f"datetime64[{unit}]") + result = to_datetime(arr) + assert result.dtype == arr.dtype + + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_np_datetime64_objects(self, unit): + dt = np.datetime64("2020-01-01T09:00:30.123456", unit) + result = to_datetime([dt, dt]) + assert result.dtype == dt.dtype + + @pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) + def test_timestamp_objects(self, unit): + ts = Timestamp("2020-01-01T09:00:30").as_unit(unit) + result = to_datetime([ts, ts]) + assert result.dtype == f"datetime64[{unit}]" + + # @pytest.mark.parametrize("year", [2012, 9999]) + # def test_dataframe_components(self, year): + # df = pd.DataFrame({ + # "year": [year, year], + # "month": [1, 1], + # "day": [1, 2], + # "hour": [9, 10], + # "minute": [0, 30], + # "second": [30, 45], + # }) + # result = to_datetime(df) + # assert result.dtype == "datetime64[us]" diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index fc0000553049e..d319f4674e6d3 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -45,7 +45,7 @@ def test_infer_homogeoneous_date_objects(self): arr = np.array([None, dt2, dt2, dt2], dtype=object) result, tz = tslib.array_to_datetime(arr, creso=creso_infer) assert tz is None - expected = np.array([np.datetime64("NaT"), dt2, dt2, dt2], dtype="M8[s]") + expected = np.array([np.datetime64("NaT"), dt2, dt2, dt2], dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) def test_infer_homogeoneous_dt64(self): @@ -111,7 +111,7 @@ def test_array_to_datetime_with_tz_resolution(self): tz = tzoffset("custom", 3600) vals = np.array(["2016-01-01 02:03:04.567", NaT], dtype=object) res = tslib.array_to_datetime_with_tz(vals, tz, False, False, creso_infer) - assert res.dtype == "M8[ms]" + assert res.dtype == "M8[us]" vals2 = np.array([datetime(2016, 1, 1, 2, 3, 4), NaT], dtype=object) res2 = tslib.array_to_datetime_with_tz(vals2, tz, False, False, creso_infer) @@ -155,7 +155,7 @@ def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(expected, dtype="M8[s]") + expected = np.array(expected, dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) @@ -209,10 +209,10 @@ def test_parsing_different_timezone_offsets(): @pytest.mark.parametrize( "invalid_date,exp_unit", [ - (date(1000, 1, 1), "s"), + (date(1000, 1, 1), 
"us"), (datetime(1000, 1, 1), "us"), - ("1000-01-01", "s"), - ("Jan 1, 1000", "s"), + ("1000-01-01", "us"), + ("Jan 1, 1000", "us"), (np.datetime64("1000-01-01"), "s"), ], ) @@ -235,7 +235,7 @@ def test_coerce_outside_ns_bounds_one_valid(): result, _ = tslib.array_to_datetime(arr, errors="coerce") expected = ["1000-01-01T00:00:00.000000000", "2000-01-01T00:00:00.000000000"] - expected = np.array(expected, dtype="M8[s]") + expected = np.array(expected, dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) @@ -245,13 +245,13 @@ def test_coerce_of_invalid_datetimes(): # With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[us]")) # With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[us]")) def test_to_datetime_barely_out_of_bounds(): diff --git a/pandas/tests/tslibs/test_strptime.py b/pandas/tests/tslibs/test_strptime.py index d726006b03f6d..0905d6926bf2f 100644 --- a/pandas/tests/tslibs/test_strptime.py +++ b/pandas/tests/tslibs/test_strptime.py @@ -36,7 +36,9 @@ def test_array_strptime_resolution_inference_homogeneous_strings(self, tz): fmt = "%Y-%m-%d %H:%M:%S" dtstr = dt.strftime(fmt) arr = np.array([dtstr] * 3, dtype=object) - expected = np.array([dt.replace(tzinfo=None)] * 3, dtype="M8[s]") + expected = np.array( + [dt.replace(tzinfo=None, microsecond=0)] * 3, dtype="M8[us]" + ) res, _ = array_strptime(arr, fmt=fmt, utc=False, creso=creso_infer) tm.assert_numpy_array_equal(res, expected) @@ -97,14 +99,14 @@ def test_array_strptime_resolution_todaynow(self): def test_array_strptime_str_outside_nano_range(self): vals = np.array(["2401-09-15"], dtype=object) - expected = np.array(["2401-09-15"], dtype="M8[s]") + expected = np.array(["2401-09-15"], dtype="M8[us]") fmt = "ISO8601" res, _ = array_strptime(vals, fmt=fmt, creso=creso_infer) tm.assert_numpy_array_equal(res, expected) # non-iso -> different path vals2 = np.array(["Sep 15, 2401"], dtype=object) - expected2 = np.array(["2401-09-15"], dtype="M8[s]") + expected2 = np.array(["2401-09-15"], dtype="M8[us]") fmt2 = "%b %d, %Y" res2, _ = array_strptime(vals2, fmt=fmt2, creso=creso_infer) tm.assert_numpy_array_equal(res2, expected2)