Emit a warning when converting datetime or timedelta values to nanosecond precision (#7201)

spencerkclark · Illviljan · web-flow · commit be6594e6e327 · 2022-10-26T10:00:33.000-06:00
* [test-upstream] Emit a warning when converting to nanosecond precision

* Assert nanosecond dtype after conversion in all cases

* Apply suggestions from code review

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

* [test-upstream] add separate test for second-precision TimedeltaIndex

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -76,6 +76,13 @@ Internal Changes
   encoding times to preserve existing behavior and prevent future errors when it
   is eventually set to ``True`` by default in cftime (:pull:`7171`).  By
   `Spencer Clark <https://github.com/spencerkclark>`_.
+- Emit a warning under the development version of pandas when we convert
+  non-nanosecond precision datetime or timedelta values to nanosecond precision.
+  This was required in the past, because pandas previously was not compatible
+  with non-nanosecond precision values.  However, pandas is currently working
+  towards removing this restriction.  When things stabilize in pandas we will
+  likely consider relaxing this behavior in xarray as well (:issue:`7175`,
+  :pull:`7201`).  By `Spencer Clark <https://github.com/spencerkclark>`_.
 
 .. _whats-new.2022.10.0:
 
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -74,6 +74,12 @@
         T_Variable,
     )
 
+NON_NANOSECOND_WARNING = (  # template: ``case`` is "datetime" or "timedelta"
+    "Converting non-nanosecond precision {case} values to nanosecond precision. "
+    "This behavior can eventually be relaxed in xarray, as it is an artifact from "
+    "pandas which is now beginning to support non-nanosecond precision values."
+)
+
 
 class MissingDimensionsError(ValueError):
     """Error class used when we can't safely guess a dimension name."""
@@ -180,13 +186,58 @@ def _maybe_wrap_data(data):
     return data
 
 
+def _as_nanosecond_precision(data):
+    """Cast non-nanosecond datetime/timedelta data to nanosecond precision (warns)."""
+    dtype = data.dtype
+    is_tz_aware = isinstance(dtype, pd.DatetimeTZDtype)
+    if is_tz_aware:
+        needs_datetime_cast = dtype.unit != "ns"
+    else:
+        needs_datetime_cast = (
+            dtype.kind == "M"
+            and isinstance(dtype, np.dtype)
+            and dtype != np.dtype("datetime64[ns]")
+        )
+    if needs_datetime_cast:
+        warnings.warn(NON_NANOSECOND_WARNING.format(case="datetime"))
+        target_dtype = (
+            pd.DatetimeTZDtype("ns", dtype.tz) if is_tz_aware else "datetime64[ns]"
+        )
+        return data.astype(target_dtype)
+    if dtype.kind == "m" and dtype != np.dtype("timedelta64[ns]"):
+        warnings.warn(NON_NANOSECOND_WARNING.format(case="timedelta"))
+        return data.astype("timedelta64[ns]")
+    return data
+
+
 def _possibly_convert_objects(values):
     """Convert arrays of datetime.datetime and datetime.timedelta objects into
-    datetime64 and timedelta64, according to the pandas convention. Also used for
-    validating that datetime64 and timedelta64 objects are within the valid date
-    range for ns precision, as pandas will raise an error if they are not.
+    datetime64 and timedelta64, according to the pandas convention. For the time
+    being, convert any non-nanosecond precision datetime64 or timedelta64 values
+    to nanosecond precision.  While pandas is relaxing this in version
+    2.0.0, in xarray we will need to make sure we are ready to handle
+    non-nanosecond precision datetimes or timedeltas in our code before allowing
+    such values to pass through unchanged.  Converting to nanosecond precision
+    through pandas.Series objects ensures that datetimes and timedeltas are
+    within the valid date range for ns precision, as pandas will raise an error
+    if they are not.
     """
-    return np.asarray(pd.Series(values.ravel())).reshape(values.shape)
+    as_series = pd.Series(values.ravel())
+    if as_series.dtype.kind in "mM":
+        as_series = _as_nanosecond_precision(as_series)
+    return np.asarray(as_series).reshape(values.shape)
+
+
+def _possibly_convert_datetime_or_timedelta_index(data):
+    """Cast a non-nanosecond precision DatetimeIndex or TimedeltaIndex to
+    nanosecond precision; anything else is returned untouched.  pandas is
+    relaxing the nanosecond requirement in version 2.0.0, but xarray needs to be
+    ready to handle other precisions before letting such values pass through
+    unchanged."""
+    index_types = (pd.DatetimeIndex, pd.TimedeltaIndex)
+    if not isinstance(data, index_types):
+        return data
+    return _as_nanosecond_precision(data)
 
 
 def as_compatible_data(data, fastpath=False):
@@ -210,6 +261,7 @@ def as_compatible_data(data, fastpath=False):
         return data.data
 
     if isinstance(data, NON_NUMPY_SUPPORTED_ARRAY_TYPES):
+        data = _possibly_convert_datetime_or_timedelta_index(data)
         return _maybe_wrap_data(data)
 
     if isinstance(data, tuple):
diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
@@ -89,6 +89,11 @@ def _importorskip(
 requires_scipy_or_netCDF4 = pytest.mark.skipif(
     not has_scipy_or_netCDF4, reason="requires scipy or netCDF4"
 )
+# _importorskip does not work for development versions; check the major version
+has_pandas_version_two = Version(pd.__version__).major >= 2
+requires_pandas_version_two = pytest.mark.skipif(
+    not has_pandas_version_two, reason="requires pandas 2.0.0"
+)
 
 # change some global options for tests
 set_options(warn_for_unclosed_files=True)
diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py
@@ -36,9 +36,11 @@
     assert_equal,
     assert_identical,
     assert_no_warnings,
+    has_pandas_version_two,
     raise_if_dask_computes,
     requires_cupy,
     requires_dask,
+    requires_pandas_version_two,
     requires_pint,
     requires_sparse,
     source_ndarray,
@@ -2520,6 +2522,26 @@ def test_datetime(self):
         assert np.ndarray == type(actual)
         assert np.dtype("datetime64[ns]") == actual.dtype
 
+    @requires_pandas_version_two
+    def test_tz_datetime(self) -> None:
+        tz = pytz.timezone("US/Eastern")
+        times_ns = pd.date_range("2000", periods=1, tz=tz)
+        # A second-precision, timezone-aware index should be cast to nanoseconds.
+        times_s = times_ns.astype(pd.DatetimeTZDtype("s", tz))
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            actual = as_compatible_data(times_s)
+        assert actual.array == times_s
+        assert actual.array.dtype == pd.DatetimeTZDtype("ns", tz)
+        # The same data wrapped in a Series should come back as datetime64[ns].
+        series = pd.Series(times_s)
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            actual = as_compatible_data(series)
+
+        np.testing.assert_array_equal(actual, series.values)
+        assert actual.dtype == np.dtype("datetime64[ns]")
+
     def test_full_like(self) -> None:
         # For more thorough tests, see test_variable.py
         orig = Variable(
@@ -2790,3 +2812,110 @@ def test_from_pint_wrapping_dask(self, Var):
         result = v.as_numpy()
         assert_identical(result, Var("x", arr))
         np.testing.assert_equal(v.to_numpy(), arr)
+
+
+@pytest.mark.parametrize(
+    ("values", "warns_under_pandas_version_two"),
+    [
+        (np.datetime64("2000-01-01", "ns"), False),
+        (np.datetime64("2000-01-01", "s"), True),
+        (np.array([np.datetime64("2000-01-01", "ns")]), False),
+        (np.array([np.datetime64("2000-01-01", "s")]), True),
+        (pd.date_range("2000", periods=1), False),
+        (datetime(2000, 1, 1), False),
+        (np.array([datetime(2000, 1, 1)]), False),
+        (pd.date_range("2000", periods=1, tz=pytz.timezone("US/Eastern")), False),
+        (
+            pd.Series(pd.date_range("2000", periods=1, tz=pytz.timezone("US/Eastern"))),
+            False,
+        ),
+    ],
+    ids=lambda x: f"{x}",
+)
+def test_datetime_conversion_warning(values, warns_under_pandas_version_two) -> None:
+    dims = ["time"] if isinstance(values, (np.ndarray, pd.Index, pd.Series)) else []
+    if warns_under_pandas_version_two and has_pandas_version_two:
+        with pytest.warns(UserWarning, match="non-nanosecond precision datetime"):
+            var = Variable(dims, values)
+    else:
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            var = Variable(dims, values)
+    # Whatever the input precision, the stored values must be nanosecond precision.
+    if var.dtype.kind == "M":
+        assert var.dtype == np.dtype("datetime64[ns]")
+    else:
+        # The only case where a non-datetime64 dtype can occur currently is in
+        # the case that the variable is backed by a timezone-aware
+        # DatetimeIndex, and thus is hidden within the PandasIndexingAdapter class.
+        assert var._data.array.dtype == pd.DatetimeTZDtype(
+            "ns", pytz.timezone("US/Eastern")
+        )
+
+
+@requires_pandas_version_two
+def test_pandas_two_only_datetime_conversion_warnings() -> None:
+    # These cases rely on pandas features that are only present in pandas 2.0.0
+    # and above, so for now they are looped over here rather than parametrized.
+    cases = [
+        (pd.date_range("2000", periods=1), "datetime64[s]"),
+        (pd.Series(pd.date_range("2000", periods=1)), "datetime64[s]"),
+        (
+            pd.date_range("2000", periods=1, tz=pytz.timezone("US/Eastern")),
+            pd.DatetimeTZDtype("s", pytz.timezone("US/Eastern")),
+        ),
+        (
+            pd.Series(pd.date_range("2000", periods=1, tz=pytz.timezone("US/Eastern"))),
+            pd.DatetimeTZDtype("s", pytz.timezone("US/Eastern")),
+        ),
+    ]
+    for data, dtype in cases:
+        with pytest.warns(UserWarning, match="non-nanosecond precision datetime"):
+            var = Variable(["time"], data.astype(dtype))
+        # Check the dtype inside the loop so every case is verified, not just the last.
+        if var.dtype.kind == "M":
+            assert var.dtype == np.dtype("datetime64[ns]")
+        else:
+            # The only case where a non-datetime64 dtype can occur currently is in
+            # the case that the variable is backed by a timezone-aware
+            # DatetimeIndex, and thus is hidden within the PandasIndexingAdapter class.
+            assert var._data.array.dtype == pd.DatetimeTZDtype(
+                "ns", pytz.timezone("US/Eastern")
+            )
+
+
+@pytest.mark.parametrize(
+    ("values", "warns_under_pandas_version_two"),
+    [
+        (np.timedelta64(10, "ns"), False),
+        (np.timedelta64(10, "s"), True),
+        (np.array([np.timedelta64(10, "ns")]), False),
+        (np.array([np.timedelta64(10, "s")]), True),
+        (pd.timedelta_range("1", periods=1), False),
+        (timedelta(days=1), False),
+        (np.array([timedelta(days=1)]), False),
+    ],
+    ids=lambda x: f"{x}",
+)
+def test_timedelta_conversion_warning(values, warns_under_pandas_version_two) -> None:
+    dims = ["time"] if isinstance(values, (np.ndarray, pd.Index)) else []
+    if warns_under_pandas_version_two and has_pandas_version_two:
+        with pytest.warns(UserWarning, match="non-nanosecond precision timedelta"):
+            var = Variable(dims, values)
+    else:
+        with warnings.catch_warnings():
+            warnings.simplefilter("error")
+            var = Variable(dims, values)
+    # All inputs, whatever their precision, must end up as timedelta64[ns].
+    assert var.dtype == np.dtype("timedelta64[ns]")
+
+
+@requires_pandas_version_two
+def test_pandas_two_only_timedelta_conversion_warning() -> None:
+    # Note this test relies on a pandas feature that is only present in pandas
+    # 2.0.0 and above, and so for now cannot be parametrized.
+    data = pd.timedelta_range("1", periods=1).astype("timedelta64[s]")
+    with pytest.warns(UserWarning, match="non-nanosecond precision timedelta"):
+        var = Variable(["time"], data)
+    # The second-precision index must have been cast to nanosecond precision.
+    assert var.dtype == np.dtype("timedelta64[ns]")