Merge branch 'main' into checks_extensions.ExtensionArray

mroeschke · web-flow · commit e1837a451bfe · 2024-08-19T10:13:07.000-10:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -543,7 +543,7 @@ Datetimelike
 - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`)
 - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`)
 - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
-- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
+- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
 - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
 - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
 - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`)
diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
@@ -20,14 +20,12 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #endif // NPY_NO_DEPRECATED_API
 
-#include <Python.h>
-
 #include "pandas/vendored/numpy/datetime/np_datetime.h"
-
 #define NO_IMPORT_ARRAY
 #define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY
 #include <numpy/ndarrayobject.h>
 #include <numpy/npy_common.h>
+#include <stdbool.h>
 
 #if defined(_WIN32)
 #ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
@@ -58,12 +56,15 @@ _Static_assert(0, "__has_builtin not detected; please try a newer compiler");
 #endif
 #endif
 
+#define XSTR(a) STR(a) /* expand `a` (e.g. __LINE__) before stringifying it */
+#define STR(a) #a /* turn the argument tokens into a string literal */
+
 #define PD_CHECK_OVERFLOW(FUNC)                                                \
   do {                                                                         \
     if ((FUNC) != 0) {                                                         \
       PyGILState_STATE gstate = PyGILState_Ensure();                           \
       PyErr_SetString(PyExc_OverflowError,                                     \
-                      "Overflow occurred in npy_datetimestruct_to_datetime");  \
+                      "Overflow occurred at " __FILE__ ":" XSTR(__LINE__));    \
       PyGILState_Release(gstate);                                              \
       return -1;                                                               \
     }                                                                          \
@@ -139,53 +140,53 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
   npy_int64 year, days = 0;
   const int *month_lengths;
 
-  year = dts->year - 1970;
-  days = year * 365;
+  PD_CHECK_OVERFLOW(checked_int64_sub(dts->year, 1970, &year));
+  PD_CHECK_OVERFLOW(checked_int64_mul(year, 365, &days));
 
   /* Adjust for leap years */
   if (days >= 0) {
     /*
      * 1968 is the closest leap year before 1970.
      * Exclude the current year, so add 1.
      */
-    year += 1;
+    PD_CHECK_OVERFLOW(checked_int64_add(year, 1, &year));
     /* Add one day for each 4 years */
-    days += year / 4;
+    PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days));
     /* 1900 is the closest previous year divisible by 100 */
-    year += 68;
+    PD_CHECK_OVERFLOW(checked_int64_add(year, 68, &year));
     /* Subtract one day for each 100 years */
-    days -= year / 100;
+    PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days));
     /* 1600 is the closest previous year divisible by 400 */
-    year += 300;
+    PD_CHECK_OVERFLOW(checked_int64_add(year, 300, &year));
     /* Add one day for each 400 years */
-    days += year / 400;
+    PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days));
   } else {
     /*
      * 1972 is the closest later year after 1970.
      * Include the current year, so subtract 2.
      */
-    year -= 2;
+    PD_CHECK_OVERFLOW(checked_int64_sub(year, 2, &year));
     /* Subtract one day for each 4 years */
-    days += year / 4;
+    PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days));
     /* 2000 is the closest later year divisible by 100 */
-    year -= 28;
+    PD_CHECK_OVERFLOW(checked_int64_sub(year, 28, &year));
     /* Add one day for each 100 years */
-    days -= year / 100;
+    PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days));
     /* 2000 is also the closest later year divisible by 400 */
     /* Subtract one day for each 400 years */
-    days += year / 400;
+    PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days));
   }
 
   month_lengths = days_per_month_table[is_leapyear(dts->year)];
   month = dts->month - 1;
 
   /* Add the months */
   for (i = 0; i < month; ++i) {
-    days += month_lengths[i];
+    PD_CHECK_OVERFLOW(checked_int64_add(days, month_lengths[i], &days));
   }
 
   /* Add the days */
-  days += dts->day - 1;
+  PD_CHECK_OVERFLOW(checked_int64_add(days, dts->day - 1, &days));
 
   return days;
 }
@@ -430,6 +431,15 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
   }
 
   const int64_t days = get_datetimestruct_days(dts);
+  if (days == -1) {
+    PyGILState_STATE gstate = PyGILState_Ensure();
+    bool did_error = PyErr_Occurred() == NULL ? false : true;
+    PyGILState_Release(gstate);
+    if (did_error) {
+      return -1;
+    }
+  }
+
   if (base == NPY_FR_D) {
     return days;
   }
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
@@ -1137,6 +1137,21 @@ def test_infer_string_large_string_type(self, tmp_path, pa):
     #     assert result["strings"].dtype == "string"
     # FIXME: don't leave commented-out
 
+    def test_non_nanosecond_timestamps(self, temp_file):
+        # GH#49236: a pa.timestamp("us") column must read back as datetime64[us]
+        pa = pytest.importorskip("pyarrow", "11.0.0")
+        pq = pytest.importorskip("pyarrow.parquet")
+
+        arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us"))
+        table = pa.table([arr], names=["timestamp"])
+        pq.write_table(table, temp_file)
+        result = read_parquet(temp_file)
+        expected = pd.DataFrame(
+            data={"timestamp": [datetime.datetime(1600, 1, 1)]},
+            dtype="datetime64[us]",
+        )
+        tm.assert_frame_equal(result, expected)
+
 
 class TestParquetFastParquet(Base):
     @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values")
@@ -1178,6 +1193,10 @@ def test_duplicate_columns(self, fp):
         msg = "Cannot create parquet dataset with duplicate column names"
         self.check_error_on_write(df, fp, ValueError, msg)
 
+    @pytest.mark.xfail(
+        Version(np.__version__) >= Version("2.0.0"),
+        reason="fastparquet uses np.float_ in numpy2",
+    )
     def test_bool_with_none(self, fp):
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py
@@ -45,6 +45,7 @@
     _check_visible,
     get_y_axis,
 )
+from pandas.util.version import Version
 
 from pandas.io.formats.printing import pprint_thing
 
@@ -2465,8 +2466,14 @@ def test_group_subplot_invalid_column_name(self):
         d = {"a": np.arange(10), "b": np.arange(10)}
         df = DataFrame(d)
 
-        with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"):
-            df.plot(subplots=[("a", "bad_name")])
+        if Version(np.__version__) < Version("2.0.0"):
+            with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"):
+                df.plot(subplots=[("a", "bad_name")])
+        else:
+            with pytest.raises(
+                ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]"
+            ):
+                df.plot(subplots=[("a", "bad_name")])
 
     def test_group_subplot_duplicated_column(self):
         d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)}