Skip to content

Commit 408d1df

Browse files
authored
Merge branch 'main' into fix-docstring-timestamps#59458
2 parents 417b1d9 + ca2b8c3 commit 408d1df

File tree

14 files changed

+168
-102
lines changed

14 files changed

+168
-102
lines changed

doc/source/development/contributing_codebase.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -762,8 +762,7 @@ install pandas) by typing::
762762
your installation is probably fine and you can start contributing!
763763

764764
Often it is worth running only a subset of tests first around your changes before running the
765-
entire suite (tip: you can use the `pandas-coverage app <https://pandas-coverage-12d2130077bc.herokuapp.com/>`_)
766-
to find out which tests hit the lines of code you've modified, and then run only those).
765+
entire suite.
767766

768767
The easiest way to do this is with::
769768

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ Datetimelike
543543
- Bug in :attr:`is_year_start` where a DatetimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`)
544544
- Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`)
545545
- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
546-
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`)
546+
- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`)
547547
- Bug in :func:`tseries.api.guess_datetime_format` where it would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`)
548548
- Bug in :func:`tseries.frequencies.to_offset` where it would fail to parse frequency strings starting with "LWOM" (:issue:`59218`)
549549
- Bug in :meth:`DataFrame.agg` with df with missing values resulting in IndexError (:issue:`58810`)

pandas/_libs/src/vendored/numpy/datetime/np_datetime.c

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,12 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
2020
#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
2121
#endif // NPY_NO_DEPRECATED_API
2222

23-
#include <Python.h>
24-
2523
#include "pandas/vendored/numpy/datetime/np_datetime.h"
26-
2724
#define NO_IMPORT_ARRAY
2825
#define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY
2926
#include <numpy/ndarrayobject.h>
3027
#include <numpy/npy_common.h>
28+
#include <stdbool.h>
3129

3230
#if defined(_WIN32)
3331
#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
@@ -58,12 +56,15 @@ _Static_assert(0, "__has_builtin not detected; please try a newer compiler");
5856
#endif
5957
#endif
6058

59+
#define XSTR(a) STR(a)
60+
#define STR(a) #a
61+
6162
#define PD_CHECK_OVERFLOW(FUNC) \
6263
do { \
6364
if ((FUNC) != 0) { \
6465
PyGILState_STATE gstate = PyGILState_Ensure(); \
6566
PyErr_SetString(PyExc_OverflowError, \
66-
"Overflow occurred in npy_datetimestruct_to_datetime"); \
67+
"Overflow occurred at " __FILE__ ":" XSTR(__LINE__)); \
6768
PyGILState_Release(gstate); \
6869
return -1; \
6970
} \
@@ -139,53 +140,53 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) {
139140
npy_int64 year, days = 0;
140141
const int *month_lengths;
141142

142-
year = dts->year - 1970;
143-
days = year * 365;
143+
PD_CHECK_OVERFLOW(checked_int64_sub(dts->year, 1970, &year));
144+
PD_CHECK_OVERFLOW(checked_int64_mul(year, 365, &days));
144145

145146
/* Adjust for leap years */
146147
if (days >= 0) {
147148
/*
148149
* 1968 is the closest leap year before 1970.
149150
* Exclude the current year, so add 1.
150151
*/
151-
year += 1;
152+
PD_CHECK_OVERFLOW(checked_int64_add(year, 1, &year));
152153
/* Add one day for each 4 years */
153-
days += year / 4;
154+
PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days));
154155
/* 1900 is the closest previous year divisible by 100 */
155-
year += 68;
156+
PD_CHECK_OVERFLOW(checked_int64_add(year, 68, &year));
156157
/* Subtract one day for each 100 years */
157-
days -= year / 100;
158+
PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days));
158159
/* 1600 is the closest previous year divisible by 400 */
159-
year += 300;
160+
PD_CHECK_OVERFLOW(checked_int64_add(year, 300, &year));
160161
/* Add one day for each 400 years */
161-
days += year / 400;
162+
PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days));
162163
} else {
163164
/*
164165
* 1972 is the closest later year after 1970.
165166
* Include the current year, so subtract 2.
166167
*/
167-
year -= 2;
168+
PD_CHECK_OVERFLOW(checked_int64_sub(year, 2, &year));
168169
/* Subtract one day for each 4 years */
169-
days += year / 4;
170+
PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days));
170171
/* 2000 is the closest later year divisible by 100 */
171-
year -= 28;
172+
PD_CHECK_OVERFLOW(checked_int64_sub(year, 28, &year));
172173
/* Add one day for each 100 years */
173-
days -= year / 100;
174+
PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days));
174175
/* 2000 is also the closest later year divisible by 400 */
175176
/* Subtract one day for each 400 years */
176-
days += year / 400;
177+
PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days));
177178
}
178179

179180
month_lengths = days_per_month_table[is_leapyear(dts->year)];
180181
month = dts->month - 1;
181182

182183
/* Add the months */
183184
for (i = 0; i < month; ++i) {
184-
days += month_lengths[i];
185+
PD_CHECK_OVERFLOW(checked_int64_add(days, month_lengths[i], &days));
185186
}
186187

187188
/* Add the days */
188-
days += dts->day - 1;
189+
PD_CHECK_OVERFLOW(checked_int64_add(days, dts->day - 1, &days));
189190

190191
return days;
191192
}
@@ -430,6 +431,15 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
430431
}
431432

432433
const int64_t days = get_datetimestruct_days(dts);
434+
if (days == -1) {
435+
PyGILState_STATE gstate = PyGILState_Ensure();
436+
bool did_error = PyErr_Occurred() == NULL ? false : true;
437+
PyGILState_Release(gstate);
438+
if (did_error) {
439+
return -1;
440+
}
441+
}
442+
433443
if (base == NPY_FR_D) {
434444
return days;
435445
}

pandas/core/arrays/datetimelike.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
import numpy as np
2121

22+
from pandas._config import using_string_dtype
2223
from pandas._config.config import get_option
2324

2425
from pandas._libs import (
@@ -1759,6 +1760,10 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
17591760
dtype='object')
17601761
"""
17611762
result = self._format_native_types(date_format=date_format, na_rep=np.nan)
1763+
if using_string_dtype():
1764+
from pandas import StringDtype
1765+
1766+
return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
17621767
return result.astype(object, copy=False)
17631768

17641769

pandas/core/arrays/datetimes.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import numpy as np
1717

18+
from pandas._config import using_string_dtype
1819
from pandas._config.config import get_option
1920

2021
from pandas._libs import (
@@ -1332,6 +1333,13 @@ def month_name(self, locale=None) -> npt.NDArray[np.object_]:
13321333
values, "month_name", locale=locale, reso=self._creso
13331334
)
13341335
result = self._maybe_mask_results(result, fill_value=None)
1336+
if using_string_dtype():
1337+
from pandas import (
1338+
StringDtype,
1339+
array as pd_array,
1340+
)
1341+
1342+
return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
13351343
return result
13361344

13371345
def day_name(self, locale=None) -> npt.NDArray[np.object_]:
@@ -1393,6 +1401,14 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]:
13931401
values, "day_name", locale=locale, reso=self._creso
13941402
)
13951403
result = self._maybe_mask_results(result, fill_value=None)
1404+
if using_string_dtype():
1405+
# TODO: no tests that check for dtype of result as of 2024-08-15
1406+
from pandas import (
1407+
StringDtype,
1408+
array as pd_array,
1409+
)
1410+
1411+
return pd_array(result, dtype=StringDtype(na_value=np.nan)) # type: ignore[return-value]
13961412
return result
13971413

13981414
@property

pandas/core/arrays/string_arrow.py

Lines changed: 48 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
from functools import partial
43
import operator
54
import re
65
from typing import (
@@ -216,12 +215,17 @@ def dtype(self) -> StringDtype: # type: ignore[override]
216215
return self._dtype
217216

218217
def insert(self, loc: int, item) -> ArrowStringArray:
218+
if self.dtype.na_value is np.nan and item is np.nan:
219+
item = libmissing.NA
219220
if not isinstance(item, str) and item is not libmissing.NA:
220221
raise TypeError("Scalar must be NA or str")
221222
return super().insert(loc, item)
222223

223-
@classmethod
224-
def _result_converter(cls, values, na=None):
224+
def _result_converter(self, values, na=None):
225+
if self.dtype.na_value is np.nan:
226+
if not isna(na):
227+
values = values.fill_null(bool(na))
228+
return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
225229
return BooleanDtype().__from_arrow__(values)
226230

227231
def _maybe_convert_setitem_value(self, value):
@@ -492,11 +496,30 @@ def _str_get_dummies(self, sep: str = "|"):
492496
return dummies.astype(np.int64, copy=False), labels
493497

494498
def _convert_int_dtype(self, result):
499+
if self.dtype.na_value is np.nan:
500+
if isinstance(result, pa.Array):
501+
result = result.to_numpy(zero_copy_only=False)
502+
else:
503+
result = result.to_numpy()
504+
if result.dtype == np.int32:
505+
result = result.astype(np.int64)
506+
return result
507+
495508
return Int64Dtype().__from_arrow__(result)
496509

497510
def _reduce(
498511
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
499512
):
513+
if self.dtype.na_value is np.nan and name in ["any", "all"]:
514+
if not skipna:
515+
nas = pc.is_null(self._pa_array)
516+
arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
517+
else:
518+
arr = pc.not_equal(self._pa_array, "")
519+
return ArrowExtensionArray(arr)._reduce(
520+
name, skipna=skipna, keepdims=keepdims, **kwargs
521+
)
522+
500523
result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
501524
if name in ("argmin", "argmax") and isinstance(result, pa.Array):
502525
return self._convert_int_dtype(result)
@@ -527,67 +550,31 @@ def _rank(
527550
)
528551
)
529552

530-
531-
class ArrowStringArrayNumpySemantics(ArrowStringArray):
532-
_storage = "pyarrow"
533-
_na_value = np.nan
534-
535-
@classmethod
536-
def _result_converter(cls, values, na=None):
537-
if not isna(na):
538-
values = values.fill_null(bool(na))
539-
return ArrowExtensionArray(values).to_numpy(na_value=np.nan)
540-
541-
def __getattribute__(self, item):
542-
# ArrowStringArray and we both inherit from ArrowExtensionArray, which
543-
# creates inheritance problems (Diamond inheritance)
544-
if item in ArrowStringArrayMixin.__dict__ and item not in (
545-
"_pa_array",
546-
"__dict__",
547-
):
548-
return partial(getattr(ArrowStringArrayMixin, item), self)
549-
return super().__getattribute__(item)
550-
551-
def _convert_int_dtype(self, result):
552-
if isinstance(result, pa.Array):
553-
result = result.to_numpy(zero_copy_only=False)
554-
else:
555-
result = result.to_numpy()
556-
if result.dtype == np.int32:
557-
result = result.astype(np.int64)
553+
def value_counts(self, dropna: bool = True) -> Series:
554+
result = super().value_counts(dropna=dropna)
555+
if self.dtype.na_value is np.nan:
556+
res_values = result._values.to_numpy()
557+
return result._constructor(
558+
res_values, index=result.index, name=result.name, copy=False
559+
)
558560
return result
559561

560562
def _cmp_method(self, other, op):
561563
result = super()._cmp_method(other, op)
562-
if op == operator.ne:
563-
return result.to_numpy(np.bool_, na_value=True)
564-
else:
565-
return result.to_numpy(np.bool_, na_value=False)
566-
567-
def value_counts(self, dropna: bool = True) -> Series:
568-
from pandas import Series
569-
570-
result = super().value_counts(dropna)
571-
return Series(
572-
result._values.to_numpy(), index=result.index, name=result.name, copy=False
573-
)
574-
575-
def _reduce(
576-
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
577-
):
578-
if name in ["any", "all"]:
579-
if not skipna:
580-
nas = pc.is_null(self._pa_array)
581-
arr = pc.or_kleene(nas, pc.not_equal(self._pa_array, ""))
564+
if self.dtype.na_value is np.nan:
565+
if op == operator.ne:
566+
return result.to_numpy(np.bool_, na_value=True)
582567
else:
583-
arr = pc.not_equal(self._pa_array, "")
584-
return ArrowExtensionArray(arr)._reduce(
585-
name, skipna=skipna, keepdims=keepdims, **kwargs
586-
)
587-
else:
588-
return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
568+
return result.to_numpy(np.bool_, na_value=False)
569+
return result
589570

590-
def insert(self, loc: int, item) -> ArrowStringArrayNumpySemantics:
591-
if item is np.nan:
592-
item = libmissing.NA
593-
return super().insert(loc, item) # type: ignore[return-value]
571+
572+
class ArrowStringArrayNumpySemantics(ArrowStringArray):
573+
_na_value = np.nan
574+
_str_get = ArrowStringArrayMixin._str_get
575+
_str_removesuffix = ArrowStringArrayMixin._str_removesuffix
576+
_str_capitalize = ArrowStringArrayMixin._str_capitalize
577+
_str_pad = ArrowStringArrayMixin._str_pad
578+
_str_title = ArrowStringArrayMixin._str_title
579+
_str_swapcase = ArrowStringArrayMixin._str_swapcase
580+
_str_slice_replace = ArrowStringArrayMixin._str_slice_replace

pandas/core/indexes/datetimes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ def _engine_type(self) -> type[libindex.DatetimeEngine]:
263263
@doc(DatetimeArray.strftime)
264264
def strftime(self, date_format) -> Index:
265265
arr = self._data.strftime(date_format)
266-
return Index(arr, name=self.name, dtype=object)
266+
return Index(arr, name=self.name, dtype=arr.dtype)
267267

268268
@doc(DatetimeArray.tz_convert)
269269
def tz_convert(self, tz) -> Self:

pandas/core/indexes/extension.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def fget(self):
7474
return type(self)._simple_new(result, name=self.name)
7575
elif isinstance(result, ABCDataFrame):
7676
return result.set_index(self)
77-
return Index(result, name=self.name)
77+
return Index(result, name=self.name, dtype=result.dtype)
7878
return result
7979

8080
def fset(self, value) -> None:
@@ -101,7 +101,7 @@ def method(self, *args, **kwargs): # type: ignore[misc]
101101
return type(self)._simple_new(result, name=self.name)
102102
elif isinstance(result, ABCDataFrame):
103103
return result.set_index(self)
104-
return Index(result, name=self.name)
104+
return Index(result, name=self.name, dtype=result.dtype)
105105
return result
106106

107107
# error: "property" has no attribute "__name__"

0 commit comments

Comments
 (0)