Skip to content

Commit 1b8da24

Browse files
authored
Merge branch 'main' into Test_issue_57930
2 parents 4686153 + 1bb264c commit 1b8da24

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

71 files changed

+1507
-225
lines changed

.github/workflows/unit-tests.yml

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ defaults:
2222

2323
jobs:
2424
ubuntu:
25-
runs-on: ubuntu-22.04
25+
runs-on: ${{ matrix.platform }}
2626
timeout-minutes: 90
2727
strategy:
2828
matrix:
29+
platform: [ubuntu-22.04, ubuntu-24.04-arm]
2930
env_file: [actions-310.yaml, actions-311.yaml, actions-312.yaml]
3031
# Prevent the include jobs from overriding other jobs
3132
pattern: [""]
@@ -35,9 +36,11 @@ jobs:
3536
env_file: actions-311-downstream_compat.yaml
3637
pattern: "not slow and not network and not single_cpu"
3738
pytest_target: "pandas/tests/test_downstream.py"
39+
platform: ubuntu-22.04
3840
- name: "Minimum Versions"
3941
env_file: actions-310-minimum_versions.yaml
4042
pattern: "not slow and not network and not single_cpu"
43+
platform: ubuntu-22.04
4144
- name: "Locale: it_IT"
4245
env_file: actions-311.yaml
4346
pattern: "not slow and not network and not single_cpu"
@@ -48,6 +51,7 @@ jobs:
4851
# Also install it_IT (its encoding is ISO8859-1) but do not activate it.
4952
# It will be temporarily activated during tests with locale.setlocale
5053
extra_loc: "it_IT"
54+
platform: ubuntu-22.04
5155
- name: "Locale: zh_CN"
5256
env_file: actions-311.yaml
5357
pattern: "not slow and not network and not single_cpu"
@@ -58,25 +62,32 @@ jobs:
5862
# Also install zh_CN (its encoding is gb2312) but do not activate it.
5963
# It will be temporarily activated during tests with locale.setlocale
6064
extra_loc: "zh_CN"
65+
platform: ubuntu-22.04
6166
- name: "Future infer strings"
6267
env_file: actions-312.yaml
6368
pandas_future_infer_string: "1"
69+
platform: ubuntu-22.04
6470
- name: "Future infer strings (without pyarrow)"
6571
env_file: actions-311.yaml
6672
pandas_future_infer_string: "1"
73+
platform: ubuntu-22.04
6774
- name: "Pypy"
6875
env_file: actions-pypy-39.yaml
6976
pattern: "not slow and not network and not single_cpu"
7077
test_args: "--max-worker-restart 0"
78+
platform: ubuntu-22.04
7179
- name: "Numpy Dev"
7280
env_file: actions-311-numpydev.yaml
7381
pattern: "not slow and not network and not single_cpu"
7482
test_args: "-W error::DeprecationWarning -W error::FutureWarning"
83+
platform: ubuntu-22.04
7584
- name: "Pyarrow Nightly"
7685
env_file: actions-311-pyarrownightly.yaml
7786
pattern: "not slow and not network and not single_cpu"
87+
pandas_future_infer_string: "1"
88+
platform: ubuntu-22.04
7889
fail-fast: false
79-
name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }}
90+
name: ${{ matrix.name || format('{0} {1}', matrix.platform, matrix.env_file) }}
8091
env:
8192
PATTERN: ${{ matrix.pattern }}
8293
LANG: ${{ matrix.lang || 'C.UTF-8' }}
@@ -91,7 +102,7 @@ jobs:
91102
REMOVE_PYARROW: ${{ matrix.name == 'Future infer strings (without pyarrow)' && '1' || '0' }}
92103
concurrency:
93104
# https://github.community/t/concurrecy-not-work-for-push/183068/7
94-
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}
105+
group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_future_infer_string }}-${{ matrix.platform }}
95106
cancel-in-progress: true
96107

97108
services:

.github/workflows/wheels.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ jobs:
9494
buildplat:
9595
- [ubuntu-22.04, manylinux_x86_64]
9696
- [ubuntu-22.04, musllinux_x86_64]
97+
- [ubuntu-24.04-arm, manylinux_aarch64]
9798
- [macos-13, macosx_x86_64]
9899
# Note: M1 images on Github Actions start from macOS 14
99100
- [macos-14, macosx_arm64]

ci/code_checks.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7272
-i "pandas.Series.dt PR01" `# Accessors are implemented as classes, but we do not document the Parameters section` \
7373
-i "pandas.Period.freq GL08" \
7474
-i "pandas.Period.ordinal GL08" \
75-
-i "pandas.RangeIndex.from_range PR01,SA01" \
7675
-i "pandas.Timedelta.max PR02" \
7776
-i "pandas.Timedelta.min PR02" \
7877
-i "pandas.Timedelta.resolution PR02" \
7978
-i "pandas.Timestamp.max PR02" \
8079
-i "pandas.Timestamp.min PR02" \
8180
-i "pandas.Timestamp.resolution PR02" \
8281
-i "pandas.Timestamp.tzinfo GL08" \
83-
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
8482
-i "pandas.arrays.TimedeltaArray PR07,SA01" \
8583
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
8684
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \

ci/deps/actions-311-pyarrownightly.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ dependencies:
2323

2424
- pip:
2525
- "tzdata>=2022.7"
26-
- "--extra-index-url https://pypi.fury.io/arrow-nightlies/"
26+
- "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple"
2727
- "--prefer-binary"
2828
- "--pre"
2929
- "pyarrow"

doc/source/reference/groupby.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ Function application
104104
DataFrameGroupBy.shift
105105
DataFrameGroupBy.size
106106
DataFrameGroupBy.skew
107+
DataFrameGroupBy.kurt
107108
DataFrameGroupBy.std
108109
DataFrameGroupBy.sum
109110
DataFrameGroupBy.var
@@ -159,6 +160,7 @@ Function application
159160
SeriesGroupBy.shift
160161
SeriesGroupBy.size
161162
SeriesGroupBy.skew
163+
SeriesGroupBy.kurt
162164
SeriesGroupBy.std
163165
SeriesGroupBy.sum
164166
SeriesGroupBy.var

doc/source/reference/window.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,14 @@ Rolling window functions
3030
Rolling.std
3131
Rolling.min
3232
Rolling.max
33+
Rolling.first
34+
Rolling.last
3335
Rolling.corr
3436
Rolling.cov
3537
Rolling.skew
3638
Rolling.kurt
3739
Rolling.apply
40+
Rolling.pipe
3841
Rolling.aggregate
3942
Rolling.quantile
4043
Rolling.sem
@@ -71,11 +74,14 @@ Expanding window functions
7174
Expanding.std
7275
Expanding.min
7376
Expanding.max
77+
Expanding.first
78+
Expanding.last
7479
Expanding.corr
7580
Expanding.cov
7681
Expanding.skew
7782
Expanding.kurt
7883
Expanding.apply
84+
Expanding.pipe
7985
Expanding.aggregate
8086
Expanding.quantile
8187
Expanding.sem

doc/source/whatsnew/v2.3.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ Other enhancements
3535
- The semantics for the ``copy`` keyword in ``__array__`` methods (i.e. called
3636
when using ``np.array()`` or ``np.asarray()`` on pandas objects) has been
3737
updated to work correctly with NumPy >= 2 (:issue:`57739`)
38+
- The :meth:`~Series.cumsum`, :meth:`~Series.cummin`, and :meth:`~Series.cummax` reductions are now implemented for ``StringDtype`` columns when backed by PyArrow (:issue:`60633`)
3839
- The :meth:`~Series.sum` reduction is now implemented for ``StringDtype`` columns (:issue:`59853`)
39-
-
4040

4141
.. ---------------------------------------------------------------------------
4242
.. _whatsnew_230.notable_bug_fixes:
@@ -105,6 +105,7 @@ Conversion
105105

106106
Strings
107107
^^^^^^^
108+
- Bug in :meth:`Series.__pos__` and :meth:`DataFrame.__pos__` where they did not raise for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`60710`)
108109
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
109110
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
110111
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
@@ -175,7 +176,6 @@ Other
175176
^^^^^
176177
- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2``
177178
are not installed (:issue:`60196`)
178-
-
179179

180180
.. ---------------------------------------------------------------------------
181181
.. _whatsnew_230.contributors:

doc/source/whatsnew/v3.0.0.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ Other enhancements
3030
^^^^^^^^^^^^^^^^^^
3131
- :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`)
3232
- :class:`pandas.api.typing.SASReader` is available for typing the output of :func:`read_sas` (:issue:`55689`)
33+
- :meth:`pandas.api.interchange.from_dataframe` now uses the `PyCapsule Interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_ if available, only falling back to the Dataframe Interchange Protocol if that fails (:issue:`60739`)
34+
- :class:`pandas.api.typing.NoDefault` is available for typing ``no_default``
3335
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
3436
- :func:`pandas.merge` now validates the ``how`` parameter input (merge type) (:issue:`59435`)
3537
- :func:`read_spss` now supports kwargs to be passed to pyreadstat (:issue:`56356`)
@@ -44,17 +46,21 @@ Other enhancements
4446
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
4547
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
4648
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
49+
- :class:`Rolling` and :class:`Expanding` now support ``pipe`` method (:issue:`57076`)
4750
- :class:`Series` now supports the Arrow PyCapsule Interface for export (:issue:`59518`)
4851
- :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header cells (:issue:`35384`)
4952
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
5053
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
5154
- :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`)
5255
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
5356
- :meth:`DataFrame.pivot_table` and :func:`pivot_table` now allow the passing of keyword arguments to ``aggfunc`` through ``**kwargs`` (:issue:`57884`)
57+
- :meth:`DataFrame.to_json` now encodes ``Decimal`` as strings instead of floats (:issue:`60698`)
5458
- :meth:`Series.cummin` and :meth:`Series.cummax` now support :class:`CategoricalDtype` (:issue:`52335`)
5559
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5660
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
61+
- :class:`Rolling` and :class:`Expanding` now support aggregations ``first`` and ``last`` (:issue:`33155`)
5762
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
63+
- :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.sum`, :meth:`.SeriesGroupBy.mean` and :meth:`.SeriesGroupBy.sum` now accept ``skipna`` parameter (:issue:`15675`)
5864
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
5965
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
6066
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
@@ -686,6 +692,7 @@ MultiIndex
686692
- :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
687693
- :meth:`MultiIndex.insert` would not insert NA value correctly at unified location of index -1 (:issue:`59003`)
688694
- :func:`MultiIndex.get_level_values` accessing a :class:`DatetimeIndex` does not carry the frequency attribute along (:issue:`58327`, :issue:`57949`)
695+
- Bug in :class:`DataFrame` arithmetic operations in case of unaligned MultiIndex columns (:issue:`60498`)
689696
-
690697

691698
I/O
@@ -807,6 +814,7 @@ Other
807814
- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
808815
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
809816
- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
817+
- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)
810818

811819
.. ***DO NOT USE THIS SECTION***
812820

pandas/_libs/groupby.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def group_sum(
6666
result_mask: np.ndarray | None = ...,
6767
min_count: int = ...,
6868
is_datetimelike: bool = ...,
69+
skipna: bool = ...,
6970
) -> None: ...
7071
def group_prod(
7172
out: np.ndarray, # int64float_t[:, ::1]
@@ -115,6 +116,7 @@ def group_mean(
115116
is_datetimelike: bool = ..., # bint
116117
mask: np.ndarray | None = ...,
117118
result_mask: np.ndarray | None = ...,
119+
skipna: bool = ...,
118120
) -> None: ...
119121
def group_ohlc(
120122
out: np.ndarray, # floatingintuint_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -700,18 +700,19 @@ def group_sum(
700700
uint8_t[:, ::1] result_mask=None,
701701
Py_ssize_t min_count=0,
702702
bint is_datetimelike=False,
703+
bint skipna=True,
703704
) -> None:
704705
"""
705706
Only aggregates on axis=0 using Kahan summation
706707
"""
707708
cdef:
708709
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
709-
sum_t val, t, y
710+
sum_t val, t, y, nan_val
710711
sum_t[:, ::1] sumx, compensation
711712
int64_t[:, ::1] nobs
712713
Py_ssize_t len_values = len(values), len_labels = len(labels)
713714
bint uses_mask = mask is not None
714-
bint isna_entry
715+
bint isna_entry, isna_result
715716

716717
if len_values != len_labels:
717718
raise ValueError("len(index) != len(labels)")
@@ -722,6 +723,15 @@ def group_sum(
722723
compensation = np.zeros((<object>out).shape, dtype=(<object>out).base.dtype)
723724

724725
N, K = (<object>values).shape
726+
if uses_mask:
727+
nan_val = 0
728+
elif is_datetimelike:
729+
nan_val = NPY_NAT
730+
elif sum_t is int64_t or sum_t is uint64_t:
731+
# This has no effect as int64 can't be nan. Setting to 0 to avoid type error
732+
nan_val = 0
733+
else:
734+
nan_val = NAN
725735

726736
with nogil(sum_t is not object):
727737
for i in range(N):
@@ -736,8 +746,16 @@ def group_sum(
736746

737747
if uses_mask:
738748
isna_entry = mask[i, j]
749+
isna_result = result_mask[lab, j]
739750
else:
740751
isna_entry = _treat_as_na(val, is_datetimelike)
752+
isna_result = _treat_as_na(sumx[lab, j], is_datetimelike)
753+
754+
if not skipna and isna_result:
755+
# If sum is already NA, don't add to it. This is important for
756+
# datetimelike because adding a value to NPY_NAT may not result
757+
# in a NPY_NAT
758+
continue
741759

742760
if not isna_entry:
743761
nobs[lab, j] += 1
@@ -765,6 +783,11 @@ def group_sum(
765783
# because of no gil
766784
compensation[lab, j] = 0
767785
sumx[lab, j] = t
786+
elif not skipna:
787+
if uses_mask:
788+
result_mask[lab, j] = True
789+
else:
790+
sumx[lab, j] = nan_val
768791

769792
_check_below_mincount(
770793
out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
@@ -1100,6 +1123,7 @@ def group_mean(
11001123
bint is_datetimelike=False,
11011124
const uint8_t[:, ::1] mask=None,
11021125
uint8_t[:, ::1] result_mask=None,
1126+
bint skipna=True,
11031127
) -> None:
11041128
"""
11051129
Compute the mean per label given a label assignment for each value.
@@ -1125,6 +1149,8 @@ def group_mean(
11251149
Mask of the input values.
11261150
result_mask : ndarray[bool, ndim=2], optional
11271151
Mask of the out array
1152+
skipna : bool, optional
1153+
If True, ignore nans in `values`.
11281154

11291155
Notes
11301156
-----
@@ -1168,6 +1194,16 @@ def group_mean(
11681194
for j in range(K):
11691195
val = values[i, j]
11701196

1197+
if not skipna and (
1198+
(uses_mask and result_mask[lab, j]) or
1199+
(is_datetimelike and sumx[lab, j] == NPY_NAT) or
1200+
_treat_as_na(sumx[lab, j], False)
1201+
):
1202+
# If sum is already NA, don't add to it. This is important for
1203+
# datetimelike because adding a value to NPY_NAT may not result
1204+
# in NPY_NAT
1205+
continue
1206+
11711207
if uses_mask:
11721208
isna_entry = mask[i, j]
11731209
elif is_datetimelike:
@@ -1191,6 +1227,14 @@ def group_mean(
11911227
# because of no gil
11921228
compensation[lab, j] = 0.
11931229
sumx[lab, j] = t
1230+
elif not skipna:
1231+
# Set the nobs to 0 because, in case of datetimelike,
1232+
# dividing NPY_NAT by nobs may not result in NPY_NAT
1233+
nobs[lab, j] = 0
1234+
if uses_mask:
1235+
result_mask[lab, j] = True
1236+
else:
1237+
sumx[lab, j] = nan_val
11941238

11951239
for i in range(ncounts):
11961240
for j in range(K):

0 commit comments

Comments
 (0)