Skip to content

Commit a3096d2

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-disallow-str-pyarrow-alias
2 parents c72a9bf + 57d2489 commit a3096d2

39 files changed

+637
-218
lines changed

.github/workflows/wheels.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ jobs:
9494
buildplat:
9595
- [ubuntu-22.04, manylinux_x86_64]
9696
- [ubuntu-22.04, musllinux_x86_64]
97-
- [macos-12, macosx_x86_64]
97+
- [macos-13, macosx_x86_64]
9898
# Note: M1 images on Github Actions start from macOS 14
9999
- [macos-14, macosx_arm64]
100100
- [windows-2022, win_amd64]

.pre-commit-config.yaml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ ci:
1919
skip: [pyright, mypy]
2020
repos:
2121
- repo: https://github.com/astral-sh/ruff-pre-commit
22-
rev: v0.8.1
22+
rev: v0.8.6
2323
hooks:
2424
- id: ruff
2525
args: [--exit-non-zero-on-fix]
@@ -34,7 +34,7 @@ repos:
3434
- id: ruff-format
3535
exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
3636
- repo: https://github.com/jendrikseipp/vulture
37-
rev: 'v2.13'
37+
rev: 'v2.14'
3838
hooks:
3939
- id: vulture
4040
entry: python scripts/run_vulture.py
@@ -74,7 +74,7 @@ repos:
7474
hooks:
7575
- id: isort
7676
- repo: https://github.com/asottile/pyupgrade
77-
rev: v3.19.0
77+
rev: v3.19.1
7878
hooks:
7979
- id: pyupgrade
8080
args: [--py310-plus]
@@ -95,12 +95,17 @@ repos:
9595
- id: sphinx-lint
9696
args: ["--enable", "all", "--disable", "line-too-long"]
9797
- repo: https://github.com/pre-commit/mirrors-clang-format
98-
rev: v19.1.4
98+
rev: v19.1.6
9999
hooks:
100100
- id: clang-format
101101
files: ^pandas/_libs/src|^pandas/_libs/include
102102
args: [-i]
103103
types_or: [c, c++]
104+
- repo: https://github.com/trim21/pre-commit-mirror-meson
105+
rev: v1.6.1
106+
hooks:
107+
- id: meson-fmt
108+
args: ['--inplace']
104109
- repo: local
105110
hooks:
106111
- id: pyright

asv_bench/benchmarks/io/csv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ def setup(self):
594594
self.StringIO_input = StringIO(data)
595595

596596
def time_read_csv_index_col(self):
597-
read_csv(self.StringIO_input, index_col="a")
597+
read_csv(self.data(self.StringIO_input), index_col="a")
598598

599599

600600
class ReadCSVDatePyarrowEngine(StringIORewind):
@@ -605,7 +605,7 @@ def setup(self):
605605

606606
def time_read_csv_index_col(self):
607607
read_csv(
608-
self.StringIO_input,
608+
self.data(self.StringIO_input),
609609
parse_dates=["a"],
610610
engine="pyarrow",
611611
dtype_backend="pyarrow",

doc/source/user_guide/io.rst

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2340,6 +2340,7 @@ Read a URL with no options:
23402340
.. code-block:: ipython
23412341
23422342
In [320]: url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list"
2343+
23432344
In [321]: pd.read_html(url)
23442345
Out[321]:
23452346
[ Bank NameBank CityCity StateSt ... Acquiring InstitutionAI Closing DateClosing FundFund
@@ -2366,6 +2367,7 @@ Read a URL while passing headers alongside the HTTP request:
23662367
.. code-block:: ipython
23672368
23682369
In [322]: url = 'https://www.sump.org/notes/request/' # HTTP request reflector
2370+
23692371
In [323]: pd.read_html(url)
23702372
Out[323]:
23712373
[ 0 1
@@ -2378,14 +2380,16 @@ Read a URL while passing headers alongside the HTTP request:
23782380
1 Host: www.sump.org
23792381
2 User-Agent: Python-urllib/3.8
23802382
3 Connection: close]
2383+
23812384
In [324]: headers = {
2382-
In [325]: 'User-Agent':'Mozilla Firefox v14.0',
2383-
In [326]: 'Accept':'application/json',
2384-
In [327]: 'Connection':'keep-alive',
2385-
In [328]: 'Auth':'Bearer 2*/f3+fe68df*4'
2386-
In [329]: }
2387-
In [340]: pd.read_html(url, storage_options=headers)
2388-
Out[340]:
2385+
.....: 'User-Agent':'Mozilla Firefox v14.0',
2386+
.....: 'Accept':'application/json',
2387+
.....: 'Connection':'keep-alive',
2388+
.....: 'Auth':'Bearer 2*/f3+fe68df*4'
2389+
.....: }
2390+
2391+
In [325]: pd.read_html(url, storage_options=headers)
2392+
Out[325]:
23892393
[ 0 1
23902394
0 Remote Socket: 51.15.105.256:51760
23912395
1 Protocol Version: HTTP/1.1

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ Other enhancements
5555
- :meth:`Series.plot` now correctly handles the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5757
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
58+
- :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`)
5859
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
5960
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
6061
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
@@ -798,6 +799,7 @@ Other
798799
- Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
799800
- Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
800801
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)
802+
- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` throwing ``ValueError`` when ``regex=True`` and all values are NA. (:issue:`60688`)
801803
- Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`)
802804
- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
803805
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)

meson.build

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,13 @@
11
# This file is adapted from https://github.com/scipy/scipy/blob/main/meson.build
22
project(
33
'pandas',
4-
'c', 'cpp', 'cython',
4+
'c',
5+
'cpp',
6+
'cython',
57
version: run_command(['generate_version.py', '--print'], check: true).stdout().strip(),
68
license: 'BSD-3',
79
meson_version: '>=1.2.1',
8-
default_options: [
9-
'buildtype=release',
10-
'c_std=c11',
11-
'warning_level=2',
12-
]
10+
default_options: ['buildtype=release', 'c_std=c11', 'warning_level=2'],
1311
)
1412

1513
fs = import('fs')
@@ -18,41 +16,40 @@ tempita = files('generate_pxi.py')
1816
versioneer = files('generate_version.py')
1917

2018

21-
add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'c')
22-
add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language : 'cpp')
19+
add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'c')
20+
add_project_arguments('-DNPY_NO_DEPRECATED_API=0', language: 'cpp')
2321

2422
# Allow supporting older numpys than the version compiled against
2523
# Set the define to the min supported version of numpy for pandas
2624
# e.g. right now this is targeting numpy 1.21+
27-
add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'c')
28-
add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language : 'cpp')
25+
add_project_arguments('-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION', language: 'c')
26+
add_project_arguments(
27+
'-DNPY_TARGET_VERSION=NPY_1_21_API_VERSION',
28+
language: 'cpp',
29+
)
2930

3031

3132
if fs.exists('_version_meson.py')
3233
py.install_sources('_version_meson.py', subdir: 'pandas')
3334
else
34-
custom_target('write_version_file',
35+
custom_target(
36+
'write_version_file',
3537
output: '_version_meson.py',
36-
command: [
37-
py, versioneer, '-o', '@OUTPUT@'
38-
],
38+
command: [py, versioneer, '-o', '@OUTPUT@'],
3939
build_by_default: true,
4040
build_always_stale: true,
4141
install: true,
42-
install_dir: py.get_install_dir() / 'pandas'
42+
install_dir: py.get_install_dir() / 'pandas',
4343
)
4444
meson.add_dist_script(py, versioneer, '-o', '_version_meson.py')
4545
endif
4646

4747
cy = meson.get_compiler('cython')
4848
if cy.version().version_compare('>=3.1.0')
49-
add_project_arguments('-Xfreethreading_compatible=true', language : 'cython')
49+
add_project_arguments('-Xfreethreading_compatible=true', language: 'cython')
5050
endif
5151

5252
# Needed by pandas.test() when it looks for the pytest ini options
53-
py.install_sources(
54-
'pyproject.toml',
55-
subdir: 'pandas'
56-
)
53+
py.install_sources('pyproject.toml', subdir: 'pandas')
5754

5855
subdir('pandas')

pandas/_libs/byteswap.pyx

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ from libc.string cimport memcpy
1515

1616
def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
1717
cdef uint32_t value
18-
assert offset + sizeof(value) < len(data)
18+
assert offset + <Py_ssize_t>sizeof(value) < len(data)
1919
cdef const void *ptr = <unsigned char*>(data) + offset
2020
memcpy(&value, ptr, sizeof(value))
2121
if byteswap:
@@ -28,7 +28,7 @@ def read_float_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
2828

2929
def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
3030
cdef uint64_t value
31-
assert offset + sizeof(value) < len(data)
31+
assert offset + <Py_ssize_t>sizeof(value) < len(data)
3232
cdef const void *ptr = <unsigned char*>(data) + offset
3333
memcpy(&value, ptr, sizeof(value))
3434
if byteswap:
@@ -41,7 +41,7 @@ def read_double_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
4141

4242
def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
4343
cdef uint16_t res
44-
assert offset + sizeof(res) < len(data)
44+
assert offset + <Py_ssize_t>sizeof(res) < len(data)
4545
memcpy(&res, <const unsigned char*>(data) + offset, sizeof(res))
4646
if byteswap:
4747
res = _byteswap2(res)
@@ -50,7 +50,7 @@ def read_uint16_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
5050

5151
def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
5252
cdef uint32_t res
53-
assert offset + sizeof(res) < len(data)
53+
assert offset + <Py_ssize_t>sizeof(res) < len(data)
5454
memcpy(&res, <const unsigned char*>(data) + offset, sizeof(res))
5555
if byteswap:
5656
res = _byteswap4(res)
@@ -59,7 +59,7 @@ def read_uint32_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
5959

6060
def read_uint64_with_byteswap(bytes data, Py_ssize_t offset, bint byteswap):
6161
cdef uint64_t res
62-
assert offset + sizeof(res) < len(data)
62+
assert offset + <Py_ssize_t>sizeof(res) < len(data)
6363
memcpy(&res, <const unsigned char*>(data) + offset, sizeof(res))
6464
if byteswap:
6565
res = _byteswap8(res)

pandas/_libs/groupby.pyi

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,15 @@ def group_skew(
9797
result_mask: np.ndarray | None = ...,
9898
skipna: bool = ...,
9999
) -> None: ...
100+
def group_kurt(
101+
out: np.ndarray, # float64_t[:, ::1]
102+
counts: np.ndarray, # int64_t[::1]
103+
values: np.ndarray, # ndarray[float64_T, ndim=2]
104+
labels: np.ndarray, # const intp_t[::1]
105+
mask: np.ndarray | None = ...,
106+
result_mask: np.ndarray | None = ...,
107+
skipna: bool = ...,
108+
) -> None: ...
100109
def group_mean(
101110
out: np.ndarray, # floating[:, ::1]
102111
counts: np.ndarray, # int64_t[::1]

pandas/_libs/groupby.pyx

Lines changed: 96 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -910,7 +910,7 @@ def group_var(
910910
@cython.wraparound(False)
911911
@cython.boundscheck(False)
912912
@cython.cdivision(True)
913-
@cython.cpow
913+
@cython.cpow(True)
914914
def group_skew(
915915
float64_t[:, ::1] out,
916916
int64_t[::1] counts,
@@ -961,7 +961,7 @@ def group_skew(
961961
isna_entry = _treat_as_na(val, False)
962962

963963
if not isna_entry:
964-
# Based on RunningStats::Push from
964+
# Running stats update based on RunningStats::Push from
965965
# https://www.johndcook.com/blog/skewness_kurtosis/
966966
n1 = nobs[lab, j]
967967
n = n1 + 1
@@ -995,6 +995,100 @@ def group_skew(
995995
)
996996

997997

998+
@cython.wraparound(False)
999+
@cython.boundscheck(False)
1000+
@cython.cdivision(True)
1001+
@cython.cpow(True)
1002+
def group_kurt(
1003+
float64_t[:, ::1] out,
1004+
int64_t[::1] counts,
1005+
ndarray[float64_t, ndim=2] values,
1006+
const intp_t[::1] labels,
1007+
const uint8_t[:, ::1] mask=None,
1008+
uint8_t[:, ::1] result_mask=None,
1009+
bint skipna=True,
1010+
) -> None:
1011+
cdef:
1012+
Py_ssize_t i, j, N, K, lab, ngroups = len(counts)
1013+
int64_t[:, ::1] nobs
1014+
Py_ssize_t len_values = len(values), len_labels = len(labels)
1015+
bint isna_entry, uses_mask = mask is not None
1016+
float64_t[:, ::1] M1, M2, M3, M4
1017+
float64_t delta, delta_n, delta_n2, term1, val
1018+
int64_t n1, n
1019+
float64_t ct, num, den, adj
1020+
1021+
if len_values != len_labels:
1022+
raise ValueError("len(index) != len(labels)")
1023+
1024+
nobs = np.zeros((<object>out).shape, dtype=np.int64)
1025+
1026+
# M1, M2, M3 and M4 correspond to 1st, 2nd, 3rd and 4th Moments
1027+
M1 = np.zeros((<object>out).shape, dtype=np.float64)
1028+
M2 = np.zeros((<object>out).shape, dtype=np.float64)
1029+
M3 = np.zeros((<object>out).shape, dtype=np.float64)
1030+
M4 = np.zeros((<object>out).shape, dtype=np.float64)
1031+
1032+
N, K = (<object>values).shape
1033+
1034+
out[:, :] = 0.0
1035+
1036+
with nogil:
1037+
for i in range(N):
1038+
lab = labels[i]
1039+
if lab < 0:
1040+
continue
1041+
1042+
counts[lab] += 1
1043+
1044+
for j in range(K):
1045+
val = values[i, j]
1046+
1047+
if uses_mask:
1048+
isna_entry = mask[i, j]
1049+
else:
1050+
isna_entry = _treat_as_na(val, False)
1051+
1052+
if not isna_entry:
1053+
# Running stats update based on RunningStats::Push from
1054+
# https://www.johndcook.com/blog/skewness_kurtosis/
1055+
n1 = nobs[lab, j]
1056+
n = n1 + 1
1057+
1058+
nobs[lab, j] = n
1059+
delta = val - M1[lab, j]
1060+
delta_n = delta / n
1061+
delta_n2 = delta_n * delta_n
1062+
term1 = delta * delta_n * n1
1063+
1064+
M1[lab, j] += delta_n
1065+
M4[lab, j] += (term1 * delta_n2 * (n*n - 3*n + 3)
1066+
+ 6 * delta_n2 * M2[lab, j]
1067+
- 4 * delta_n * M3[lab, j])
1068+
M3[lab, j] += term1 * delta_n * (n - 2) - 3 * delta_n * M2[lab, j]
1069+
M2[lab, j] += term1
1070+
elif not skipna:
1071+
M1[lab, j] = NaN
1072+
M2[lab, j] = NaN
1073+
M3[lab, j] = NaN
1074+
M4[lab, j] = NaN
1075+
1076+
for i in range(ngroups):
1077+
for j in range(K):
1078+
ct = <float64_t>nobs[i, j]
1079+
if ct < 4:
1080+
if result_mask is not None:
1081+
result_mask[i, j] = 1
1082+
out[i, j] = NaN
1083+
elif M2[i, j] == 0:
1084+
out[i, j] = 0
1085+
else:
1086+
num = ct * (ct + 1) * (ct - 1) * M4[i, j]
1087+
den = (ct - 2) * (ct - 3) * M2[i, j] ** 2
1088+
adj = 3.0 * (ct - 1) ** 2 / ((ct - 2) * (ct - 3))
1089+
out[i, j] = num / den - adj
1090+
1091+
9981092
@cython.wraparound(False)
9991093
@cython.boundscheck(False)
10001094
def group_mean(

pandas/_libs/include/pandas/vendored/klib/khash_python.h

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,9 @@ static void *traced_calloc(size_t num, size_t size) {
3434
}
3535

3636
static void *traced_realloc(void *old_ptr, size_t size) {
37+
PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
3738
void *ptr = realloc(old_ptr, size);
3839
if (ptr != NULL) {
39-
if (old_ptr != ptr) {
40-
PyTraceMalloc_Untrack(KHASH_TRACE_DOMAIN, (uintptr_t)old_ptr);
41-
}
4240
PyTraceMalloc_Track(KHASH_TRACE_DOMAIN, (uintptr_t)ptr, size);
4341
}
4442
return ptr;

0 commit comments

Comments
 (0)