Skip to content

Commit d0b6abc

Browse files
committed
Merge branch 'bugfix-spss-kwargs' of https://github.com/astronights/pandas into bugfix-spss-kwargs
2 parents 8ef79d3 + 01bb03f commit d0b6abc

File tree

113 files changed

+843
-451
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

113 files changed

+843
-451
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ repos:
132132
types: [python]
133133
stages: [manual]
134134
additional_dependencies: &pyright_dependencies
135-
135+
136136
- id: pyright
137137
# note: assumes python env is setup and activated
138138
name: pyright reportGeneralTypeIssues
@@ -190,9 +190,6 @@ repos:
190190
# Check for deprecated messages without sphinx directive
191191
|(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)
192192
193-
# {foo!r} instead of {repr(foo)}
194-
|!r}
195-
196193
# builtin filter function
197194
|(?<!def)[\(\s]filter\(
198195
types_or: [python, cython, rst]

ci/code_checks.sh

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -73,42 +73,17 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7373
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
7474
pandas.Series.plot.line \
7575
pandas.Series.to_sql \
76-
pandas.errors.DatabaseError \
77-
pandas.errors.IndexingError \
78-
pandas.errors.InvalidColumnName \
7976
pandas.errors.SettingWithCopyWarning \
8077
pandas.errors.SpecificationError \
8178
pandas.errors.UndefinedVariableError \
82-
pandas.Timestamp.ceil \
83-
pandas.Timestamp.floor \
84-
pandas.Timestamp.round \
8579
pandas.read_json \
86-
pandas.io.json.build_table_schema \
8780
pandas.io.formats.style.Styler.to_latex \
8881
pandas.read_parquet \
8982
pandas.DataFrame.to_sql \
90-
pandas.read_stata \
91-
pandas.plotting.scatter_matrix \
92-
pandas.Index.droplevel \
93-
pandas.MultiIndex.names \
94-
pandas.MultiIndex.droplevel \
95-
pandas.Grouper \
9683
pandas.io.formats.style.Styler.map \
9784
pandas.io.formats.style.Styler.apply_index \
9885
pandas.io.formats.style.Styler.map_index \
9986
pandas.io.formats.style.Styler.format \
100-
pandas.io.formats.style.Styler.set_tooltips \
101-
pandas.io.formats.style.Styler.set_uuid \
102-
pandas.io.formats.style.Styler.pipe \
103-
pandas.io.formats.style.Styler.highlight_between \
104-
pandas.io.formats.style.Styler.highlight_quantile \
105-
pandas.io.formats.style.Styler.background_gradient \
106-
pandas.io.formats.style.Styler.text_gradient \
107-
pandas.DataFrame.values \
108-
pandas.DataFrame.groupby \
109-
pandas.DataFrame.sort_values \
110-
pandas.DataFrame.plot.hexbin \
111-
pandas.DataFrame.plot.line \
11287
RET=$(($RET + $?)) ; echo $MSG "DONE"
11388

11489
fi

doc/scripts/eval_performance.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ def bench_with(n, times=10, repeat=3, engine="numexpr"):
1717
return (
1818
np.array(
1919
timeit(
20-
f"df.eval(s, engine={repr(engine)})",
20+
f"df.eval(s, engine={engine!r})",
2121
setup=setup_common % (n, setup_with),
2222
repeat=repeat,
2323
number=times,
@@ -34,7 +34,7 @@ def bench_subset(n, times=20, repeat=3, engine="numexpr"):
3434
return (
3535
np.array(
3636
timeit(
37-
f"df.query(s, engine={repr(engine)})",
37+
f"df.query(s, engine={engine!r})",
3838
setup=setup_common % (n, setup_subset),
3939
repeat=repeat,
4040
number=times,
@@ -55,7 +55,7 @@ def bench(mn=3, mx=7, num=100, engines=("python", "numexpr"), verbose=False):
5555
for engine in engines:
5656
for i, n in enumerate(r):
5757
if verbose & (i % 10 == 0):
58-
print(f"engine: {repr(engine)}, i == {i:d}")
58+
print(f"engine: {engine!r}, i == {i:d}")
5959
ev_times = bench_with(n, times=1, repeat=1, engine=engine)
6060
ev.loc[i, engine] = np.mean(ev_times)
6161
qu_times = bench_subset(n, times=1, repeat=1, engine=engine)

doc/source/development/contributing.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Bug reports and enhancement requests
1919
====================================
2020

2121
Bug reports and enhancement requests are an important part of making pandas more stable and
22-
are curated though Github issues. When reporting and issue or request, please select the `appropriate
22+
are curated through Github issues. When reporting an issue or request, please select the `appropriate
2323
category and fill out the issue form fully <https://github.com/pandas-dev/pandas/issues/new/choose>`_
2424
to ensure others and the core development team can fully understand the scope of the issue.
2525

doc/source/user_guide/io.rst

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1704,7 +1704,7 @@ option parameter:
17041704

17051705
.. code-block:: python
17061706
1707-
storage_options = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555"}}}
1707+
storage_options = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555"}}
17081708
df = pd.read_json("s3://pandas-test/test-1", storage_options=storage_options)
17091709
17101710
More sample configurations and documentation can be found at `S3Fs documentation
@@ -3015,14 +3015,15 @@ Read in the content of the "books.xml" as instance of ``StringIO`` or
30153015
Even read XML from AWS S3 buckets such as NIH NCBI PMC Article Datasets providing
30163016
Biomedical and Life Science Journals:
30173017

3018-
.. ipython:: python
3019-
:okwarning:
3018+
.. code-block:: python
30203019
3021-
df = pd.read_xml(
3022-
"s3://pmc-oa-opendata/oa_comm/xml/all/PMC1236943.xml",
3023-
xpath=".//journal-meta",
3024-
)
3025-
df
3020+
>>> df = pd.read_xml(
3021+
... "s3://pmc-oa-opendata/oa_comm/xml/all/PMC1236943.xml",
3022+
... xpath=".//journal-meta",
3023+
...)
3024+
>>> df
3025+
journal-id journal-title issn publisher
3026+
0 Cardiovasc Ultrasound Cardiovascular Ultrasound 1476-7120 NaN
30263027
30273028
With `lxml`_ as default ``parser``, you access the full-featured XML library
30283029
that extends Python's ElementTree API. One powerful tool is ability to query

doc/source/whatsnew/v2.3.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ Performance improvements
103103
~~~~~~~~~~~~~~~~~~~~~~~~
104104
- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
105105
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
106+
- Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
106107
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
107108
-
108109

@@ -211,6 +212,8 @@ Styler
211212

212213
Other
213214
^^^^^
215+
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)
216+
214217

215218
.. ***DO NOT USE THIS SECTION***
216219

pandas/_config/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ class DeprecatedOption(NamedTuple):
8888

8989
class RegisteredOption(NamedTuple):
9090
key: str
91-
defval: object
91+
defval: Any
9292
doc: str
9393
validator: Callable[[object], Any] | None
9494
cb: Callable[[str], Any] | None
@@ -130,7 +130,7 @@ def _get_single_key(pat: str, silent: bool) -> str:
130130
if len(keys) == 0:
131131
if not silent:
132132
_warn_if_deprecated(pat)
133-
raise OptionError(f"No such keys(s): {repr(pat)}")
133+
raise OptionError(f"No such keys(s): {pat!r}")
134134
if len(keys) > 1:
135135
raise OptionError("Pattern matched multiple keys")
136136
key = keys[0]

pandas/_config/localization.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@
1010
import platform
1111
import re
1212
import subprocess
13-
from typing import TYPE_CHECKING
13+
from typing import (
14+
TYPE_CHECKING,
15+
cast,
16+
)
1417

1518
from pandas._config.config import options
1619

@@ -152,7 +155,7 @@ def get_locales(
152155
out_locales = []
153156
for x in split_raw_locales:
154157
try:
155-
out_locales.append(str(x, encoding=options.display.encoding))
158+
out_locales.append(str(x, encoding=cast(str, options.display.encoding)))
156159
except UnicodeError:
157160
# 'locale -a' is used to populated 'raw_locales' and on
158161
# Redhat 7 Linux (and maybe others) prints locale names

pandas/_libs/groupby.pyi

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,10 @@ def group_shift_indexer(
4242
def group_fillna_indexer(
4343
out: np.ndarray, # ndarray[intp_t]
4444
labels: np.ndarray, # ndarray[int64_t]
45-
sorted_labels: npt.NDArray[np.intp],
4645
mask: npt.NDArray[np.uint8],
4746
limit: int, # int64_t
48-
dropna: bool,
47+
compute_ffill: bool,
48+
ngroups: int,
4949
) -> None: ...
5050
def group_any_all(
5151
out: np.ndarray, # uint8_t[::1]

pandas/_libs/groupby.pyx

Lines changed: 33 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -493,10 +493,10 @@ def group_shift_indexer(
493493
def group_fillna_indexer(
494494
ndarray[intp_t] out,
495495
ndarray[intp_t] labels,
496-
ndarray[intp_t] sorted_labels,
497496
ndarray[uint8_t] mask,
498497
int64_t limit,
499-
bint dropna,
498+
bint compute_ffill,
499+
int ngroups,
500500
) -> None:
501501
"""
502502
Indexes how to fill values forwards or backwards within a group.
@@ -508,50 +508,52 @@ def group_fillna_indexer(
508508
labels : np.ndarray[np.intp]
509509
Array containing unique label for each group, with its ordering
510510
matching up to the corresponding record in `values`.
511-
sorted_labels : np.ndarray[np.intp]
512-
obtained by `np.argsort(labels, kind="mergesort")`
513-
values : np.ndarray[np.uint8]
514-
Containing the truth value of each element.
515511
mask : np.ndarray[np.uint8]
516512
Indicating whether a value is na or not.
517-
limit : Consecutive values to fill before stopping, or -1 for no limit
518-
dropna : Flag to indicate if NaN groups should return all NaN values
513+
limit : int64_t
514+
Consecutive values to fill before stopping, or -1 for no limit.
515+
compute_ffill : bint
516+
Whether to compute ffill or bfill.
517+
ngroups : int
518+
Number of groups, larger than all entries of `labels`.
519519

520520
Notes
521521
-----
522522
This method modifies the `out` parameter rather than returning an object
523523
"""
524524
cdef:
525-
Py_ssize_t i, N, idx
526-
intp_t curr_fill_idx=-1
527-
int64_t filled_vals = 0
528-
529-
N = len(out)
525+
Py_ssize_t idx, N = len(out)
526+
intp_t label
527+
intp_t[::1] last = -1 * np.ones(ngroups, dtype=np.intp)
528+
intp_t[::1] fill_count = np.zeros(ngroups, dtype=np.intp)
530529

531530
# Make sure all arrays are the same size
532531
assert N == len(labels) == len(mask)
533532

534533
with nogil:
535-
for i in range(N):
536-
idx = sorted_labels[i]
537-
if dropna and labels[idx] == -1: # nan-group gets nan-values
538-
curr_fill_idx = -1
534+
# Can't use for loop with +/- step
535+
# https://github.com/cython/cython/issues/1106
536+
idx = 0 if compute_ffill else N-1
537+
for _ in range(N):
538+
label = labels[idx]
539+
if label == -1: # na-group gets na-values
540+
out[idx] = -1
539541
elif mask[idx] == 1: # is missing
540542
# Stop filling once we've hit the limit
541-
if filled_vals >= limit and limit != -1:
542-
curr_fill_idx = -1
543-
filled_vals += 1
544-
else: # reset items when not missing
545-
filled_vals = 0
546-
curr_fill_idx = idx
547-
548-
out[idx] = curr_fill_idx
549-
550-
# If we move to the next group, reset
551-
# the fill_idx and counter
552-
if i == N - 1 or labels[idx] != labels[sorted_labels[i + 1]]:
553-
curr_fill_idx = -1
554-
filled_vals = 0
543+
if limit != -1 and fill_count[label] >= limit:
544+
out[idx] = -1
545+
else:
546+
out[idx] = last[label]
547+
fill_count[label] += 1
548+
else:
549+
fill_count[label] = 0 # reset items when not missing
550+
last[label] = idx
551+
out[idx] = idx
552+
553+
if compute_ffill:
554+
idx += 1
555+
else:
556+
idx -= 1
555557

556558

557559
@cython.boundscheck(False)

0 commit comments

Comments
 (0)