Skip to content

Commit baa80ce

Browse files
committed
Merge branch 'bugfix-spss-kwargs' of https://github.com/astronights/pandas into bugfix-spss-kwargs
2 parents 61e298a + 24a7ee7 commit baa80ce

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+689
-696
lines changed

ci/code_checks.sh

Lines changed: 2 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -65,84 +65,38 @@ fi
6565
### DOCSTRINGS ###
6666
if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
6767

68-
MSG='Validate docstrings (EX01, EX02, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
69-
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX02,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
68+
MSG='Validate docstrings (EX01, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
69+
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
7070
RET=$(($RET + $?)) ; echo $MSG "DONE"
7171

7272
MSG='Partially validate docstrings (EX03)' ; echo $MSG
7373
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
74-
pandas.Series.dt.day_name \
75-
pandas.Series.str.len \
76-
pandas.Series.cat.set_categories \
77-
pandas.Series.plot.bar \
78-
pandas.Series.plot.hist \
7974
pandas.Series.plot.line \
8075
pandas.Series.to_sql \
81-
pandas.Series.to_latex \
82-
pandas.errors.CategoricalConversionWarning \
83-
pandas.errors.ChainedAssignmentError \
84-
pandas.errors.ClosedFileError \
8576
pandas.errors.DatabaseError \
8677
pandas.errors.IndexingError \
8778
pandas.errors.InvalidColumnName \
88-
pandas.errors.NumExprClobberingError \
89-
pandas.errors.PossibleDataLossError \
90-
pandas.errors.PossiblePrecisionLoss \
91-
pandas.errors.SettingWithCopyError \
9279
pandas.errors.SettingWithCopyWarning \
9380
pandas.errors.SpecificationError \
9481
pandas.errors.UndefinedVariableError \
95-
pandas.errors.ValueLabelTypeMismatch \
9682
pandas.Timestamp.ceil \
9783
pandas.Timestamp.floor \
9884
pandas.Timestamp.round \
99-
pandas.read_pickle \
100-
pandas.ExcelWriter \
10185
pandas.read_json \
10286
pandas.io.json.build_table_schema \
103-
pandas.DataFrame.to_latex \
10487
pandas.io.formats.style.Styler.to_latex \
10588
pandas.read_parquet \
10689
pandas.DataFrame.to_sql \
10790
pandas.read_stata \
108-
pandas.core.resample.Resampler.pipe \
109-
pandas.core.resample.Resampler.fillna \
110-
pandas.core.resample.Resampler.interpolate \
11191
pandas.plotting.scatter_matrix \
112-
pandas.pivot \
113-
pandas.merge_asof \
114-
pandas.wide_to_long \
115-
pandas.Index.rename \
11692
pandas.Index.droplevel \
117-
pandas.Index.isin \
118-
pandas.CategoricalIndex.set_categories \
11993
pandas.MultiIndex.names \
12094
pandas.MultiIndex.droplevel \
121-
pandas.IndexSlice \
122-
pandas.DatetimeIndex.month_name \
123-
pandas.DatetimeIndex.day_name \
124-
pandas.core.window.rolling.Rolling.corr \
12595
pandas.Grouper \
126-
pandas.core.groupby.SeriesGroupBy.apply \
127-
pandas.core.groupby.DataFrameGroupBy.apply \
128-
pandas.core.groupby.SeriesGroupBy.transform \
129-
pandas.core.groupby.SeriesGroupBy.pipe \
130-
pandas.core.groupby.DataFrameGroupBy.pipe \
131-
pandas.core.groupby.DataFrameGroupBy.describe \
132-
pandas.core.groupby.DataFrameGroupBy.idxmax \
133-
pandas.core.groupby.DataFrameGroupBy.idxmin \
134-
pandas.core.groupby.DataFrameGroupBy.value_counts \
135-
pandas.core.groupby.SeriesGroupBy.describe \
136-
pandas.core.groupby.DataFrameGroupBy.boxplot \
137-
pandas.core.groupby.DataFrameGroupBy.hist \
13896
pandas.io.formats.style.Styler.map \
13997
pandas.io.formats.style.Styler.apply_index \
14098
pandas.io.formats.style.Styler.map_index \
14199
pandas.io.formats.style.Styler.format \
142-
pandas.io.formats.style.Styler.format_index \
143-
pandas.io.formats.style.Styler.relabel_index \
144-
pandas.io.formats.style.Styler.hide \
145-
pandas.io.formats.style.Styler.set_td_classes \
146100
pandas.io.formats.style.Styler.set_tooltips \
147101
pandas.io.formats.style.Styler.set_uuid \
148102
pandas.io.formats.style.Styler.pipe \
@@ -152,20 +106,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
152106
pandas.io.formats.style.Styler.text_gradient \
153107
pandas.DataFrame.values \
154108
pandas.DataFrame.groupby \
155-
pandas.DataFrame.skew \
156-
pandas.DataFrame.var \
157-
pandas.DataFrame.idxmax \
158-
pandas.DataFrame.idxmin \
159-
pandas.DataFrame.last \
160-
pandas.DataFrame.pivot \
161109
pandas.DataFrame.sort_values \
162-
pandas.DataFrame.tz_convert \
163-
pandas.DataFrame.tz_localize \
164-
pandas.DataFrame.plot.bar \
165110
pandas.DataFrame.plot.hexbin \
166-
pandas.DataFrame.plot.hist \
167111
pandas.DataFrame.plot.line \
168-
pandas.DataFrame.hist \
169112
RET=$(($RET + $?)) ; echo $MSG "DONE"
170113

171114
fi

doc/source/whatsnew/v2.3.0.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ Deprecations
101101

102102
Performance improvements
103103
~~~~~~~~~~~~~~~~~~~~~~~~
104+
- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
105+
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
104106
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
105107
-
106108

@@ -119,7 +121,7 @@ Categorical
119121

120122
Datetimelike
121123
^^^^^^^^^^^^
122-
-
124+
- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
123125
-
124126

125127
Timedelta

pandas/_libs/tslibs/offsets.pyx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4860,15 +4860,15 @@ cpdef to_offset(freq, bint is_period=False):
48604860

48614861
tups = zip(split[0::4], split[1::4], split[2::4])
48624862
for n, (sep, stride, name) in enumerate(tups):
4863-
if is_period is False and name in c_OFFSET_DEPR_FREQSTR:
4863+
if is_period is False and name.upper() in c_OFFSET_DEPR_FREQSTR:
48644864
warnings.warn(
48654865
f"\'{name}\' is deprecated and will be removed "
48664866
f"in a future version, please use "
4867-
f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.",
4867+
f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.",
48684868
FutureWarning,
48694869
stacklevel=find_stack_level(),
48704870
)
4871-
name = c_OFFSET_DEPR_FREQSTR[name]
4871+
name = c_OFFSET_DEPR_FREQSTR[name.upper()]
48724872
if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR:
48734873
if name.startswith("Y"):
48744874
raise ValueError(

pandas/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,4 +1973,7 @@ def warsaw(request) -> str:
19731973

19741974
@pytest.fixture
19751975
def arrow_string_storage():
1976+
"""
1977+
Fixture that lists possible PyArrow values for StringDtype storage field.
1978+
"""
19761979
return ("pyarrow", "pyarrow_numpy")

pandas/core/arrays/categorical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1082,7 +1082,7 @@ def set_categories(
10821082
For :class:`pandas.Series`:
10831083
10841084
>>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'],
1085-
... categories=['a', 'b', 'c'], ordered=True)
1085+
... categories=['a', 'b', 'c'], ordered=True)
10861086
>>> ser = pd.Series(raw_cat)
10871087
>>> ser
10881088
0 a

pandas/core/arrays/datetimes.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1365,7 +1365,7 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]:
13651365
>>> idx
13661366
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'],
13671367
dtype='datetime64[ns]', freq='D')
1368-
>>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP
1368+
>>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP
13691369
Index(['Segunda', 'Terça', 'Quarta'], dtype='object')
13701370
"""
13711371
values = self._local_timestamps()
@@ -2775,11 +2775,6 @@ def _generate_range(
27752775
# variable has type "Optional[Timestamp]")
27762776
start = offset.rollforward(start) # type: ignore[assignment]
27772777

2778-
elif end and not offset.is_on_offset(end):
2779-
# Incompatible types in assignment (expression has type "datetime",
2780-
# variable has type "Optional[Timestamp]")
2781-
end = offset.rollback(end) # type: ignore[assignment]
2782-
27832778
# Unsupported operand types for < ("Timestamp" and "None")
27842779
if periods is None and end < start and offset.n >= 0: # type: ignore[operator]
27852780
end = None

pandas/core/frame.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9223,11 +9223,11 @@ def groupby(
92239223
You could also assign a list of column names or a list of index names.
92249224
92259225
>>> df = pd.DataFrame({
9226-
... "lev1": [1, 1, 1, 2, 2, 2],
9227-
... "lev2": [1, 1, 2, 1, 1, 2],
9228-
... "lev3": [1, 2, 1, 2, 1, 2],
9229-
... "lev4": [1, 2, 3, 4, 5, 6],
9230-
... "values": [0, 1, 2, 3, 4, 5]})
9226+
... "lev1": [1, 1, 1, 2, 2, 2],
9227+
... "lev2": [1, 1, 2, 1, 1, 2],
9228+
... "lev3": [1, 2, 1, 2, 1, 2],
9229+
... "lev4": [1, 2, 3, 4, 5, 6],
9230+
... "values": [0, 1, 2, 3, 4, 5]})
92319231
>>> df
92329232
lev1 lev2 lev3 lev4 values
92339233
0 1 1 1 1 0

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3544,7 +3544,7 @@ def to_latex(
35443544
>>> print(df.to_latex(index=False,
35453545
... formatters={"name": str.upper},
35463546
... float_format="{:.1f}".format,
3547-
... )) # doctest: +SKIP
3547+
... )) # doctest: +SKIP
35483548
\begin{tabular}{lrr}
35493549
\toprule
35503550
name & age & height \\

pandas/core/groupby/groupby.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -444,14 +444,15 @@ class providing the base-class of operations.
444444
a `(callable, data_keyword)` tuple where `data_keyword` is a
445445
string indicating the keyword of `callable` that expects the
446446
%(klass)s object.
447-
args : iterable, optional
447+
*args : iterable, optional
448448
Positional arguments passed into `func`.
449-
kwargs : dict, optional
449+
**kwargs : dict, optional
450450
A dictionary of keyword arguments passed into `func`.
451451
452452
Returns
453453
-------
454-
the return type of `func`.
454+
%(klass)s
455+
The original object with the function `func` applied.
455456
456457
See Also
457458
--------

pandas/core/indexes/base.py

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1953,7 +1953,7 @@ def rename(self, name, inplace: bool = False) -> Self | None:
19531953
19541954
>>> idx = pd.MultiIndex.from_product([['python', 'cobra'],
19551955
... [2018, 2019]],
1956-
... names=['kind', 'year'])
1956+
... names=['kind', 'year'])
19571957
>>> idx
19581958
MultiIndex([('python', 2018),
19591959
('python', 2019),
@@ -4809,11 +4809,18 @@ def _join_non_unique(
48094809
left_idx, right_idx = get_join_indexers_non_unique(
48104810
self._values, other._values, how=how, sort=sort
48114811
)
4812-
mask = left_idx == -1
48134812

4814-
join_idx = self.take(left_idx)
4815-
right = other.take(right_idx)
4816-
join_index = join_idx.putmask(mask, right)
4813+
if how == "right":
4814+
join_index = other.take(right_idx)
4815+
else:
4816+
join_index = self.take(left_idx)
4817+
4818+
if how == "outer":
4819+
mask = left_idx == -1
4820+
if mask.any():
4821+
right = other.take(right_idx)
4822+
join_index = join_index.putmask(mask, right)
4823+
48174824
if isinstance(join_index, ABCMultiIndex) and how == "outer":
48184825
# test_join_index_levels
48194826
join_index = join_index._sort_levels_monotonic()
@@ -4989,35 +4996,29 @@ def _join_monotonic(
49894996
ridx: npt.NDArray[np.intp] | None
49904997
lidx: npt.NDArray[np.intp] | None
49914998

4992-
if self.is_unique and other.is_unique:
4993-
# We can perform much better than the general case
4994-
if how == "left":
4999+
if how == "left":
5000+
if other.is_unique:
5001+
# We can perform much better than the general case
49955002
join_index = self
49965003
lidx = None
49975004
ridx = self._left_indexer_unique(other)
4998-
elif how == "right":
5005+
else:
5006+
join_array, lidx, ridx = self._left_indexer(other)
5007+
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5008+
elif how == "right":
5009+
if self.is_unique:
5010+
# We can perform much better than the general case
49995011
join_index = other
50005012
lidx = other._left_indexer_unique(self)
50015013
ridx = None
5002-
elif how == "inner":
5003-
join_array, lidx, ridx = self._inner_indexer(other)
5004-
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5005-
elif how == "outer":
5006-
join_array, lidx, ridx = self._outer_indexer(other)
5007-
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5008-
else:
5009-
if how == "left":
5010-
join_array, lidx, ridx = self._left_indexer(other)
5011-
elif how == "right":
5014+
else:
50125015
join_array, ridx, lidx = other._left_indexer(self)
5013-
elif how == "inner":
5014-
join_array, lidx, ridx = self._inner_indexer(other)
5015-
elif how == "outer":
5016-
join_array, lidx, ridx = self._outer_indexer(other)
5017-
5018-
assert lidx is not None
5019-
assert ridx is not None
5020-
5016+
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5017+
elif how == "inner":
5018+
join_array, lidx, ridx = self._inner_indexer(other)
5019+
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5020+
elif how == "outer":
5021+
join_array, lidx, ridx = self._outer_indexer(other)
50215022
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
50225023

50235024
lidx = None if lidx is None else ensure_platform_int(lidx)
@@ -6574,7 +6575,7 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]:
65746575
65756576
Examples
65766577
--------
6577-
>>> idx = pd.Index([1,2,3])
6578+
>>> idx = pd.Index([1, 2, 3])
65786579
>>> idx
65796580
Index([1, 2, 3], dtype='int64')
65806581
@@ -6583,7 +6584,7 @@ def isin(self, values, level=None) -> npt.NDArray[np.bool_]:
65836584
>>> idx.isin([1, 4])
65846585
array([ True, False, False])
65856586
6586-
>>> midx = pd.MultiIndex.from_arrays([[1,2,3],
6587+
>>> midx = pd.MultiIndex.from_arrays([[1, 2, 3],
65876588
... ['red', 'blue', 'green']],
65886589
... names=('number', 'color'))
65896590
>>> midx

0 commit comments

Comments
 (0)