Skip to content

Commit 053ef8b

Browse files
authored
Merge branch 'main' into ordered_categoricals_correlation
2 parents 16cdd21 + b0192c7 commit 053ef8b

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

44 files changed

+418
-172
lines changed

.circleci/config.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ jobs:
3434
fi
3535
python -m pip install --no-build-isolation -ve . -Csetup-args="--werror"
3636
PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH
37-
sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
3837
ci/run_tests.sh
3938
test-linux-musl:
4039
docker:

ci/code_checks.sh

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
8181
-i "pandas.Timestamp.resolution PR02" \
8282
-i "pandas.Timestamp.tzinfo GL08" \
8383
-i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
84-
-i "pandas.arrays.IntervalArray.length SA01" \
8584
-i "pandas.arrays.NumpyExtensionArray SA01" \
8685
-i "pandas.arrays.TimedeltaArray PR07,SA01" \
87-
-i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \
88-
-i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \
8986
-i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \
90-
-i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \
9187
-i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
92-
-i "pandas.core.resample.Resampler.get_group RT03,SA01" \
9388
-i "pandas.core.resample.Resampler.max PR01,RT03,SA01" \
9489
-i "pandas.core.resample.Resampler.mean SA01" \
9590
-i "pandas.core.resample.Resampler.min PR01,RT03,SA01" \
@@ -98,9 +93,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
9893
-i "pandas.core.resample.Resampler.std SA01" \
9994
-i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
10095
-i "pandas.core.resample.Resampler.var SA01" \
101-
-i "pandas.errors.NullFrequencyError SA01" \
102-
-i "pandas.errors.NumbaUtilError SA01" \
103-
-i "pandas.errors.PerformanceWarning SA01" \
10496
-i "pandas.errors.UndefinedVariableError PR01,SA01" \
10597
-i "pandas.errors.ValueLabelTypeMismatch SA01" \
10698
-i "pandas.io.json.build_table_schema PR07,RT03,SA01" \

doc/source/reference/frame.rst

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,6 @@ Reindexing / selection / label manipulation
185185
DataFrame.duplicated
186186
DataFrame.equals
187187
DataFrame.filter
188-
DataFrame.head
189188
DataFrame.idxmax
190189
DataFrame.idxmin
191190
DataFrame.reindex
@@ -196,7 +195,6 @@ Reindexing / selection / label manipulation
196195
DataFrame.sample
197196
DataFrame.set_axis
198197
DataFrame.set_index
199-
DataFrame.tail
200198
DataFrame.take
201199
DataFrame.truncate
202200

doc/source/user_guide/dsintro.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ This case is handled identically to a dict of arrays.
326326

327327
.. ipython:: python
328328
329-
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")])
329+
data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "S10")])
330330
data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
331331
332332
pd.DataFrame(data)

doc/source/whatsnew/v3.0.0.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ Other enhancements
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
5757
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
5858
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
59+
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
5960
- :meth:`Series.corr`, :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith` with ``method="kendall"`` and ``method="spearman"`` now work with ordered categorical data types (:issue:`60306`)
6061
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
6162
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
@@ -627,6 +628,7 @@ Datetimelike
627628
- Bug in :meth:`DatetimeIndex.union` and :meth:`DatetimeIndex.intersection` when ``unit`` was non-nanosecond (:issue:`59036`)
628629
- Bug in :meth:`Series.dt.microsecond` producing incorrect results for pyarrow backed :class:`Series`. (:issue:`59154`)
629630
- Bug in :meth:`to_datetime` not respecting dayfirst if an uncommon date string was passed. (:issue:`58859`)
631+
- Bug in :meth:`to_datetime` on float32 df with year, month, day etc. columns leads to precision issues and incorrect result. (:issue:`60506`)
630632
- Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`)
631633
- Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`)
632634
- Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`)
@@ -734,6 +736,7 @@ Groupby/resample/rolling
734736
- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
735737
- Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
736738
- Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
739+
- Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
737740
- Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
738741
- Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`)
739742
- Bug in :meth:`DataFrameGroupBy.cumsum` and :meth:`DataFrameGroupBy.cumprod` where ``numeric_only`` parameter was passed indirectly through kwargs instead of passing directly. (:issue:`58811`)
@@ -798,6 +801,7 @@ Other
798801
- Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`)
799802
- Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`)
800803
- Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`)
804+
- Bug in ``Series.list`` methods not preserving the original name. (:issue:`60522`)
801805
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
802806

803807
.. ***DO NOT USE THIS SECTION***

pandas/core/arrays/arrow/accessors.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,10 @@ def len(self) -> Series:
117117

118118
value_lengths = pc.list_value_length(self._pa_array)
119119
return Series(
120-
value_lengths, dtype=ArrowDtype(value_lengths.type), index=self._data.index
120+
value_lengths,
121+
dtype=ArrowDtype(value_lengths.type),
122+
index=self._data.index,
123+
name=self._data.name,
121124
)
122125

123126
def __getitem__(self, key: int | slice) -> Series:
@@ -162,7 +165,10 @@ def __getitem__(self, key: int | slice) -> Series:
162165
# key = pc.add(key, pc.list_value_length(self._pa_array))
163166
element = pc.list_element(self._pa_array, key)
164167
return Series(
165-
element, dtype=ArrowDtype(element.type), index=self._data.index
168+
element,
169+
dtype=ArrowDtype(element.type),
170+
index=self._data.index,
171+
name=self._data.name,
166172
)
167173
elif isinstance(key, slice):
168174
if pa_version_under11p0:
@@ -181,7 +187,12 @@ def __getitem__(self, key: int | slice) -> Series:
181187
if step is None:
182188
step = 1
183189
sliced = pc.list_slice(self._pa_array, start, stop, step)
184-
return Series(sliced, dtype=ArrowDtype(sliced.type), index=self._data.index)
190+
return Series(
191+
sliced,
192+
dtype=ArrowDtype(sliced.type),
193+
index=self._data.index,
194+
name=self._data.name,
195+
)
185196
else:
186197
raise ValueError(f"key must be an int or slice, got {type(key).__name__}")
187198

@@ -223,7 +234,12 @@ def flatten(self) -> Series:
223234
counts = pa.compute.list_value_length(self._pa_array)
224235
flattened = pa.compute.list_flatten(self._pa_array)
225236
index = self._data.index.repeat(counts.fill_null(pa.scalar(0, counts.type)))
226-
return Series(flattened, dtype=ArrowDtype(flattened.type), index=index)
237+
return Series(
238+
flattened,
239+
dtype=ArrowDtype(flattened.type),
240+
index=index,
241+
name=self._data.name,
242+
)
227243

228244

229245
class StructAccessor(ArrowAccessor):

pandas/core/arrays/interval.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1306,6 +1306,20 @@ def length(self) -> Index:
13061306
"""
13071307
Return an Index with entries denoting the length of each Interval.
13081308
1309+
The length of an interval is calculated as the difference between
1310+
its `right` and `left` bounds. This property is particularly useful
1311+
when working with intervals where the size of the interval is an important
1312+
attribute, such as in time-series analysis or spatial data analysis.
1313+
1314+
See Also
1315+
--------
1316+
arrays.IntervalArray.left : Return the left endpoints of each Interval in
1317+
the IntervalArray as an Index.
1318+
arrays.IntervalArray.right : Return the right endpoints of each Interval in
1319+
the IntervalArray as an Index.
1320+
arrays.IntervalArray.mid : Return the midpoint of each Interval in the
1321+
IntervalArray as an Index.
1322+
13091323
Examples
13101324
--------
13111325

pandas/core/computation/expressions.py

Lines changed: 38 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -65,23 +65,23 @@ def set_numexpr_threads(n=None) -> None:
6565
ne.set_num_threads(n)
6666

6767

68-
def _evaluate_standard(op, op_str, a, b):
68+
def _evaluate_standard(op, op_str, left_op, right_op):
6969
"""
7070
Standard evaluation.
7171
"""
7272
if _TEST_MODE:
7373
_store_test_result(False)
74-
return op(a, b)
74+
return op(left_op, right_op)
7575

7676

77-
def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
78-
"""return a boolean if we WILL be using numexpr"""
77+
def _can_use_numexpr(op, op_str, left_op, right_op, dtype_check) -> bool:
78+
"""return left_op boolean if we WILL be using numexpr"""
7979
if op_str is not None:
8080
# required min elements (otherwise we are adding overhead)
81-
if a.size > _MIN_ELEMENTS:
81+
if left_op.size > _MIN_ELEMENTS:
8282
# check for dtype compatibility
8383
dtypes: set[str] = set()
84-
for o in [a, b]:
84+
for o in [left_op, right_op]:
8585
# ndarray and Series Case
8686
if hasattr(o, "dtype"):
8787
dtypes |= {o.dtype.name}
@@ -93,43 +93,43 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check) -> bool:
9393
return False
9494

9595

96-
def _evaluate_numexpr(op, op_str, a, b):
96+
def _evaluate_numexpr(op, op_str, left_op, right_op):
9797
result = None
9898

99-
if _can_use_numexpr(op, op_str, a, b, "evaluate"):
99+
if _can_use_numexpr(op, op_str, left_op, right_op, "evaluate"):
100100
is_reversed = op.__name__.strip("_").startswith("r")
101101
if is_reversed:
102102
# we were originally called by a reversed op method
103-
a, b = b, a
103+
left_op, right_op = right_op, left_op
104104

105-
a_value = a
106-
b_value = b
105+
left_value = left_op
106+
right_value = right_op
107107

108108
try:
109109
result = ne.evaluate(
110-
f"a_value {op_str} b_value",
111-
local_dict={"a_value": a_value, "b_value": b_value},
110+
f"left_value {op_str} right_value",
111+
local_dict={"left_value": left_value, "right_value": right_value},
112112
casting="safe",
113113
)
114114
except TypeError:
115115
# numexpr raises eg for array ** array with integers
116116
# (https://github.com/pydata/numexpr/issues/379)
117117
pass
118118
except NotImplementedError:
119-
if _bool_arith_fallback(op_str, a, b):
119+
if _bool_arith_fallback(op_str, left_op, right_op):
120120
pass
121121
else:
122122
raise
123123

124124
if is_reversed:
125125
# reverse order to original for fallback
126-
a, b = b, a
126+
left_op, right_op = right_op, left_op
127127

128128
if _TEST_MODE:
129129
_store_test_result(result is not None)
130130

131131
if result is None:
132-
result = _evaluate_standard(op, op_str, a, b)
132+
result = _evaluate_standard(op, op_str, left_op, right_op)
133133

134134
return result
135135

@@ -170,24 +170,24 @@ def _evaluate_numexpr(op, op_str, a, b):
170170
}
171171

172172

173-
def _where_standard(cond, a, b):
173+
def _where_standard(cond, left_op, right_op):
174174
# Caller is responsible for extracting ndarray if necessary
175-
return np.where(cond, a, b)
175+
return np.where(cond, left_op, right_op)
176176

177177

178-
def _where_numexpr(cond, a, b):
178+
def _where_numexpr(cond, left_op, right_op):
179179
# Caller is responsible for extracting ndarray if necessary
180180
result = None
181181

182-
if _can_use_numexpr(None, "where", a, b, "where"):
182+
if _can_use_numexpr(None, "where", left_op, right_op, "where"):
183183
result = ne.evaluate(
184184
"where(cond_value, a_value, b_value)",
185-
local_dict={"cond_value": cond, "a_value": a, "b_value": b},
185+
local_dict={"cond_value": cond, "a_value": left_op, "b_value": right_op},
186186
casting="safe",
187187
)
188188

189189
if result is None:
190-
result = _where_standard(cond, a, b)
190+
result = _where_standard(cond, left_op, right_op)
191191

192192
return result
193193

@@ -206,13 +206,13 @@ def _has_bool_dtype(x):
206206
_BOOL_OP_UNSUPPORTED = {"+": "|", "*": "&", "-": "^"}
207207

208208

209-
def _bool_arith_fallback(op_str, a, b) -> bool:
209+
def _bool_arith_fallback(op_str, left_op, right_op) -> bool:
210210
"""
211211
Check if we should fallback to the python `_evaluate_standard` in case
212212
of an unsupported operation by numexpr, which is the case for some
213213
boolean ops.
214214
"""
215-
if _has_bool_dtype(a) and _has_bool_dtype(b):
215+
if _has_bool_dtype(left_op) and _has_bool_dtype(right_op):
216216
if op_str in _BOOL_OP_UNSUPPORTED:
217217
warnings.warn(
218218
f"evaluating in Python space because the {op_str!r} "
@@ -224,40 +224,43 @@ def _bool_arith_fallback(op_str, a, b) -> bool:
224224
return False
225225

226226

227-
def evaluate(op, a, b, use_numexpr: bool = True):
227+
def evaluate(op, left_op, right_op, use_numexpr: bool = True):
228228
"""
229-
Evaluate and return the expression of the op on a and b.
229+
Evaluate and return the expression of the op on left_op and right_op.
230230
231231
Parameters
232232
----------
233233
op : the actual operand
234-
a : left operand
235-
b : right operand
234+
left_op : left operand
235+
right_op : right operand
236236
use_numexpr : bool, default True
237237
Whether to try to use numexpr.
238238
"""
239239
op_str = _op_str_mapping[op]
240240
if op_str is not None:
241241
if use_numexpr:
242242
# error: "None" not callable
243-
return _evaluate(op, op_str, a, b) # type: ignore[misc]
244-
return _evaluate_standard(op, op_str, a, b)
243+
return _evaluate(op, op_str, left_op, right_op) # type: ignore[misc]
244+
return _evaluate_standard(op, op_str, left_op, right_op)
245245

246246

247-
def where(cond, a, b, use_numexpr: bool = True):
247+
def where(cond, left_op, right_op, use_numexpr: bool = True):
248248
"""
249-
Evaluate the where condition cond on a and b.
249+
Evaluate the where condition cond on left_op and right_op.
250250
251251
Parameters
252252
----------
253253
cond : np.ndarray[bool]
254-
a : return if cond is True
255-
b : return if cond is False
254+
left_op : return if cond is True
255+
right_op : return if cond is False
256256
use_numexpr : bool, default True
257257
Whether to try to use numexpr.
258258
"""
259259
assert _where is not None
260-
return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b)
260+
if use_numexpr:
261+
return _where(cond, left_op, right_op)
262+
else:
263+
return _where_standard(cond, left_op, right_op)
261264

262265

263266
def set_test_mode(v: bool = True) -> None:

0 commit comments

Comments (0)