Commit d510052

Merge branch 'main' into fix/group_by_agg_pyarrow_bool_numpy_same_type

Author: Kei (committed)
2 parents: 82ddeb5 + f4232e7 (commit d510052)

File tree: 26 files changed (+192 -128 lines)


.github/workflows/wheels.yml
Lines changed: 1 addition & 2 deletions

@@ -139,8 +139,7 @@ jobs:
         shell: bash -el {0}
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"

-      - name: Build normal wheels
-        if: ${{ (env.IS_SCHEDULE_DISPATCH != 'true' || env.IS_PUSH == 'true') }}
+      - name: Build wheels
         uses: pypa/cibuildwheel@v2.17.0
         with:
           package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}

doc/source/whatsnew/v3.0.0.rst
Lines changed: 2 additions & 0 deletions

@@ -297,6 +297,8 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`)
 - :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`)
+- :class:`Series` returns a :class:`RangeIndex` index when possible when ``data`` is a ``dict`` (:issue:`58118`)
+- :func:`concat` returns a :class:`RangeIndex` column when possible when ``objs`` contains :class:`Series` and :class:`DataFrame` and ``axis=0`` (:issue:`58119`)
 - :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
 - :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
 - :meth:`Series.str.extract` returns a :class:`RangeIndex` columns instead of an :class:`Index` column when possible (:issue:`57542`)
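Note: as a rough illustration of the new :issue:`58118` entry above (output assumes a build that includes this merge), constructing a Series from a dict whose keys form a consecutive integer sequence now yields a RangeIndex rather than a materialized integer Index:

>>> import pandas as pd
>>> pd.Series({0: "a", 1: "b", 2: "c"}).index
RangeIndex(start=0, stop=3, step=1)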

pandas/core/accessor.py
Lines changed: 94 additions & 42 deletions

@@ -231,7 +231,7 @@ def __get__(self, obj, cls):
         return accessor_obj


-@doc(klass="", others="")
+@doc(klass="", examples="", others="")
 def _register_accessor(name: str, cls):
     """
     Register a custom accessor on {klass} objects.
@@ -255,51 +255,26 @@ def _register_accessor(name: str, cls):

     Notes
     -----
-    When accessed, your accessor will be initialized with the pandas object
-    the user is interacting with. So the signature must be
+    This function allows you to register a custom-defined accessor class for {klass}.
+    The requirements for the accessor class are as follows:

-    .. code-block:: python
+    * Must contain an init method that:

-        def __init__(self, pandas_object):  # noqa: E999
-            ...
+        * accepts a single {klass} object

-    For consistency with pandas methods, you should raise an ``AttributeError``
-    if the data passed to your accessor has an incorrect dtype.
+        * raises an AttributeError if the {klass} object does not have correctly
+          matching inputs for the accessor

-    >>> pd.Series(["a", "b"]).dt
-    Traceback (most recent call last):
-    ...
-    AttributeError: Can only use .dt accessor with datetimelike values
+    * Must contain a method for each access pattern.

-    Examples
-    --------
-    In your library code::
-
-        @pd.api.extensions.register_dataframe_accessor("geo")
-        class GeoAccessor:
-            def __init__(self, pandas_obj):
-                self._obj = pandas_obj
-
-            @property
-            def center(self):
-                # return the geographic center point of this DataFrame
-                lat = self._obj.latitude
-                lon = self._obj.longitude
-                return (float(lon.mean()), float(lat.mean()))
+        * The methods should be able to take any argument signature.

-            def plot(self):
-                # plot this array's data on a map, e.g., using Cartopy
-                pass
+        * Accessible using the @property decorator if no additional arguments are
+          needed.

-    Back in an interactive IPython session:
-
-        .. code-block:: ipython
-
-            In [1]: ds = pd.DataFrame({{"longitude": np.linspace(0, 10),
-               ...:                     "latitude": np.linspace(0, 20)}})
-            In [2]: ds.geo.center
-            Out[2]: (5.0, 10.0)
-            In [3]: ds.geo.plot()  # plots data on a map
+    Examples
+    --------
+    {examples}
     """

     def decorator(accessor):
@@ -318,21 +293,98 @@ def decorator(accessor):
     return decorator


-@doc(_register_accessor, klass="DataFrame")
+_register_df_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_dataframe_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not all(pandas_obj[col].dtype == 'int64' for col in pandas_obj.columns):
+...             raise AttributeError("All columns must contain integer values only")
+...         self._obj = pandas_obj
+...
+...     def sum(self):
+...         return self._obj.sum()
+...
+>>> df = pd.DataFrame([[1, 2], ['x', 'y']])
+>>> df.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: All columns must contain integer values only.
+>>> df = pd.DataFrame([[1, 2], [3, 4]])
+>>> df.int_accessor.sum()
+0    4
+1    6
+dtype: int64"""
+
+
+@doc(_register_accessor, klass="DataFrame", examples=_register_df_examples)
 def register_dataframe_accessor(name: str):
     from pandas import DataFrame

     return _register_accessor(name, DataFrame)


-@doc(_register_accessor, klass="Series")
+_register_series_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_series_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not pandas_obj.dtype == 'int64':
+...             raise AttributeError("The series must contain integer data only")
+...         self._obj = pandas_obj
+...
+...     def sum(self):
+...         return self._obj.sum()
+...
+>>> df = pd.Series([1, 2, 'x'])
+>>> df.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: The series must contain integer data only.
+>>> df = pd.Series([1, 2, 3])
+>>> df.int_accessor.sum()
+6"""
+
+
+@doc(_register_accessor, klass="Series", examples=_register_series_examples)
 def register_series_accessor(name: str):
     from pandas import Series

     return _register_accessor(name, Series)


-@doc(_register_accessor, klass="Index")
+_register_index_examples = """
+An accessor that only accepts integers could
+have a class defined like this:
+
+>>> @pd.api.extensions.register_index_accessor("int_accessor")
+... class IntAccessor:
+...     def __init__(self, pandas_obj):
+...         if not all(isinstance(x, int) for x in pandas_obj):
+...             raise AttributeError("The index must only be an integer value")
+...         self._obj = pandas_obj
+...
+...     def even(self):
+...         return [x for x in self._obj if x % 2 == 0]
+>>> df = pd.DataFrame.from_dict(
+...     {"row1": {"1": 1, "2": "a"}, "row2": {"1": 2, "2": "b"}}, orient="index"
+... )
+>>> df.index.int_accessor
+Traceback (most recent call last):
+...
+AttributeError: The index must only be an integer value.
+>>> df = pd.DataFrame(
+...     {"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]}, index=[1, 2, 5, 8]
+... )
+>>> df.index.int_accessor.even()
+[2, 8]"""
+
+
+@doc(_register_accessor, klass="Index", examples=_register_index_examples)
 def register_index_accessor(name: str):
     from pandas import Index
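Note: the rewritten Notes section mentions exposing an accessor through the @property decorator when no extra arguments are needed. A minimal sketch of that pattern follows; the "stats" accessor and its "span" property are illustrative names, not part of this commit:

>>> import pandas as pd
>>> @pd.api.extensions.register_series_accessor("stats")
... class StatsAccessor:
...     def __init__(self, pandas_obj):
...         self._obj = pandas_obj
...
...     @property
...     def span(self):
...         # no arguments needed, so a property reads naturally
...         return self._obj.max() - self._obj.min()
...
>>> int(pd.Series([1, 5, 3]).stats.span)
4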

pandas/core/arrays/arrow/array.py
Lines changed: 1 addition & 1 deletion

@@ -1697,7 +1697,7 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
         except (AttributeError, NotImplementedError, TypeError) as err:
             msg = (
                 f"'{type(self).__name__}' with dtype {self.dtype} "
-                f"does not support reduction '{name}' with pyarrow "
+                f"does not support operation '{name}' with pyarrow "
                 f"version {pa.__version__}. '{name}' may be supported by "
                 f"upgrading pyarrow."
             )

pandas/core/arrays/base.py
Lines changed: 2 additions & 2 deletions

@@ -1886,7 +1886,7 @@ def _reduce(

         Raises
         ------
-        TypeError : subclass does not define reductions
+        TypeError : subclass does not define operations

         Examples
         --------
@@ -1897,7 +1897,7 @@ def _reduce(
         if meth is None:
             raise TypeError(
                 f"'{type(self).__name__}' with dtype {self.dtype} "
-                f"does not support reduction '{name}'"
+                f"does not support operation '{name}'"
             )
         result = meth(skipna=skipna, **kwargs)
         if keepdims:

pandas/core/arrays/datetimelike.py
Lines changed: 1 addition & 1 deletion

@@ -1662,7 +1662,7 @@ def _groupby_op(
         if dtype.kind == "M":
             # Adding/multiplying datetimes is not valid
             if how in ["any", "all", "sum", "prod", "cumsum", "cumprod", "var", "skew"]:
-                raise TypeError(f"datetime64 type does not support operation: '{how}'")
+                raise TypeError(f"datetime64 type does not support operation '{how}'")

         elif isinstance(dtype, PeriodDtype):
             # Adding/multiplying Periods is not valid

pandas/core/indexes/base.py
Lines changed: 10 additions & 10 deletions

@@ -176,7 +176,6 @@
 )
 from pandas.core.missing import clean_reindex_fill_method
 from pandas.core.ops import get_op_result_name
-from pandas.core.ops.invalid import make_invalid_op
 from pandas.core.sorting import (
     ensure_key_mapped,
     get_group_index_sorter,
@@ -6453,6 +6452,10 @@ def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]:
         >>> idx = pd.Index(list("abcd"))
         >>> idx.slice_locs(start="b", end="c")
         (1, 3)
+
+        >>> idx = pd.Index(list("bcde"))
+        >>> idx.slice_locs(start="a", end="c")
+        (0, 2)
         """
         inc = step is None or step >= 0

@@ -6938,14 +6941,8 @@ def _maybe_disable_logical_methods(self, opname: str_t) -> None:
         """
         raise if this Index subclass does not support any or all.
         """
-        if (
-            isinstance(self, ABCMultiIndex)
-            # TODO(3.0): PeriodArray and DatetimeArray any/all will raise,
-            # so checking needs_i8_conversion will be unnecessary
-            or (needs_i8_conversion(self.dtype) and self.dtype.kind != "m")
-        ):
-            # This call will raise
-            make_invalid_op(opname)(self)
+        if isinstance(self, ABCMultiIndex):
+            raise TypeError(f"cannot perform {opname} with {type(self).__name__}")

     @Appender(IndexOpsMixin.argmin.__doc__)
     def argmin(self, axis=None, skipna: bool = True, *args, **kwargs) -> int:
@@ -7144,7 +7141,10 @@ def maybe_sequence_to_range(sequence) -> Any | range:
         return sequence
     if len(sequence) == 0:
         return range(0)
-    np_sequence = np.asarray(sequence, dtype=np.int64)
+    try:
+        np_sequence = np.asarray(sequence, dtype=np.int64)
+    except OverflowError:
+        return sequence
     diff = np_sequence[1] - np_sequence[0]
     if diff == 0:
         return sequence
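Note: a sketch of what the new try/except in maybe_sequence_to_range guards against. This is a private helper, so the direct call is for illustration only, and the outputs assume a build with this change: integer keys that fit int64 can still become a range, while keys too large for int64 now fall back to the original sequence instead of raising OverflowError.

>>> from pandas.core.indexes.base import maybe_sequence_to_range
>>> maybe_sequence_to_range((1, 2, 3))
range(1, 4)
>>> maybe_sequence_to_range((1, 2**70))  # too large for int64; falls back
(1, 1180591620717411303424)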

pandas/core/nanops.py
Lines changed: 2 additions & 2 deletions

@@ -520,7 +520,7 @@ def nanany(

     if values.dtype.kind == "M":
         # GH#34479
-        raise TypeError("datetime64 type does not support operation: 'any'")
+        raise TypeError("datetime64 type does not support operation 'any'")

     values, _ = _get_values(values, skipna, fill_value=False, mask=mask)

@@ -576,7 +576,7 @@ def nanall(

     if values.dtype.kind == "M":
         # GH#34479
-        raise TypeError("datetime64 type does not support operation: 'all'")
+        raise TypeError("datetime64 type does not support operation 'all'")

     values, _ = _get_values(values, skipna, fill_value=True, mask=mask)

pandas/core/reshape/concat.py
Lines changed: 5 additions & 2 deletions

@@ -518,8 +518,11 @@ def _sanitize_mixed_ndim(
                 # to have unique names
                 name = current_column
                 current_column += 1
-
-                obj = sample._constructor({name: obj}, copy=False)
+                obj = sample._constructor(obj, copy=False)
+                if isinstance(obj, ABCDataFrame):
+                    obj.columns = range(name, name + 1, 1)
+                else:
+                    obj = sample._constructor({name: obj}, copy=False)

             new_objs.append(obj)
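Note: a sketch of the user-visible effect this hunk is after (see the :issue:`58119` whatsnew entry above). Whether the columns stay a RangeIndex depends on the inputs; the output below assumes a build with this change and the simplest mixed Series/DataFrame case:

>>> import pandas as pd
>>> result = pd.concat([pd.Series([1, 2]), pd.DataFrame([3, 4])])
>>> result.columns
RangeIndex(start=0, stop=1, step=1)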

pandas/core/series.py
Lines changed: 2 additions & 3 deletions

@@ -132,6 +132,7 @@
     PeriodIndex,
     default_index,
     ensure_index,
+    maybe_sequence_to_range,
 )
 import pandas.core.indexes.base as ibase
 from pandas.core.indexes.multi import maybe_droplevels
@@ -538,16 +539,14 @@ def _init_dict(
         _data : BlockManager for the new Series
         index : index for the new Series
         """
-        keys: Index | tuple
-
         # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
         # raises KeyError), so we iterate the entire dict, and align
         if data:
             # GH:34717, issue was using zip to extract key and values from data.
             # using generators in effects the performance.
             # Below is the new way of extracting the keys and values

-            keys = tuple(data.keys())
+            keys = maybe_sequence_to_range(tuple(data.keys()))
             values = list(data.values())  # Generating list of values- faster way
         elif index is not None:
             # fastpath for Series(data=None). Just use broadcasting a scalar
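Note: routing the dict keys through maybe_sequence_to_range only converts them when they form an equally spaced integer sequence; otherwise the previous behavior is kept. A hedged sketch of the fallback case (compare the RangeIndex example under the whatsnew entry above; output assumes a build with this change):

>>> import pandas as pd
>>> pd.Series({0: 10, 2: 20, 5: 30}).index
Index([0, 2, 5], dtype='int64')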
