diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index bf919c6fe8a42..cef0e68874a10 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -9,8 +9,6 @@
 labeling information
 """
 
-from __future__ import annotations
-
 import collections
 from collections import abc
 from collections.abc import (
@@ -209,7 +207,6 @@
     AnyAll,
     AnyArrayLike,
     ArrayLike,
-    Axes,
     Axis,
     AxisInt,
     ColspaceArgType,
@@ -303,8 +300,7 @@
 
 A named Series object is treated as a DataFrame with a single named column.
 
-The join is done on columns or indexes. If joining columns on
-columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
+The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
 on indexes or indexes on a column or columns, the index will be passed on.
 When performing a cross merge, no column specifications to merge on are
 allowed.
@@ -509,6 +505,35 @@
 # -----------------------------------------------------------------------
 # DataFrame class
 
+# Define type aliases to avoid import cycle issues
+from typing import Union, Sequence, TypeVar, Protocol, Iterator, Any, overload, Mapping, Callable
+from os import PathLike
+
+_T_co = TypeVar("_T_co", covariant=True)
+AnyStr = TypeVar("AnyStr", str, bytes)
+
+# Define protocols to avoid import cycles
+class SequenceNotStr(Protocol[_T_co]):
+    def __getitem__(self, index) -> Any: ...
+    def __contains__(self, value) -> bool: ...
+    def __len__(self) -> int: ...
+    def __iter__(self) -> Iterator[_T_co]: ...
+
+class BaseBuffer(Protocol):
+    @property
+    def mode(self) -> str: ...
+    def seek(self, __offset: int, __whence: int = ...) -> int: ...
+    def tell(self) -> int: ...
+
+class WriteBuffer(BaseBuffer, Protocol[AnyStr]):
+    def write(self, __b: AnyStr) -> Any: ...
+    def flush(self) -> Any: ...
+
+Axes = Union[Sequence[Hashable], Index]
+FormattersType = Union[list[Callable], tuple[Callable, ...], Mapping[Union[str, int], Callable]]
+FloatFormatType = Union[str, Callable, None]
+FilePath = Union[str, PathLike[str]]
+
 
 @set_module("pandas")
 class DataFrame(NDFrame, OpsMixin):
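A note on the protocol shims above: they rely on Python's structural typing, so any object providing the listed methods satisfies the type without inheriting from anything in pandas. A minimal, self-contained sketch of the same pattern (the `StrSink` name and `dump` helper are illustrative only, not part of this patch):

```python
import io
from typing import Protocol, runtime_checkable


@runtime_checkable
class StrSink(Protocol):
    """Structural type: anything with write() and flush() qualifies."""

    def write(self, s: str) -> int: ...
    def flush(self) -> None: ...


def dump(buf: StrSink, rows: list[str]) -> None:
    # A type checker accepts any conforming object; no subclassing needed.
    for row in rows:
        buf.write(row + "\n")
    buf.flush()


buf = io.StringIO()  # StringIO never declares StrSink, yet it conforms
dump(buf, ["a,b", "1,2"])
assert isinstance(buf, StrSink)  # only possible due to @runtime_checkable
print(buf.getvalue())
```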
@@ -661,21 +686,32 @@ class DataFrame(NDFrame, OpsMixin):
     __pandas_priority__ = 4000
 
     @property
-    def _constructor(self) -> type[DataFrame]:
-        return DataFrame
+    def _constructor(self):
+        return type(self)
 
-    def _constructor_from_mgr(self, mgr, axes) -> DataFrame:
-        df = DataFrame._from_mgr(mgr, axes=axes)
-
-        if type(self) is DataFrame:
-            # This would also work `if self._constructor is DataFrame`, but
-            # this check is slightly faster, benefiting the most-common case.
-            return df
-
-        elif type(self).__name__ == "GeoDataFrame":
-            # Shim until geopandas can override their _constructor_from_mgr
-            # bc they have different behavior for Managers than for DataFrames
-            return self._constructor(mgr)
 
     # We assume that the subclass __init__ knows how to handle a
     # pd.DataFrame object.
@@ -702,10 +738,10 @@ def _constructor_sliced_from_mgr(self, mgr, axes) -> Series:
     def __init__(
         self,
         data=None,
-        index: Axes | None = None,
-        columns: Axes | None = None,
-        dtype: Dtype | None = None,
-        copy: bool | None = None,
+        index=None,
+        columns=None,
+        dtype=None,
+        copy=None,
     ) -> None:
         allow_mgr = False
         if dtype is not None:
@@ -902,11 +938,34 @@ def __init__(
 
         NDFrame.__init__(self, mgr)
 
     # ----------------------------------------------------------------------
 
     def __dataframe__(
         self, nan_as_null: bool = False, allow_copy: bool = True
-    ) -> DataFrameXchg:
+    ):
         """
         Return the dataframe interchange object implementing the interchange protocol.
@@ -1241,8 +1300,8 @@ def to_string(
         header: bool | SequenceNotStr[str] = ...,
         index: bool = ...,
         na_rep: str = ...,
-        formatters: fmt.FormattersType | None = ...,
-        float_format: fmt.FloatFormatType | None = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
         sparsify: bool | None = ...,
         index_names: bool = ...,
         justify: str | None = ...,
@@ -1266,8 +1325,8 @@ def to_string(
         header: bool | SequenceNotStr[str] = ...,
         index: bool = ...,
         na_rep: str = ...,
-        formatters: fmt.FormattersType | None = ...,
-        float_format: fmt.FloatFormatType | None = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
         sparsify: bool | None = ...,
         index_names: bool = ...,
         justify: str | None = ...,
@@ -1301,8 +1360,8 @@ def to_string(
         header: bool | SequenceNotStr[str] = True,
         index: bool = True,
         na_rep: str = "NaN",
-        formatters: fmt.FormattersType | None = None,
-        float_format: fmt.FloatFormatType | None = None,
+        formatters: FormattersType | None = None,
+        float_format: FloatFormatType | None = None,
         sparsify: bool | None = None,
         index_names: bool = True,
         justify: str | None = None,
@@ -1377,7 +1436,7 @@ def _get_values_for_csv(
         decimal: str,
         na_rep: str,
         quoting,  # int csv.QUOTE_FOO from stdlib
-    ) -> DataFrame:
+    ):
         # helper used by to_csv
         mgr = self._mgr.get_values_for_csv(
             float_format=float_format,
@@ -1391,7 +1450,7 @@ def _get_values_for_csv(
     # ----------------------------------------------------------------------
 
     @property
-    def style(self) -> Styler:
+    def style(self):
         """
         Returns a Styler object.
 
@@ -1617,7 +1676,7 @@ def __len__(self) -> int:
     def dot(self, other: Series) -> Series: ...
 
     @overload
-    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: ...
+    def dot(self, other: "DataFrame | Index | ArrayLike") -> "DataFrame": ...
 
     def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
         """
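The quoted annotation in the `dot` overload above is needed because the patch drops `from __future__ import annotations`: without it, annotations are evaluated eagerly, and only string annotations stay lazy. Note that the *entire* union must be quoted — `"DataFrame" | Index` would apply `|` to a string at runtime and raise `TypeError`. A small sketch of the rule (the `Frame` class here is illustrative):

```python
class Frame:
    # OK: the whole annotation is a string, so nothing is evaluated at
    # definition time; an unquoted `-> Frame` would raise NameError here,
    # because the class name is not bound until the class body finishes.
    def copy(self) -> "Frame":
        return Frame()

    # Quoting only one member of a union is a trap: `"Frame" | int` is
    # evaluated eagerly and raises TypeError (str does not support `|`),
    # so the entire expression must be quoted instead.
    def widen(self) -> "Frame | None":
        return None


from typing import get_type_hints

print(get_type_hints(Frame.copy))  # {'return': <class '__main__.Frame'>}
```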
@@ -1997,15 +2056,13 @@ def to_dict(
         index: bool = ...,
     ) -> list[dict]: ...
 
-    # error: Incompatible default for argument "into" (default has type "type
-    # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT")
     def to_dict(
         self,
         orient: Literal[
             "dict", "list", "series", "split", "tight", "records", "index"
         ] = "dict",
         *,
-        into: type[MutableMappingT] | MutableMappingT = dict,  # type: ignore[assignment]
+        into: type[MutableMappingT] | MutableMappingT = dict,
         index: bool = True,
     ) -> MutableMappingT | list[MutableMappingT]:
         """
@@ -2283,13 +2340,7 @@ def maybe_reorder(
         if coerce_float:
             for i, arr in enumerate(arrays):
                 if arr.dtype == object:
-                    # error: Argument 1 to "maybe_convert_objects" has
-                    # incompatible type "Union[ExtensionArray, ndarray]";
-                    # expected "ndarray"
-                    arrays[i] = lib.maybe_convert_objects(
-                        arr,  # type: ignore[arg-type]
-                        try_float=True,
-                    )
+                    arrays[i] = lib.maybe_convert_objects(arr, try_float=True)
 
         arr_columns = ensure_index(arr_columns)
         if columns is None:
@@ -2484,9 +2535,7 @@ def to_records(
                 if dtype_mapping is None:
                     formats.append(v.dtype)
                 elif isinstance(dtype_mapping, (type, np.dtype, str)):
-                    # error: Argument 1 to "append" of "list" has incompatible
-                    # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
-                    formats.append(dtype_mapping)  # type: ignore[arg-type]
+                    formats.append(dtype_mapping)
                 else:
                     element = "row" if i < index_len else "column"
                     msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
@@ -2659,17 +2708,9 @@ def to_stata(
                 raise ValueError("strl is not supported in format 114")
             from pandas.io.stata import StataWriter as statawriter
         elif version == 117:
-            # Incompatible import of "statawriter" (imported name has type
-            # "Type[StataWriter117]", local name has type "Type[StataWriter]")
-            from pandas.io.stata import (  # type: ignore[assignment]
-                StataWriter117 as statawriter,
-            )
+            from pandas.io.stata import StataWriter117 as statawriter
         else:  # versions 118 and 119
-            # Incompatible import of "statawriter" (imported name has type
-            # "Type[StataWriter117]", local name has type "Type[StataWriter]")
-            from pandas.io.stata import (  # type: ignore[assignment]
-                StataWriterUTF8 as statawriter,
-            )
+            from pandas.io.stata import StataWriterUTF8 as statawriter
 
         kwargs: dict[str, Any] = {}
         if version is None or version >= 117:
@@ -2854,19 +2895,6 @@ def to_markdown(
             handles.handle.write(result)
         return None
 
-    @overload
-    def to_parquet(
-        self,
-        path: None = ...,
-        *,
-        engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
-        compression: str | None = ...,
-        index: bool | None = ...,
-        partition_cols: list[str] | None = ...,
-        storage_options: StorageOptions = ...,
-        **kwargs,
-    ) -> bytes: ...
-
     @overload
     def to_parquet(
         self,
@@ -4476,1131 +4504,9470 @@ def _get_item(self, item: Hashable) -> Series:
         loc = self.columns.get_loc(item)
         return self._ixs(loc, axis=1)
 
-    # ----------------------------------------------------------------------
-    # Unsorted
-
-    @overload
-    def query(
-        self,
-        expr: str,
-        *,
-        parser: Literal["pandas", "python"] = ...,
-        engine: Literal["python", "numexpr"] | None = ...,
-        local_dict: dict[str, Any] | None = ...,
-        global_dict: dict[str, Any] | None = ...,
-        resolvers: list[Mapping] | None = ...,
-        level: int = ...,
-        inplace: Literal[False] = ...,
-    ) -> DataFrame: ...
+"""
+DataFrame
+---------
+An efficient 2D container for potentially mixed-type time series or other
+labeled data series.
+
+Similar to its R counterpart, data.frame, except providing automatic data
+alignment and a host of useful data manipulation methods having to do with the
+labeling information
+"""
-
-    @overload
-    def query(
-        self,
-        expr: str,
-        *,
-        parser: Literal["pandas", "python"] = ...,
-        engine: Literal["python", "numexpr"] | None = ...,
-        local_dict: dict[str, Any] | None = ...,
-        global_dict: dict[str, Any] | None = ...,
-        resolvers: list[Mapping] | None = ...,
-        level: int = ...,
-        inplace: Literal[True],
-    ) -> None: ...
-
-    @overload
-    def query(
-        self,
-        expr: str,
-        *,
-        parser: Literal["pandas", "python"] = ...,
-        engine: Literal["python", "numexpr"] | None = ...,
-        local_dict: dict[str, Any] | None = ...,
-        global_dict: dict[str, Any] | None = ...,
-        resolvers: list[Mapping] | None = ...,
-        level: int = ...,
-        inplace: bool = ...,
-    ) -> DataFrame | None: ...
-
-    def query(
-        self,
-        expr: str,
-        *,
-        parser: Literal["pandas", "python"] = "pandas",
-        engine: Literal["python", "numexpr"] | None = None,
-        local_dict: dict[str, Any] | None = None,
-        global_dict: dict[str, Any] | None = None,
-        resolvers: list[Mapping] | None = None,
-        level: int = 0,
-        inplace: bool = False,
-    ) -> DataFrame | None:
-        """
-        Query the columns of a DataFrame with a boolean expression.
-
-        .. warning::
-
-            This method can run arbitrary code which can make you vulnerable to code
-            injection if you pass user input to this function.
-
-        Parameters
-        ----------
-        expr : str
-            The query string to evaluate.
-
-            See the documentation for :func:`eval` for details of
-            supported operations and functions in the query string.
-
-            See the documentation for :meth:`DataFrame.eval` for details on
-            referring to column names and variables in the query string.
-        parser : {'pandas', 'python'}, default 'pandas'
-            The parser to use to construct the syntax tree from the expression. The
-            default of ``'pandas'`` parses code slightly different than standard
-            Python. Alternatively, you can parse an expression using the
-            ``'python'`` parser to retain strict Python semantics. See the
-            :ref:`enhancing performance <enhancingperf.eval>` documentation for
-            more details.
-        engine : {'python', 'numexpr'}, default 'numexpr'
-
-            The engine used to evaluate the expression. Supported engines are
-
-            - None : tries to use ``numexpr``, falls back to ``python``
-            - ``'numexpr'`` : This default engine evaluates pandas objects using
-              numexpr for large speed ups in complex expressions with large frames.
-            - ``'python'`` : Performs operations as if you had ``eval``'d in top
-              level python. This engine is generally not that useful.
-
-            More backends may be available in the future.
-        local_dict : dict or None, optional
-            A dictionary of local variables, taken from locals() by default.
-        global_dict : dict or None, optional
-            A dictionary of global variables, taken from globals() by default.
+
+if TYPE_CHECKING:
+    import datetime
-        resolvers : list of dict-like or None, optional
-            A list of objects implementing the ``__getitem__`` special method that
-            you can use to inject an additional collection of namespaces to use for
-            variable lookup. For example, this is used in the
-            :meth:`~DataFrame.query` method to inject the
-            ``DataFrame.index`` and ``DataFrame.columns``
-            variables that refer to their respective :class:`~pandas.DataFrame`
-            instance attributes.
-        level : int, optional
-            The number of prior stack frames to traverse and add to the current
-            scope. Most users will **not** need to change this parameter.
-        inplace : bool
-            Whether to modify the DataFrame rather than creating a new one.
+
+    from pandas._libs.internals import BlockValuesRefs
+    from pandas._typing import (
+        AggFuncType,
+        AnyAll,
+        AnyArrayLike,
+        ArrayLike,
+        Axes,
+        Axis,
+        AxisInt,
+        ColspaceArgType,
+        CompressionOptions,
+        CorrelationMethod,
+        DropKeep,
+        Dtype,
+        DtypeObj,
+        FilePath,
+        FloatFormatType,
+        FormattersType,
+        Frequency,
+        FromDictOrient,
+        HashableT,
+        HashableT2,
+        IgnoreRaise,
+        IndexKeyFunc,
+        IndexLabel,
+        JoinValidate,
+        Level,
+        ListLike,
+        MergeHow,
+        MergeValidate,
+        MutableMappingT,
+        NaPosition,
+        NsmallestNlargestKeep,
+        PythonFuncType,
+        QuantileInterpolation,
+        ReadBuffer,
+        ReindexMethod,
+        Renamer,
+        Scalar,
+        Self,
+        SequenceNotStr,
+        SortKind,
+        StorageOptions,
+        Suffixes,
+        T,
+        ToStataByteorder,
+        ToTimestampHow,
+        UpdateJoin,
+        ValueKeyFunc,
+        WriteBuffer,
+        XMLParsers,
+        npt,
+    )
-
-        Returns
-        -------
-        DataFrame or None
-            DataFrame resulting from the provided query expression or
-            None if ``inplace=True``.
+
+    from pandas.core.groupby.generic import DataFrameGroupBy
+    from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
+    from pandas.core.internals.managers import SingleBlockManager
-
-        See Also
-        --------
-        eval : Evaluate a string describing operations on
-            DataFrame columns.
-        DataFrame.eval : Evaluate a string describing operations on
-            DataFrame columns.
+
+    from pandas.io.formats.style import Styler
-
-        Notes
-        -----
-        The result of the evaluation of this expression is first passed to
-        :attr:`DataFrame.loc` and if that fails because of a
-        multidimensional key (e.g., a DataFrame) then the result will be passed
-        to :meth:`DataFrame.__getitem__`.
+
+# ---------------------------------------------------------------------
+# Docstring templates
-
-        This method uses the top-level :func:`eval` function to
-        evaluate the passed query.
+
+_shared_doc_kwargs = {
+    "axes": "index, columns",
+    "klass": "DataFrame",
+    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
+    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
+        If 0 or 'index': apply function to each column.
+        If 1 or 'columns': apply function to each row.""",
+    "inplace": """
+    inplace : bool, default False
+        Whether to modify the DataFrame rather than creating a new one.""",
+    "optional_by": """
+by : str or list of str
+    Name or list of names to sort by.
-
-        The :meth:`~pandas.DataFrame.query` method uses a slightly
-        modified Python syntax by default. For example, the ``&`` and ``|``
-        (bitwise) operators have the precedence of their boolean cousins,
-        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
-        however the semantics are different.
+
+    - if `axis` is 0 or `'index'` then `by` may contain index
+      levels and/or column labels.
+    - if `axis` is 1 or `'columns'` then `by` may contain column
+      levels and/or index labels.""",
+    "optional_reindex": """
+labels : array-like, optional
+    New labels / index to conform the axis specified by 'axis' to.
+index : array-like, optional
+    New labels for the index. Preferably an Index object to avoid
+    duplicating data.
+columns : array-like, optional
+    New labels for the columns. Preferably an Index object to avoid
+    duplicating data.
+axis : int or str, optional
+    Axis to target. Can be either the axis name ('index', 'columns')
+    or number (0, 1).""",
+}
-
-        You can change the semantics of the expression by passing the keyword
-        argument ``parser='python'``. This enforces the same semantics as
-        evaluation in Python space. Likewise, you can pass ``engine='python'``
-        to evaluate an expression using Python itself as a backend. This is not
-        recommended as it is inefficient compared to using ``numexpr`` as the
-        engine.
+
+_merge_doc = """
+Merge DataFrame or named Series objects with a database-style join.
-
-        The :attr:`DataFrame.index` and
-        :attr:`DataFrame.columns` attributes of the
-        :class:`~pandas.DataFrame` instance are placed in the query namespace
-        by default, which allows you to treat both the index and columns of the
-        frame as a column in the frame.
-        The identifier ``index`` is used for the frame index; you can also
-        use the name of the index to identify it in a query. Please note that
-        Python keywords may not be used as identifiers.
+
+A named Series object is treated as a DataFrame with a single named column.
-
-        For further details and examples see the ``query`` documentation in
-        :ref:`indexing <indexing.query>`.
+
+The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
+on indexes or indexes on a column or columns, the index will be passed on.
+When performing a cross merge, no column specifications to merge on are
+allowed.
-
-        *Backtick quoted variables*
-
-        Backtick quoted variables are parsed as literal Python code and
-        are converted internally to a Python valid identifier.
-        This can lead to the following problems.
-
-        During parsing a number of disallowed characters inside the backtick
-        quoted string are replaced by strings that are allowed as a Python identifier.
-        These characters include all operators in Python, the space character, the
-        question mark, the exclamation mark, the dollar sign, and the euro sign.
-
-        A backtick can be escaped by double backticks.
-
-        See also the `Python documentation about lexical analysis
-        <https://docs.python.org/3/reference/lexical_analysis.html>`__
-        in combination with the source code in :mod:`pandas.core.computation.parsing`.
-
-        Examples
-        --------
-        >>> df = pd.DataFrame(
-        ...     {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)}
-        ... )
-        >>> df
-           A   B  C&C
-        0  1  10   10
-        1  2   8    9
-        2  3   6    8
-        3  4   4    7
-        4  5   2    6
-        >>> df.query("A > B")
-           A  B  C&C
-        4  5  2    6
+
+    You can already get the future behavior and improvements through
+    enabling copy on write ``pd.options.mode.copy_on_write = True``
+indicator : bool or str, default False
+    If True, adds a column to the output DataFrame called "_merge" with
+    information on the source of each row. The column can be given a different
+    name by providing a string argument. The column will have a Categorical
+    type with the value of "left_only" for observations whose merge key only
+    appears in the left DataFrame, "right_only" for observations
+    whose merge key only appears in the right DataFrame, and "both"
+    if the observation's merge key is found in both DataFrames.
-
-        The previous expression is equivalent to
-
-        >>> df[df.A > df.B]
-           A  B  C&C
-        4  5  2    6
-
-        For columns with spaces in their name, you can use backtick quoting.
-
-        >>> df.query("B == `C&C`")
-           A   B  C&C
-        0  1  10   10
-
-        The previous expression is equivalent to
-
-        >>> df[df.B == df["C&C"]]
-           A   B  C&C
-        0  1  10   10
+validate : str, optional
+    If specified, checks if merge is of specified type.
+
+    * "one_to_one" or "1:1": check if merge keys are unique in both
+      left and right datasets.
+    * "one_to_many" or "1:m": check if merge keys are unique in left
+      dataset.
+    * "many_to_one" or "m:1": check if merge keys are unique in right
+      dataset.
+    * "many_to_many" or "m:m": allowed, but does not result in checks.
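A hedged sketch of how the `validate` options listed above behave in practice (the data is made up for illustration):

```python
import pandas as pd

left = pd.DataFrame({"k": [1, 2, 2], "a": ["x", "y", "z"]})
right = pd.DataFrame({"k": [1, 2], "b": [10.0, 20.0]})

# "m:1" passes: the merge keys are unique on the right side.
print(left.merge(right, on="k", validate="many_to_one"))

# "1:1" fails: k == 2 is duplicated on the left side.
try:
    left.merge(right, on="k", validate="one_to_one")
except pd.errors.MergeError as exc:
    print("MergeError:", exc)
```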
-
-        Using local variable:
-
-        >>> local_var = 2
-        >>> df.query("A <= @local_var")
-           A  B  C&C
-        0  1  10   10
-        1  2   8    9
-        """
-        inplace = validate_bool_kwarg(inplace, "inplace")
-        if not isinstance(expr, str):
-            msg = f"expr must be a string to be evaluated, {type(expr)} given"
-            raise ValueError(msg)
-
-        res = self.eval(
-            expr,
-            level=level + 1,
-            parser=parser,
-            target=None,
-            engine=engine,
-            local_dict=local_dict,
-            global_dict=global_dict,
-            resolvers=resolvers or (),
-        )
-
-        try:
-            result = self.loc[res]
-        except ValueError:
-            # when res is multi-dimensional loc raises, but this is sometimes a
-            # valid query
-            result = self[res]
-
-        if inplace:
-            self._update_inplace(result)
-            return None
-        else:
-            return result
+
+See Also
+--------
+merge_ordered : Merge with optional filling/interpolation.
+merge_asof : Merge on nearest keys.
+DataFrame.join : Similar method using indices.
-
-    @overload
-    def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: ...
-
-    @overload
-    def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: ...
+
+Examples
+--------
+>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
+...                     'value': [1, 2, 3, 5]})
+>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
+...                     'value': [5, 6, 7, 8]})
+>>> df1
+  lkey  value
+0  foo      1
+1  bar      2
+2  baz      3
+3  foo      5
+>>> df2
+  rkey  value
+0  foo      5
+1  bar      6
+2  baz      7
+3  foo      8
-
-    def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
-        """
-        Evaluate a string describing operations on DataFrame columns.
+
+Merge df1 and df2 on the lkey and rkey columns. The value columns have
+the default suffixes, _x and _y, appended.
-
-        .. warning::
-
-            This method can run arbitrary code which can make you vulnerable to code
-            injection if you pass user input to this function.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey')
+  lkey  value_x rkey  value_y
+0  foo        1  foo        5
+1  foo        1  foo        8
+2  bar        2  bar        6
+3  baz        3  baz        7
+4  foo        5  foo        5
+5  foo        5  foo        8
-
-        Operates on columns only, not specific rows or elements. This allows
-        `eval` to run arbitrary code, which can make you vulnerable to code
-        injection if you pass user input to this function.
+
+Merge DataFrames df1 and df2 with specified left and right suffixes
+appended to any overlapping columns.
-
-        Parameters
-        ----------
-        expr : str
-            The expression string to evaluate.
-
-            You can refer to variables
-            in the environment by prefixing them with an '@' character like
-            ``@a + b``.
-
-            You can refer to column names that are not valid Python variable names
-            by surrounding them in backticks. Thus, column names containing spaces
-            or punctuation (besides underscores) or starting with digits must be
-            surrounded by backticks.
-            (For example, a column named "Area (cm^2)" would
-            be referenced as ```Area (cm^2)```). Column names which are Python
-            keywords (like "if", "for", "import", etc) cannot be used.
-
-            For example, if one of your columns is called ``a a`` and you want
-            to sum it with ``b``, your query should be ```a a` + b``.
-
-            See the documentation for :func:`eval` for full details of
-            supported operations and functions in the expression string.
-        inplace : bool, default False
-            If the expression contains an assignment, whether to perform the
-            operation inplace and mutate the existing DataFrame. Otherwise,
-            a new DataFrame is returned.
-        **kwargs
-            See the documentation for :func:`eval` for complete details
-            on the keyword arguments accepted by
-            :meth:`~pandas.DataFrame.eval`.
+
+>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
+Traceback (most recent call last):
+...
+ValueError: columns overlap but no suffix specified:
+    Index(['value'], dtype='object')
-
-        Returns
-        -------
-        ndarray, scalar, pandas object, or None
-            The result of the evaluation or None if ``inplace=True``.
+
+>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
+>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
+>>> df1
+     a  b
+0  foo  1
+1  bar  2
+>>> df2
+     a  c
+0  foo  3
+1  baz  4
-
-        See Also
-        --------
-        DataFrame.query : Evaluates a boolean expression to query the columns
-            of a frame.
-        DataFrame.assign : Can evaluate an expression or function to create new
-            values for a column.
-        eval : Evaluate a Python expression as a string using various
-            backends.
+
+>>> df1.merge(df2, how='inner', on='a')
+     a  b  c
+0  foo  1  3
-
-        Notes
-        -----
-        For more details see the API documentation for :func:`~eval`.
-        For detailed examples see :ref:`enhancing performance with eval
-        <enhancingperf.eval>`.
-
-        Examples
-        --------
-        >>> df = pd.DataFrame(
-        ...     {"A": range(1, 6), "B": range(10, 0, -2), "C&C": range(10, 5, -1)}
-        ... )
-        >>> df
-           A   B  C&C
-        0  1  10   10
-        1  2   8    9
-        2  3   6    8
-        3  4   4    7
-        4  5   2    6
-        >>> df.eval("A + B")
-        0    11
-        1    10
-        2     9
-        3     8
-        4     7
-        dtype: int64
-
-        Assignment is allowed though by default the original DataFrame is not
-        modified.
-
-        >>> df.eval("D = A + B")
-           A   B  C&C   D
-        0  1  10   10  11
-        1  2   8    9  10
-        2  3   6    8   9
-        3  4   4    7   8
-        4  5   2    6   7
-        >>> df
-           A   B  C&C
-        0  1  10   10
-        1  2   8    9
-        2  3   6    8
-        3  4   4    7
-        4  5   2    6
-
-        Multiple columns can be assigned to using multi-line expressions:
-
-        >>> df.eval(
-        ...     '''
-        ... D = A + B
-        ... E = A - B
-        ... '''
-        ... )
-           A   B  C&C   D  E
-        0  1  10   10  11 -9
-        1  2   8    9  10 -6
-        2  3   6    8   9 -3
-        3  4   4    7   8  0
-        4  5   2    6   7  3
-
-        For columns with spaces or other disallowed characters in their name, you can
-        use backtick quoting.
-
-        >>> df.eval("B * `C&C`")
-        0    100
-        1     72
-        2     48
-        3     28
-        4     12
-
-        Local variables shall be explicitly referenced using ``@``
-        character in front of the name:
-
-        >>> local_var = 2
-        >>> df.eval("@local_var * A")
-        0     2
-        1     4
-        2     6
-        3     8
-        4    10
-        """
-        from pandas.core.computation.eval import eval as _eval
+
+>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
+>>> df2 = pd.DataFrame({'right': [7, 8]})
+>>> df1
+  left
+0  foo
+1  bar
+>>> df2
+   right
+0      7
+1      8
+
+>>> df1.merge(df2, how='cross')
+  left  right
+0  foo      7
+1  foo      8
+2  bar      7
+3  bar      8
+"""
-
-        inplace = validate_bool_kwarg(inplace, "inplace")
-        kwargs["level"] = kwargs.pop("level", 0) + 1
-        index_resolvers = self._get_index_resolvers()
-        column_resolvers = self._get_cleaned_column_resolvers()
-        resolvers = column_resolvers, index_resolvers
-        if "target" not in kwargs:
-            kwargs["target"] = self
-        kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
-
-        return _eval(expr, inplace=inplace, **kwargs)
-
-    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
-        """
-        Return a subset of the DataFrame's columns based on the column dtypes.
-
-        This method allows for filtering columns based on their data types.
-        It is useful when working with heterogeneous DataFrames where operations
-        need to be performed on a specific subset of data types.
-
-        Parameters
-        ----------
-        include, exclude : scalar or list-like
-            A selection of dtypes or strings to be included/excluded. At least
-            one of these parameters must be supplied.
-
-        Returns
-        -------
-        DataFrame
-            The subset of the frame including the dtypes in ``include`` and
-            excluding the dtypes in ``exclude``.
-
-        Raises
-        ------
-        ValueError
-            * If both of ``include`` and ``exclude`` are empty
-            * If ``include`` and ``exclude`` have overlapping elements
-        TypeError
-            * If any kind of string dtype is passed in.
-
-        See Also
-        --------
-        DataFrame.dtypes: Return Series with the data type of each column.
-
-        Notes
-        -----
-        * To select all *numeric* types, use ``np.number`` or ``'number'``
-        * To select strings you must use the ``object`` dtype, but note that
-          this will return *all* object dtype columns. With
-          ``pd.options.future.infer_string`` enabled, using ``"str"`` will
-          work to select all string columns.
-        * See the `numpy dtype hierarchy
-          <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
-        * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
-          ``'datetime64'``
-        * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
-          ``'timedelta64'``
-        * To select Pandas categorical dtypes, use ``'category'``
-        * To select Pandas datetimetz dtypes, use ``'datetimetz'``
-          or ``'datetime64[ns, tz]'``
-
-        Examples
-        --------
-        >>> df = pd.DataFrame(
-        ...     {"a": [1, 2] * 3, "b": [True, False] * 3, "c": [1.0, 2.0] * 3}
-        ... )
-        >>> df
-           a      b    c
-        0  1   True  1.0
-        1  2  False  2.0
-        2  1   True  1.0
-        3  2  False  2.0
-        4  1   True  1.0
-        5  2  False  2.0
-
-        >>> df.select_dtypes(include="bool")
-               b
-        0   True
-        1  False
-        2   True
-        3  False
-        4   True
-        5  False
-
-        >>> df.select_dtypes(include=["float64"])
-             c
-        0  1.0
-        1  2.0
-        2  1.0
-        3  2.0
-        4  1.0
-        5  2.0
-
-        >>> df.select_dtypes(exclude=["int64"])
-               b    c
-        0   True  1.0
-        1  False  2.0
-        2   True  1.0
-        3  False  2.0
-        4   True  1.0
-        5  False  2.0
-        """
-        if not is_list_like(include):
-            include = (include,) if include is not None else ()
-        if not is_list_like(exclude):
-            exclude = (exclude,) if exclude is not None else ()
-
-        selection = (frozenset(include), frozenset(exclude))
+
+@set_module("pandas")
+class DataFrame(NDFrame, OpsMixin):
+    """
+    Two-dimensional, size-mutable, potentially heterogeneous tabular data.
+
+    Data structure also contains labeled axes (rows and columns).
+    Arithmetic operations align on both row and column labels. Can be
+    thought of as a dict-like container for Series objects. The primary
+    pandas data structure.
-
-        if not any(selection):
-            raise ValueError("at least one of include or exclude must be nonempty")
-
-        # convert the myriad valid dtypes object to a single representation
-        def check_int_infer_dtype(dtypes):
-            converted_dtypes: list[type] = []
-            for dtype in dtypes:
-                # Numpy maps int to different types (int32, int64) on Windows and Linux
-                # see https://github.com/numpy/numpy/issues/9464
-                if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
-                    converted_dtypes.append(np.int32)
-                    converted_dtypes.append(np.int64)
-                elif dtype == "float" or dtype is float:
-                    # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
-                    converted_dtypes.extend([np.float64, np.float32])
-                else:
-                    converted_dtypes.append(infer_dtype_from_object(dtype))
-            return frozenset(converted_dtypes)
-
-        include = check_int_infer_dtype(include)
-        exclude = check_int_infer_dtype(exclude)
-
-        for dtypes in (include, exclude):
-            invalidate_string_dtypes(dtypes)
-
-        # can't both include AND exclude!
-        if not include.isdisjoint(exclude):
-            raise ValueError(f"include and exclude overlap on {(include & exclude)}")
-
-        def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
-            # GH 46870: BooleanDtype._is_numeric == True but should be excluded
-            dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
-            return issubclass(dtype.type, tuple(dtypes_set)) or (
-                np.number in dtypes_set
-                and getattr(dtype, "_is_numeric", False)
-                and not is_bool_dtype(dtype)
-            )
-
-        def predicate(arr: ArrayLike) -> bool:
-            dtype = arr.dtype
-            if include:
-                if not dtype_predicate(dtype, include):
-                    return False
-
-            if exclude:
-                if dtype_predicate(dtype, exclude):
-                    return False
-
-            return True
-
-        mgr = self._mgr._get_data_subset(predicate).copy(deep=False)
-        return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self)
+
+    If data is a list of dicts, column order follows insertion-order.
+
+    .. versionchanged:: 1.3.0
-
-    def insert(
-        self,
-        loc: int,
-        column: Hashable,
-        value: object,
-        allow_duplicates: bool | lib.NoDefault = lib.no_default,
-    ) -> None:
-        """
-        Insert column into DataFrame at specified location.
-
-        Raises a ValueError if `column` is already contained in the DataFrame,
-        unless `allow_duplicates` is set to True.
-
-        Parameters
-        ----------
-        loc : int
-            Insertion index. Must verify 0 <= loc <= len(columns).
-        column : str, number, or hashable object
-            Label of the inserted column.
-        value : Scalar, Series, or array-like
-            Content of the inserted column.
-        allow_duplicates : bool, optional, default lib.no_default
-            Allow duplicate column labels to be created.
-
-        See Also
-        --------
-        Index.insert : Insert new item by index.
-
-        Examples
-        --------
-        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
-        >>> df
-           col1  col2
-        0     1     3
-        1     2     4
-        >>> df.insert(1, "newcol", [99, 99])
-        >>> df
-           col1  newcol  col2
-        0     1      99     3
-        1     2      99     4
-        >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
-        >>> df
-           col1  col1  newcol  col2
-        0   100     1      99     3
-        1   100     2      99     4
-
-        Notice that pandas uses index alignment in case of `value` from type `Series`:
-
-        >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
-        >>> df
-           col0  col1  col1  newcol  col2
-        0   NaN   100     1      99     3
-        1   5.0   100     2      99     4
-        """
-        if allow_duplicates is lib.no_default:
-            allow_duplicates = False
-        if allow_duplicates and not self.flags.allows_duplicate_labels:
-            raise ValueError(
-                "Cannot specify 'allow_duplicates=True' when "
-                "'self.flags.allows_duplicate_labels' is False."
-            )
-        if not allow_duplicates and column in self.columns:
-            # Should this be a different kind of error??
-            raise ValueError(f"cannot insert {column}, already exists")
-        if not is_integer(loc):
-            raise TypeError("loc must be int")
-        # convert non stdlib ints to satisfy typing checks
-        loc = int(loc)
-        if isinstance(value, DataFrame) and len(value.columns) > 1:
-            raise ValueError(
-                f"Expected a one-dimensional object, got a DataFrame with "
-                f"{len(value.columns)} columns instead."
-            )
-        elif isinstance(value, DataFrame):
-            value = value.iloc[:, 0]
-
-        value, refs = self._sanitize_column(value)
-        self._mgr.insert(loc, column, value, refs=refs)
+
+    Notes
+    -----
+    Please reference the :ref:`User Guide <dsintro>` for more information.
+
+    Examples
+    --------
+    Constructing DataFrame from a dictionary.
-
-    def assign(self, **kwargs) -> DataFrame:
-        r"""
-        Assign new columns to a DataFrame.
-
-        Returns a new object with all original columns in addition to new ones.
-        Existing columns that are re-assigned will be overwritten.
-
-        Parameters
-        ----------
-        **kwargs : callable or Series
-            The column names are keywords. If the values are
-            callable, they are computed on the DataFrame and
-            assigned to the new columns. The callable must not
-            change input DataFrame (though pandas doesn't check it).
-            If the values are not callable, (e.g. a Series, scalar, or array),
-            they are simply assigned.
+
+    >>> d = {"col1": [1, 2], "col2": [3, 4]}
+    >>> df = pd.DataFrame(data=d)
+    >>> df
+       col1  col2
+    0     1     3
+    1     2     4
-
-        Returns
-        -------
-        DataFrame
-            A new DataFrame with the new columns in addition to
-            all the existing columns.
+
+    Notice that the inferred dtype is int64.
+
+    >>> df.dtypes
+    col1    int64
+    col2    int64
+    dtype: object
-
-        See Also
-        --------
-        DataFrame.loc : Select a subset of a DataFrame by labels.
-        DataFrame.iloc : Select a subset of a DataFrame by positions.
+
+    To enforce a single dtype:
+
+    >>> df = pd.DataFrame(data=d, dtype=np.int8)
+    >>> df.dtypes
+    col1    int8
+    col2    int8
+    dtype: object
-
-        Notes
-        -----
-        Assigning multiple columns within the same ``assign`` is possible.
-        Later items in '\*\*kwargs' may refer to newly created or modified
-        columns in 'df'; items are computed and assigned into 'df' in order.
+
+    Constructing DataFrame from a dictionary including Series:
+
+    >>> d = {"col1": [0, 1, 2, 3], "col2": pd.Series([2, 3], index=[2, 3])}
+    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
+       col1  col2
+    0     0   NaN
+    1     1   NaN
+    2     2   2.0
+    3     3   3.0
-
-        Examples
-        --------
-        >>> df = pd.DataFrame({"temp_c": [17.0, 25.0]}, index=["Portland", "Berkeley"])
-        >>> df
-                  temp_c
-        Portland    17.0
-        Berkeley    25.0
+
+    Constructing DataFrame from numpy ndarray:
+
+    >>> df2 = pd.DataFrame(
+    ...     np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=["a", "b", "c"]
+    ... )
+    >>> df2
+       a  b  c
+    0  1  2  3
+    1  4  5  6
+    2  7  8  9
-
-        Where the value is a callable, evaluated on `df`:
-
-        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
-                  temp_c  temp_f
-        Portland    17.0    62.6
-        Berkeley    25.0    77.0
+
+    Constructing DataFrame from a numpy ndarray that has labeled columns:
+
+    >>> data = np.array(
+    ...     [(1, 2, 3), (4, 5, 6), (7, 8, 9)],
+    ...     dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")],
+    ... )
+    >>> df3 = pd.DataFrame(data, columns=["c", "a"])
+    >>> df3
+       c  a
+    0  3  1
+    1  6  4
+    2  9  7
-
-        Alternatively, the same behavior can be achieved by directly
-        referencing an existing Series or sequence:
-
-        >>> df.assign(temp_f=df["temp_c"] * 9 / 5 + 32)
-                  temp_c  temp_f
-        Portland    17.0    62.6
-        Berkeley    25.0    77.0
-
-        You can create multiple columns within the same assign where one
-        of the columns depends on another one defined within the same assign:
-
-        >>> df.assign(
-        ...     temp_f=lambda x: x["temp_c"] * 9 / 5 + 32,
-        ...     temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9,
-        ... )
-                  temp_c  temp_f  temp_k
-        Portland    17.0    62.6  290.15
-        Berkeley    25.0    77.0  298.15
-        """
-        data = self.copy(deep=False)
-
-        for k, v in kwargs.items():
-            data[k] = com.apply_if_callable(v, data)
-        return data
-
-    def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
-        """
-        Ensures new columns (which go into the BlockManager as new blocks) are
-        always copied (or a reference is being tracked to them under CoW)
-        and converted into an array.
+
+    Constructing DataFrame from dataclass:
+
+    >>> from dataclasses import make_dataclass
+    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
+    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
+       x  y
+    0  0  0
+    1  0  3
+    2  2  3
-
-        Parameters
-        ----------
-        value : scalar, Series, or array-like
-
-        Returns
-        -------
-        tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs
-        """
-        self._ensure_valid_index(value)
-
-        # Using a DataFrame would mean coercing values to one dtype
-        assert not isinstance(value, DataFrame)
-        if is_dict_like(value):
-            if not isinstance(value, Series):
-                value = Series(value)
-            return _reindex_for_setitem(value, self.index)
-
-        if is_list_like(value):
-            com.require_length_match(value, self.index)
-            return sanitize_array(value, self.index, copy=True, allow_2d=True), None
+
+    Constructing DataFrame from Series/DataFrame:
+
+    >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
+    >>> df = pd.DataFrame(data=ser, index=["a", "c"])
+    >>> df
+       0
+    a  1
+    c  3
+
+    >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
+    >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
+    >>> df2
+       x
+    a  1
+    c  3
+    """
+
+    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
+    _typ = "dataframe"
+    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
+    _accessors: set[str] = {"sparse"}
+    _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
+    _mgr: BlockManager
-
-    @property
-    def _series(self):
-        return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)}
+
+    # similar to __array_priority__, positions DataFrame before Series, Index,
+    # and ExtensionArray. Should NOT be overridden by subclasses.
+    __pandas_priority__ = 4000
+
+    @property
+    def _constructor(self) -> type[DataFrame]:
+        return DataFrame
+
+    def _constructor_from_mgr(self, mgr, axes) -> DataFrame:
+        if self._constructor is DataFrame:
+            # we are pandas.DataFrame (or a subclass that doesn't override
+            # _constructor)
+            return DataFrame._from_mgr(mgr, axes=axes)
+        else:
+            assert axes is mgr.axes
+            return self._constructor(mgr)
+
+    _constructor_sliced: Callable[..., Series] = Series
+
+    def _sliced_from_mgr(self, mgr, axes) -> Series:
+        return Series._from_mgr(mgr, axes)
+
+    def _constructor_sliced_from_mgr(self, mgr, axes) -> Series:
+        if self._constructor_sliced is Series:
+            ser = self._sliced_from_mgr(mgr, axes)
+            ser._name = None  # caller is responsible for setting real name
+            return ser
+        assert axes is mgr.axes
+        return self._constructor_sliced(mgr)
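The `_constructor`/`_constructor_from_mgr` pair added above is the hook that lets DataFrame subclasses survive operations. A minimal sketch using only the documented subclassing hooks (`TaggedFrame` is illustrative, not part of the patch):

```python
import pandas as pd


class TaggedFrame(pd.DataFrame):
    # attributes listed in _metadata are carried through __finalize__
    _metadata = ["tag"]

    @property
    def _constructor(self):
        # every operation that builds a new frame goes through this hook
        # (and through _constructor_from_mgr, as in the patch above)
        return TaggedFrame


df = TaggedFrame({"a": [1, 2, 3]})
df.tag = "demo"

sliced = df[df["a"] > 1]      # returns a TaggedFrame, not a plain DataFrame
print(type(sliced).__name__)  # TaggedFrame
print(getattr(sliced, "tag", None))
```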
-
-    # ----------------------------------------------------------------------
-    # Reindexing and alignment
-
-    def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame:
-        """
-        We are guaranteed non-Nones in the axes.
-        """
-
-        new_index, row_indexer = self.index.reindex(axes["index"])
-        new_columns, col_indexer = self.columns.reindex(axes["columns"])
-
-        if row_indexer is not None and col_indexer is not None:
-            # Fastpath. By doing two 'take's at once we avoid making an
-            # unnecessary copy.
-            # We only get here with `self._can_fast_transpose`, which (almost)
-            # ensures that self.values is cheap. It may be worth making this
-            # condition more specific.
-            indexer = row_indexer, col_indexer
-            new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
-            return self._constructor(
-                new_values, index=new_index, columns=new_columns, copy=False
-            )
-        else:
-            return self._reindex_with_indexers(
-                {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
-                fill_value=fill_value,
-            )
-
-    @Appender(
-        """
-        Examples
-        --------
-        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
-
-        Change the row labels.
-
-        >>> df.set_axis(['a', 'b', 'c'], axis='index')
-           A  B
-        a  1  4
-        b  2  5
-        c  3  6
-
-        Change the column labels.
-
-        >>> df.set_axis(['I', 'II'], axis='columns')
-           I  II
-        0  1   4
-        1  2   5
-        2  3   6
-        """
-    )
-    @Substitution(
-        klass=_shared_doc_kwargs["klass"],
-        axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
-        extended_summary_sub=" column or",
-        axis_description_sub=", and 1 identifies the columns",
-        see_also_sub=" or columns",
-    )
-    @Appender(NDFrame.set_axis.__doc__)
-    def set_axis(
-        self,
-        labels,
-        *,
-        axis: Axis = 0,
-        copy: bool | lib.NoDefault = lib.no_default,
-    ) -> DataFrame:
-        return super().set_axis(labels, axis=axis, copy=copy)
-
-    @doc(
-        NDFrame.reindex,
-        klass=_shared_doc_kwargs["klass"],
-        optional_reindex=_shared_doc_kwargs["optional_reindex"],
-    )
-    def reindex(
-        self,
-        labels=None,
-        *,
-        index=None,
-        columns=None,
-        axis: Axis | None = None,
-        method: ReindexMethod | None = None,
-        copy: bool | lib.NoDefault = lib.no_default,
-        level: Level | None = None,
-        fill_value: Scalar | None = np.nan,
-        limit: int | None = None,
-        tolerance=None,
-    ) -> DataFrame:
-        return super().reindex(
-            labels=labels,
-            index=index,
-            columns=columns,
-            axis=axis,
-            method=method,
-            level=level,
-            fill_value=fill_value,
-            limit=limit,
-            tolerance=tolerance,
-            copy=copy,
-        )
+
+    def __init__(
+        self,
+        data=None,
+        index=None,
+        columns=None,
+        dtype=None,
+        copy=None,
+    ) -> None:
+        allow_mgr = False
+        if dtype is not None:
+            dtype = self._validate_dtype(dtype)
-
-    @overload
-    def drop(
-        self,
-        labels: IndexLabel | ListLike = ...,
-        *,
-        axis: Axis = ...,
-        index: IndexLabel | ListLike = ...,
-        columns: IndexLabel | ListLike = ...,
-        level: Level = ...,
-        inplace: Literal[True],
-        errors: IgnoreRaise = ...,
-    ) -> None: ...
-
-    @overload
-    def drop(
-        self,
-        labels: IndexLabel | ListLike = ...,
-        *,
-        axis: Axis = ...,
-        index: IndexLabel | ListLike = ...,
-        columns: IndexLabel | ListLike = ...,
-        level: Level = ...,
-        inplace: Literal[False] = ...,
-        errors: IgnoreRaise = ...,
-    ) -> DataFrame: ...
+ idx_diff = result_index.difference(correl.index) - @overload - def drop( - self, - labels: IndexLabel | ListLike = ..., - *, - axis: Axis = ..., - index: IndexLabel | ListLike = ..., - columns: IndexLabel | ListLike = ..., - level: Level = ..., - inplace: bool = ..., - errors: IgnoreRaise = ..., - ) -> DataFrame | None: ... + if len(idx_diff) > 0: + correl = correl._append( + Series([np.nan] * len(idx_diff), index=idx_diff) + ) - def drop( - self, - labels: IndexLabel | ListLike = None, - *, - axis: Axis = 0, - index: IndexLabel | ListLike = None, - columns: IndexLabel | ListLike = None, - level: Level | None = None, - inplace: bool = False, - errors: IgnoreRaise = "raise", - ) -> DataFrame | None: - """ - Drop specified labels from rows or columns. + return correl - Remove rows or columns by specifying label names and corresponding - axis, or by directly specifying index or column names. When using a - multi-index, labels on different levels can be removed by specifying - the level. See the :ref:`user guide ` - for more information about the now unused levels. + # ---------------------------------------------------------------------- + # ndarray-like stats methods - Parameters - ---------- - labels : single label or iterable of labels - Index or column labels to drop. A tuple will be used as a single - label and not treated as an iterable. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Whether to drop labels from the index (0 or 'index') or - columns (1 or 'columns'). - index : single label or iterable of labels - Alternative to specifying axis (``labels, axis=0`` - is equivalent to ``index=labels``). - columns : single label or iterable of labels - Alternative to specifying axis (``labels, axis=1`` - is equivalent to ``columns=labels``). - level : int or level name, optional - For MultiIndex, level from which the labels will be removed. - inplace : bool, default False - If False, return a copy. Otherwise, do operation - in place and return None. - errors : {'ignore', 'raise'}, default 'raise' - If 'ignore', suppress error and only existing labels are - dropped. + # ---------------------------------------------------------------------- + # Add index and columns + _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"] + _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { + **NDFrame._AXIS_TO_AXIS_NUMBER, + 1: 1, + "columns": 1, + } + _AXIS_LEN = len(_AXIS_ORDERS) + _info_axis_number: Literal[1] = 1 + _info_axis_name: Literal["columns"] = "columns" - Returns - ------- - DataFrame or None - Returns DataFrame or None DataFrame with the specified - index or column labels removed or None if inplace=True. + if isinstance(data, BlockManager): + if not allow_mgr: + # GH#52419 + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix + ) - Raises - ------ - KeyError - If any of the labels is not found in the selected axis. + data = data.copy(deep=False) + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) + if index is None and columns is None and dtype is None and not copy: + # GH#33357 fastpath + NDFrame.__init__(self, data) + return - See Also - -------- - DataFrame.loc : Label-location based indexer for selection by label. 
-        DataFrame.dropna : Return DataFrame with labels on given axis omitted
-            where (all or any) data are missing.
-        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
-            removed, optionally only considering certain columns.
-        Series.drop : Return Series with specified index labels removed.
-
-        Examples
-        --------
-        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"])
+
+        is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
+        data_dtype = getattr(data, "dtype", None)
+        original_dtype = dtype
+
+        # GH47215
+        if isinstance(index, set):
+            raise ValueError("index cannot be a set")
+        if isinstance(columns, set):
+            raise ValueError("columns cannot be a set")
+
+        if copy is None:
+            if isinstance(data, dict):
+                # retain pre-GH#38939 default behavior
+                copy = True
+            elif not isinstance(data, (Index, DataFrame, Series)):
+                copy = True
+            else:
+                copy = False
+
+        if data is None:
+            index = index if index is not None else default_index(0)
+            columns = columns if columns is not None else default_index(0)
+            dtype = dtype if dtype is not None else pandas_dtype(object)
+            data = []
+
+        if isinstance(data, BlockManager):
+            mgr = self._init_mgr(
+                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
+            )
+
+        elif isinstance(data, dict):
+            # GH#38939 de facto copy defaults to False only in non-dict cases
+            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy)
+        elif isinstance(data, ma.MaskedArray):
+            from numpy.ma import mrecords
+
+            # masked recarray
+            if isinstance(data, mrecords.MaskedRecords):
+                raise TypeError(
+                    "MaskedRecords are not supported. Pass "
+                    "{name: data[name] for name in data.dtype.names} "
+                    "instead"
+                )
+
+            # a masked array
+            data = sanitize_masked_array(data)
+            mgr = ndarray_to_mgr(
+                data,
+                index,
+                columns,
+                dtype=dtype,
+                copy=copy,
+            )
+
+        elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
+            if data.dtype.names:
+                # i.e. numpy structured array
+                data = cast(np.ndarray, data)
+                mgr = rec_array_to_mgr(
+                    data,
+                    index,
+                    columns,
+                    dtype,
+                    copy,
+                )
+            elif getattr(data, "name", None) is not None:
+                # i.e. Series/Index with non-None name
+                mgr = dict_to_mgr(
+                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
+                    # attribute "name"
+                    {data.name: data},  # type: ignore[union-attr]
+                    index,
+                    columns,
+                    dtype=dtype,
+                    copy=copy,
+                )
+            else:
+                mgr = ndarray_to_mgr(
+                    data,
+                    index,
+                    columns,
+                    dtype=dtype,
+                    copy=copy,
+                )
+
+        # For data is list-like, or Iterable (will consume into list)
+        elif is_list_like(data):
+            if not isinstance(data, abc.Sequence):
+                if hasattr(data, "__array__"):
+                    # GH#44616 big perf improvement for e.g. pytorch tensor
+                    data = np.asarray(data)
pytorch tensor + data = np.asarray(data) + else: + data = list(data) + if len(data) > 0: + if is_dataclass(data[0]): + data = dataclasses_to_dicts(data) + if not isinstance(data, np.ndarray) and treat_as_nested(data): + # exclude ndarray as we may have cast it a few lines above + if columns is not None: + columns = ensure_index(columns) + arrays, columns, index = nested_data_to_arrays( + # error: Argument 3 to "nested_data_to_arrays" has incompatible + # type "Optional[Collection[Any]]"; expected "Optional[Index]" + data, + columns, + index, # type: ignore[arg-type] + dtype, + ) + mgr = arrays_to_mgr( + arrays, + columns, + index, + dtype=dtype, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + ) + else: + mgr = dict_to_mgr( + {}, + index, + columns if columns is not None else default_index(0), + dtype=dtype, + ) + # For data is scalar + else: + if index is None or columns is None: + raise ValueError("DataFrame constructor not properly called!") + + index = ensure_index(index) + columns = ensure_index(columns) + + if not dtype: + dtype, _ = infer_dtype_from_scalar(data) + + # For data is a scalar extension dtype + if isinstance(dtype, ExtensionDtype): + # TODO(EA2D): special case not needed with 2D EAs + + values = [ + construct_1d_arraylike_from_scalar(data, len(index), dtype) + for _ in range(len(columns)) + ] + mgr = arrays_to_mgr(values, columns, index, dtype=None) + else: + arr2d = construct_2d_arraylike_from_scalar( + data, + len(index), + len(columns), + dtype, + copy, + ) + + mgr = ndarray_to_mgr( + arr2d, + index, + columns, + dtype=arr2d.dtype, + copy=False, + ) + + NDFrame.__init__(self, mgr) + + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtypes.iloc[0] != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The DataFrame " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + + @classmethod + def _from_data(cls, data, index=None, columns=None, dtype=None, copy=False): + """ + Internal constructor that bypasses the deprecated BlockManager warning. + + This method is intended for internal use only. + + Parameters + ---------- + data : BlockManager, array-like, Iterable, dict, or DataFrame + Data to be stored in DataFrame. + index : Index or array-like, optional + Index for the DataFrame. + columns : Index or array-like, optional + Column labels for the DataFrame. + dtype : dtype, optional + Data type for the DataFrame. + copy : bool, default False + Whether to copy the data. + + Returns + ------- + DataFrame + """ + if isinstance(data, BlockManager): + # Skip the deprecation warning for internal use + obj = cls.__new__(cls) + NDFrame.__init__(obj, data) + return obj + + # Use the regular constructor for other data types + return cls(data=data, index=index, columns=columns, dtype=dtype, copy=copy) + + @classmethod + def _create_with_data(cls, data, index=None, columns=None, dtype=None, copy=False): + """ + Create a DataFrame from data for subclasses. + + This is a recommended helper method for subclasses to create new instances + from data without triggering deprecation warnings. + + Parameters + ---------- + data : array-like, Iterable, dict, or DataFrame + Data to be stored in DataFrame. + index : Index or array-like, optional + Index for the DataFrame. 
+ columns : Index or array-like, optional + Column labels for the DataFrame. + dtype : dtype, optional + Data type for the DataFrame. + copy : bool, default False + Whether to copy the data. + + Returns + ------- + DataFrame or subclass + DataFrame of the subclass type. + + Notes + ----- + This method is primarily intended for subclass authors to create + instances of their subclass from array-like data. + """ + # Prepare the data safely avoiding internal manager passing + if isinstance(data, DataFrame): + if index is None: + index = data.index + if columns is None: + columns = data.columns + + # Create a dataframe using public APIs + df = cls(data, index=index, columns=columns, dtype=dtype, copy=copy) + return df + + # ---------------------------------------------------------------------- + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> DataFrameXchg: + """ + Return the dataframe interchange object implementing the interchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + `nan_as_null` is DEPRECATED and has no effect. Please avoid using + it; it will be removed in a future release. + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> interchange_object = df_not_necessarily_pandas.__dataframe__() + >>> interchange_object.column_names() + Index(['A', 'B'], dtype='object') + >>> df_pandas = pd.api.interchange.from_dataframe( + ... interchange_object.select_columns_by_name(["A"]) + ... ) + >>> df_pandas + A + 0 1 + 1 2 + + These methods (``column_names``, ``select_columns_by_name``) should work + for any dataframe library which implements the interchange protocol. + """ + + from pandas.core.interchange.dataframe import PandasDataFrameXchg + + idx_diff = result_index.difference(correl.index) + + if len(idx_diff) > 0: + correl = correl._append( + Series([np.nan] * len(idx_diff), index=idx_diff) + ) + + return correl + + # ---------------------------------------------------------------------- + # ndarray-like stats methods + + # ---------------------------------------------------------------------- + # Add index and columns + _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"] + _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { + **NDFrame._AXIS_TO_AXIS_NUMBER, + 1: 1, + "columns": 1, + } + _AXIS_LEN = len(_AXIS_ORDERS) + _info_axis_number: Literal[1] = 1 + _info_axis_name: Literal["columns"] = "columns" + + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas DataFrame as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas DataFrame to the Arrow + format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` + in its handling of the index, i.e. store the index as a column except + for RangeIndex). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. 
+ + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if requested_schema is not None: + requested_schema = pa.Schema._import_from_c_capsule(requested_schema) + table = pa.Table.from_pandas(self, schema=requested_schema) + return table.__arrow_c_stream__() + + # ---------------------------------------------------------------------- + + @property + def axes(self) -> list[Index]: + """ + Return a list representing the axes of the DataFrame. + + It has the row axis labels and column axis labels as the only members. + They are returned in that order. + + Examples + -------- + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df.axes + [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], + dtype='object')] + """ + return [self.index, self.columns] + + @property + def shape(self) -> tuple[int, int]: + """ + Return a tuple representing the dimensionality of the DataFrame. + + See Also + -------- + ndarray.shape : Tuple of array dimensions. + + Examples + -------- + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df.shape + (2, 2) + + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]}) + >>> df.shape + (2, 3) + """ + return len(self.index), len(self.columns) + + @property + def _is_homogeneous_type(self) -> bool: + """ + Whether all the columns in a DataFrame have the same type. + + Returns + ------- + bool + + Examples + -------- + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type + True + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type + False + + Items with the same type but different sizes are considered + different types. + + >>> DataFrame( + ... { + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64), + ... } + ... )._is_homogeneous_type + False + """ + # The "<" part of "<=" here is for empty DataFrame cases + return len({arr.dtype for arr in self._mgr.arrays}) <= 1 + + @property + def _can_fast_transpose(self) -> bool: + """ + Can we transpose this DataFrame without creating any new array objects. + """ + blocks = self._mgr.blocks + if len(blocks) != 1: + return False + + dtype = blocks[0].dtype + # TODO(EA2D) special case would be unnecessary with 2D EAs + return not is_1d_only_ea_dtype(dtype) + + @property + def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: + """ + Analogue to ._values that may return a 2D ExtensionArray. + """ + mgr = self._mgr + + blocks = mgr.blocks + if len(blocks) != 1: + return ensure_wrapped_if_datetimelike(self.values) + + arr = blocks[0].values + if arr.ndim == 1: + # non-2D ExtensionArray + return self.values + + # more generally, whatever we allow in NDArrayBackedExtensionBlock + arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr) + return arr.T + + # ---------------------------------------------------------------------- + # Rendering Methods + + def _repr_fits_vertical_(self) -> bool: + """ + Check length against max_rows. + """ + max_rows = get_option("display.max_rows") + return len(self) <= max_rows + + def _repr_fits_horizontal_(self) -> bool: + """ + Check if full repr fits in horizontal boundaries imposed by the display + options width and max_columns. 
+ """ + width, height = console.get_console_size() + max_columns = get_option("display.max_columns") + nb_columns = len(self.columns) + + # exceed max columns + if (max_columns and nb_columns > max_columns) or ( + width and nb_columns > (width // 2) + ): + return False + + # used by repr_html under IPython notebook or scripts ignore terminal + # dims + if width is None or not console.in_interactive_session(): + return True + + if get_option("display.width") is not None or console.in_ipython_frontend(): + # check at least the column row for excessive width + max_rows = 1 + else: + max_rows = get_option("display.max_rows") + + # when auto-detecting, so width=None and not in ipython front end + # check whether repr fits horizontal by actually checking + # the width of the rendered repr + buf = StringIO() + + # only care about the stuff we'll actually print out + # and to_string on entire frame may be expensive + d = self + + if max_rows is not None: # unlimited rows + # min of two, where one may be None + d = d.iloc[: min(max_rows, len(d))] + else: + return True + + d.to_string(buf=buf) + value = buf.getvalue() + repr_width = max(len(line) for line in value.split("\n")) + + return repr_width < width + + def _info_repr(self) -> bool: + """ + True if the repr should show the info view. + """ + info_repr_option = get_option("display.large_repr") == "info" + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) + + def __repr__(self) -> str: + """ + Return a string representation for a particular DataFrame. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + return buf.getvalue() + + repr_params = fmt.get_dataframe_repr_params() + return self.to_string(**repr_params) + + def _repr_html_(self) -> str | None: + """ + Return a html representation for a particular DataFrame. + + Mainly for IPython notebook. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + # need to escape the , should be the first line. + val = buf.getvalue().replace("<", r"<", 1) + val = val.replace(">", r">", 1) + return f"
{val}
" + + if get_option("display.notebook_repr_html"): + max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") + max_cols = get_option("display.max_columns") + show_dimensions = get_option("display.show_dimensions") + + formatter = fmt.DataFrameFormatter( + self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=".", + ) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) + else: + return None + + @overload + def to_string( + self, + buf: None = ..., + *, + columns: Axes | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> str: ... +""" +DataFrame +--------- +An efficient 2D container for potentially mixed-type time series or other +labeled data series. + +Similar to its R counterpart, data.frame, except providing automatic data +alignment and a host of useful data manipulation methods having to do with the +labeling information +""" + +import collections +from collections import abc +from collections.abc import ( + Hashable, + Iterable, + Iterator, + Mapping, + Sequence, +) +import functools +from inspect import signature +from io import StringIO +import itertools +import operator +import sys +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, + overload, +) +import warnings + +import numpy as np +from numpy import ma + +from pandas._config import get_option + +from pandas._libs import ( + algos as libalgos, + lib, + properties, +) +from pandas._libs.hashtable import duplicated +from pandas._libs.lib import is_range_indexer +from pandas.compat import PYPY +from pandas.compat._constants import REF_COUNT +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.errors import ( + ChainedAssignmentError, + InvalidIndexError, +) +from pandas.errors.cow import ( + _chained_assignment_method_msg, + _chained_assignment_msg, +) +from pandas.util._decorators import ( + Appender, + Substitution, + doc, + set_module, +) +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) +from pandas.util._validators import ( + validate_ascending, + validate_bool_kwarg, + validate_percentile, +) + +from pandas.core.dtypes.cast import ( + LossySetitemError, + can_hold_element, + construct_1d_arraylike_from_scalar, + construct_2d_arraylike_from_scalar, + find_common_type, + infer_dtype_from_scalar, + invalidate_string_dtypes, + maybe_downcast_to_dtype, +) +from pandas.core.dtypes.common import ( + infer_dtype_from_object, + is_1d_only_ea_dtype, + is_array_like, + is_bool_dtype, + is_dataclass, + is_dict_like, + is_float, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_scalar, + 
is_sequence, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + BaseMaskedDtype, + ExtensionDtype, +) +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas.core import ( + algorithms, + common as com, + nanops, + ops, + roperator, +) +from pandas.core.accessor import CachedAccessor +from pandas.core.apply import reconstruct_and_relabel_result +from pandas.core.array_algos.take import take_2d_multi +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays import ( + BaseMaskedArray, + DatetimeArray, + ExtensionArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + sanitize_array, + sanitize_masked_array, +) +from pandas.core.generic import ( + NDFrame, + make_doc, +) +from pandas.core.indexers import check_key_length +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + PeriodIndex, + default_index, + ensure_index, + ensure_index_from_sequences, +) +from pandas.core.indexes.multi import ( + MultiIndex, + maybe_droplevels, +) +from pandas.core.indexing import ( + check_bool_indexer, + check_dict_or_set_indexers, +) +from pandas.core.internals import BlockManager +from pandas.core.internals.construction import ( + arrays_to_mgr, + dataclasses_to_dicts, + dict_to_mgr, + ndarray_to_mgr, + nested_data_to_arrays, + rec_array_to_mgr, + reorder_arrays, + to_arrays, + treat_as_nested, +) +from pandas.core.methods import selectn +from pandas.core.reshape.melt import melt +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import ( + get_group_index, + lexsort_indexer, + nargsort, +) + +from pandas.io.common import get_handle +from pandas.io.formats import ( + console, + format as fmt, +) +from pandas.io.formats.info import ( + INFO_DOCSTRING, + DataFrameInfo, + frame_sub_kwargs, +) +import pandas.plotting + +if TYPE_CHECKING: + import datetime + + from pandas._libs.internals import BlockValuesRefs + from pandas._typing import ( + AggFuncType, + AnyAll, + AnyArrayLike, + ArrayLike, + Axes, + Axis, + AxisInt, + ColspaceArgType, + CompressionOptions, + CorrelationMethod, + DropKeep, + Dtype, + DtypeObj, + FilePath, + FloatFormatType, + FormattersType, + Frequency, + FromDictOrient, + HashableT, + HashableT2, + IgnoreRaise, + IndexKeyFunc, + IndexLabel, + JoinValidate, + Level, + ListLike, + MergeHow, + MergeValidate, + MutableMappingT, + NaPosition, + NsmallestNlargestKeep, + PythonFuncType, + QuantileInterpolation, + ReadBuffer, + ReindexMethod, + Renamer, + Scalar, + Self, + SequenceNotStr, + SortKind, + StorageOptions, + Suffixes, + T, + ToStataByteorder, + ToTimestampHow, + UpdateJoin, + ValueKeyFunc, + WriteBuffer, + XMLParsers, + npt, + ) + + from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg + from pandas.core.internals.managers import SingleBlockManager + + from pandas.io.formats.style import Styler + +# --------------------------------------------------------------------- +# Docstring templates + +_shared_doc_kwargs = { + "axes": "index, columns", + "klass": "DataFrame", + "axes_single_arg": "{0 or 'index', 1 or 'columns'}", + "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index': apply function to each column. 
+ If 1 or 'columns': apply function to each row.""", + "inplace": """ + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one.""", + "optional_by": """ +by : str or list of str + Name or list of names to sort by. + + - if `axis` is 0 or `'index'` then `by` may contain index + levels and/or column labels. + - if `axis` is 1 or `'columns'` then `by` may contain column + levels and/or index labels.""", + "optional_reindex": """ +labels : array-like, optional + New labels / index to conform the axis specified by 'axis' to. +index : array-like, optional + New labels for the index. Preferably an Index object to avoid + duplicating data. +columns : array-like, optional + New labels for the columns. Preferably an Index object to avoid + duplicating data. +axis : int or str, optional + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1).""", +} + +_merge_doc = """ +Merge DataFrame or named Series objects with a database-style join. + +A named Series object is treated as a DataFrame with a single named column. + +The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes +on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. + +.. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + +Parameters +----------%s +right : DataFrame or named Series + Object to merge with. +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. +on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. +left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. +right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. +left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. +right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. +sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. 
If False, + the order of the join keys depends on the join type (how keyword). +suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. +copy : bool, default True + If False, avoid copy if possible. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` +indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + +validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + +Returns +------- +DataFrame + A DataFrame of the two merged objects. + +See Also +-------- +merge_ordered : Merge with optional filling/interpolation. +merge_asof : Merge on nearest keys. +DataFrame.join : Similar method using indices. + +Examples +-------- +>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [1, 2, 3, 5]}) +>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [5, 6, 7, 8]}) +>>> df1 + lkey value +0 foo 1 +1 bar 2 +2 baz 3 +3 foo 5 +>>> df2 + rkey value +0 foo 5 +1 bar 6 +2 baz 7 +3 foo 8 + +Merge df1 and df2 on the lkey and rkey columns. The value columns have +the default suffixes, _x and _y, appended. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey') + lkey value_x rkey value_y +0 foo 1 foo 5 +1 foo 1 foo 8 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 + +Merge DataFrames df1 and df2 with specified left and right suffixes +appended to any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', +... suffixes=('_left', '_right')) + lkey value_left rkey value_right +0 foo 1 foo 5 +1 foo 1 foo 8 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 + +Merge DataFrames df1 and df2, but raise an exception if the DataFrames have +any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) +Traceback (most recent call last): +... 
+ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 NaN + +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 +""" + + +# ----------------------------------------------------------------------- +# DataFrame class + + +@set_module("pandas") +class DataFrame(NDFrame, OpsMixin): + """ + Two-dimensional, size-mutable, potentially heterogeneous tabular data. + + Data structure also contains labeled axes (rows and columns). + Arithmetic operations align on both row and column labels. Can be + thought of as a dict-like container for Series objects. The primary + pandas data structure. + + Parameters + ---------- + data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame + Dict can contain Series, arrays, constants, dataclass or list-like objects. If + data is a dict, column order follows insertion-order. If a dict contains Series + which have an index defined, it is aligned by its index. This alignment also + occurs if data is a Series or a DataFrame itself. Alignment is done on + Series/DataFrame inputs. + + If data is a list of dicts, column order follows insertion-order. + + index : Index or array-like + Index to use for resulting frame. Will default to RangeIndex if + no indexing information part of input data and no index provided. + columns : Index or array-like + Column labels to use for resulting frame when data does not have them, + defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels, + will perform column selection instead. + dtype : dtype, default None + Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + For dict data, the default of None behaves like ``copy=True``. For DataFrame + or 2d ndarray input, the default of None behaves like ``copy=False``. + If data is a dict containing one or more Series (possibly of different dtypes), + ``copy=False`` will ensure that these inputs are not copied. + + .. versionchanged:: 1.3.0 + + See Also + -------- + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_table : Read general delimited file into DataFrame. + read_clipboard : Read text from clipboard into DataFrame. + + Notes + ----- + Please reference the :ref:`User Guide ` for more information. + + Examples + -------- + Constructing DataFrame from a dictionary. + + >>> d = {"col1": [1, 2], "col2": [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + Notice that the inferred dtype is int64. 
+ + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + To enforce a single dtype: + + >>> df = pd.DataFrame(data=d, dtype=np.int8) + >>> df.dtypes + col1 int8 + col2 int8 + dtype: object + + Constructing DataFrame from a dictionary including Series: + + >>> d = {"col1": [0, 1, 2, 3], "col2": pd.Series([2, 3], index=[2, 3])} + >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) + col1 col2 + 0 0 NaN + 1 1 NaN + 2 2 2.0 + 3 3 3.0 + + Constructing DataFrame from numpy ndarray: + + >>> df2 = pd.DataFrame( + ... np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=["a", "b", "c"] + ... ) + >>> df2 + a b c + 0 1 2 3 + 1 4 5 6 + 2 7 8 9 + + Constructing DataFrame from a numpy ndarray that has labeled columns: + + >>> data = np.array( + ... [(1, 2, 3), (4, 5, 6), (7, 8, 9)], + ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")], + ... ) + >>> df3 = pd.DataFrame(data, columns=["c", "a"]) + >>> df3 + c a + 0 3 1 + 1 6 4 + 2 9 7 + + Constructing DataFrame from dataclass: + + >>> from dataclasses import make_dataclass + >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) + >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) + x y + 0 0 0 + 1 0 3 + 2 2 3 + + Constructing DataFrame from Series/DataFrame: + + >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"]) + >>> df = pd.DataFrame(data=ser, index=["a", "c"]) + >>> df + 0 + a 1 + c 3 + + >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"]) + >>> df2 = pd.DataFrame(data=df1, index=["a", "c"]) + >>> df2 + x + a 1 + c 3 + """ + + _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set + _typ = "dataframe" + _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) + _accessors: set[str] = {"sparse"} + _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) + _mgr: BlockManager + + # similar to __array_priority__, positions DataFrame before Series, Index, + # and ExtensionArray. Should NOT be overridden by subclasses. + __pandas_priority__ = 4000 + + @property + def _constructor(self): + return type(self) + + @classmethod + def _constructor_from_mgr(cls, mgr, axes): + """ + Create a new DataFrame from a BlockManager directly without triggering DeprecationWarning. + + Parameters + ---------- + mgr : BlockManager + The BlockManager to use for the new DataFrame. + axes : list, optional + List containing the index and columns. + + Returns + ------- + DataFrame + New DataFrame with the given BlockManager. + """ + if axes is not None: + index, columns = axes + else: + index, columns = mgr.axes + + return cls._from_mgr(mgr, [index, columns]) + + @classmethod + def _from_mgr(cls, mgr, axes): + """ + Internal factory method to create a DataFrame from a BlockManager. + + This method is used internally to avoid triggering the DeprecationWarning + that is raised when a BlockManager is passed to the DataFrame constructor. + + Parameters + ---------- + mgr : BlockManager + Block manager for the DataFrame. + axes : list + List containing the index and columns. + + Returns + ------- + DataFrame + DataFrame constructed from the block manager. 
+ """ + obj = cls.__new__(cls) + from pandas.core.generic import NDFrame + NDFrame.__init__(obj, mgr) + return obj + + _constructor_sliced: Callable[..., Series] = Series + + def _sliced_from_mgr(self, mgr, axes) -> Series: + return Series._from_mgr(mgr, axes) + + def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: + if self._constructor_sliced is Series: + ser = self._sliced_from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name + return ser + assert axes is mgr.axes + return self._constructor_sliced(mgr) + + # ---------------------------------------------------------------------- + # Constructors + + def __init__( + self, + data=None, + index: Axes | None = None, + columns: Axes | None = None, + dtype: Dtype | None = None, + copy: bool | None = None, + ) -> None: + allow_mgr = False + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if isinstance(data, DataFrame): + data = data._mgr + allow_mgr = True + if not copy: + # if not copying data, ensure to still return a shallow copy + # to avoid the result sharing the same Manager + data = data.copy(deep=False) + + if isinstance(data, BlockManager): + if not allow_mgr: + # GH#52419 + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. " + "Use public APIs instead.", + DeprecationWarning, + stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix + ) + + data = data.copy(deep=False) + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) + if index is None and columns is None and dtype is None and not copy: + # GH#33357 fastpath + NDFrame.__init__(self, data) + return + + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + + # GH47215 + if isinstance(index, set): + raise ValueError("index cannot be a set") + if isinstance(columns, set): + raise ValueError("columns cannot be a set") + + if copy is None: + if isinstance(data, dict): + # retain pre-GH#38939 default behavior + copy = True + elif not isinstance(data, (Index, DataFrame, Series)): + copy = True + else: + copy = False + + if data is None: + index = index if index is not None else default_index(0) + columns = columns if columns is not None else default_index(0) + dtype = dtype if dtype is not None else pandas_dtype(object) + data = [] + + if isinstance(data, BlockManager): + mgr = self._init_mgr( + data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy + ) + + elif isinstance(data, dict): + # GH#38939 de facto copy defaults to False only in non-dict cases + mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy) + elif isinstance(data, ma.MaskedArray): + from numpy.ma import mrecords + + # masked recarray + if isinstance(data, mrecords.MaskedRecords): + raise TypeError( + "MaskedRecords are not supported. Pass " + "{name: data[name] for name in data.dtype.names} " + "instead" + ) + + # a masked array + data = sanitize_masked_array(data) + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + ) + + elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): + if data.dtype.names: + # i.e. numpy structured array + data = cast(np.ndarray, data) + mgr = rec_array_to_mgr( + data, + index, + columns, + dtype, + copy, + ) + elif getattr(data, "name", None) is not None: + # i.e. 
Series/Index with non-None name + mgr = dict_to_mgr( + # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no + # attribute "name" + {data.name: data}, # type: ignore[union-attr] + index, + columns, + dtype=dtype, + copy=copy, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + ) + + # For data is list-like, or Iterable (will consume into list) + elif is_list_like(data): + if not isinstance(data, abc.Sequence): + if hasattr(data, "__array__"): + # GH#44616 big perf improvement for e.g. pytorch tensor + data = np.asarray(data) + else: + data = list(data) + if len(data) > 0: + if is_dataclass(data[0]): + data = dataclasses_to_dicts(data) + if not isinstance(data, np.ndarray) and treat_as_nested(data): + # exclude ndarray as we may have cast it a few lines above + if columns is not None: + columns = ensure_index(columns) + arrays, columns, index = nested_data_to_arrays( + # error: Argument 3 to "nested_data_to_arrays" has incompatible + # type "Optional[Collection[Any]]"; expected "Optional[Index]" + data, + columns, + index, # type: ignore[arg-type] + dtype, + ) + mgr = arrays_to_mgr( + arrays, + columns, + index, + dtype=dtype, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + ) + else: + mgr = dict_to_mgr( + {}, + index, + columns if columns is not None else default_index(0), + dtype=dtype, + ) + # For data is scalar + else: + if index is None or columns is None: + raise ValueError("DataFrame constructor not properly called!") + + index = ensure_index(index) + columns = ensure_index(columns) + + if not dtype: + dtype, _ = infer_dtype_from_scalar(data) + + # For data is a scalar extension dtype + if isinstance(dtype, ExtensionDtype): + # TODO(EA2D): special case not needed with 2D EAs + + values = [ + construct_1d_arraylike_from_scalar(data, len(index), dtype) + for _ in range(len(columns)) + ] + mgr = arrays_to_mgr(values, columns, index, dtype=None) + else: + arr2d = construct_2d_arraylike_from_scalar( + data, + len(index), + len(columns), + dtype, + copy, + ) + + mgr = ndarray_to_mgr( + arr2d, + index, + columns, + dtype=arr2d.dtype, + copy=False, + ) + + NDFrame.__init__(self, mgr) + + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtypes.iloc[0] != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The DataFrame " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + + @classmethod + def _from_data(cls, data, index=None, columns=None, dtype=None, copy=False): + """ + Internal constructor that bypasses the deprecated BlockManager warning. + + This method is intended for internal use only. + + Parameters + ---------- + data : BlockManager, array-like, Iterable, dict, or DataFrame + Data to be stored in DataFrame. + index : Index or array-like, optional + Index for the DataFrame. + columns : Index or array-like, optional + Column labels for the DataFrame. + dtype : dtype, optional + Data type for the DataFrame. + copy : bool, default False + Whether to copy the data. 
+
+        Returns
+        -------
+        DataFrame
+        """
+        if isinstance(data, BlockManager):
+            # Skip the deprecation warning for internal use
+            obj = cls.__new__(cls)
+            NDFrame.__init__(obj, data)
+            return obj
+
+        # Use the regular constructor for other data types
+        return cls(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
+
+    @classmethod
+    def _create_with_data(cls, data, index=None, columns=None, dtype=None, copy=False):
+        """
+        Create a DataFrame from data for subclasses.
+
+        This is a recommended helper method for subclasses to create new
+        instances from data without triggering deprecation warnings.
+
+        Parameters
+        ----------
+        data : array-like, Iterable, dict, or DataFrame
+            Data to be stored in DataFrame.
+        index : Index or array-like, optional
+            Index for the DataFrame.
+        columns : Index or array-like, optional
+            Column labels for the DataFrame.
+        dtype : dtype, optional
+            Data type for the DataFrame.
+        copy : bool, default False
+            Whether to copy the data.
+
+        Returns
+        -------
+        DataFrame or subclass
+            DataFrame of the subclass type.
+
+        Notes
+        -----
+        This method is primarily intended for subclass authors to create
+        instances of their subclass from array-like data.
+        """
+        # Prepare the data safely, avoiding passing internal managers around
+        if isinstance(data, DataFrame):
+            if index is None:
+                index = data.index
+            if columns is None:
+                columns = data.columns
+
+        # Create a DataFrame using public APIs
+        df = cls(data, index=index, columns=columns, dtype=dtype, copy=copy)
+        return df
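+    # NOTE (illustrative sketch, not part of this change): ``MyFrame`` below
+    # is a hypothetical subclass showing how ``_create_with_data`` is meant
+    # to be used. Only public constructor arguments are involved, so no
+    # DeprecationWarning about Manager objects is triggered::
+    #
+    #     import pandas as pd
+    #
+    #     class MyFrame(pd.DataFrame):
+    #         @property
+    #         def _constructor(self):
+    #             return MyFrame
+    #
+    #     mf = MyFrame._create_with_data({"a": [1, 2]}, index=["x", "y"])
+    #     assert isinstance(mf, MyFrame)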
+    # ----------------------------------------------------------------------
+
+    def __dataframe__(
+        self, nan_as_null: bool = False, allow_copy: bool = True
+    ) -> DataFrameXchg:
+        """
+        Return the dataframe interchange object implementing the interchange protocol.
+
+        Parameters
+        ----------
+        nan_as_null : bool, default False
+            `nan_as_null` is DEPRECATED and has no effect. Please avoid using
+            it; it will be removed in a future release.
+        allow_copy : bool, default True
+            Whether to allow memory copying when exporting. If set to False
+            it would cause non-zero-copy exports to fail.
+
+        Returns
+        -------
+        DataFrame interchange object
+            The object which a consuming library can use to ingest the dataframe.
+
+        Notes
+        -----
+        Details on the interchange protocol:
+        https://data-apis.org/dataframe-protocol/latest/index.html
+
+        Examples
+        --------
+        >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
+        >>> interchange_object.column_names()
+        Index(['A', 'B'], dtype='object')
+        >>> df_pandas = pd.api.interchange.from_dataframe(
+        ...     interchange_object.select_columns_by_name(["A"])
+        ... )
+        >>> df_pandas
+           A
+        0  1
+        1  2
+
+        These methods (``column_names``, ``select_columns_by_name``) should work
+        for any dataframe library which implements the interchange protocol.
+        """
+        from pandas.core.interchange.dataframe import PandasDataFrameXchg
+
+        return PandasDataFrameXchg(self, allow_copy=allow_copy)
+
+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the pandas DataFrame as an Arrow C stream PyCapsule.
+
+        This relies on pyarrow to convert the pandas DataFrame to the Arrow
+        format (and follows the default behaviour of ``pyarrow.Table.from_pandas``
+        in its handling of the index, i.e. store the index as a column except
+        for RangeIndex).
+        This conversion is not necessarily zero-copy.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the dataframe should be cast, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema.
+
+        Returns
+        -------
+        PyCapsule
+        """
+        pa = import_optional_dependency("pyarrow", min_version="14.0.0")
+        if requested_schema is not None:
+            requested_schema = pa.Schema._import_from_c_capsule(requested_schema)
+        table = pa.Table.from_pandas(self, schema=requested_schema)
+        return table.__arrow_c_stream__()
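+    # NOTE (illustrative sketch, assumes pyarrow >= 14.0 is installed, per
+    # the ``import_optional_dependency`` check above): any Arrow-aware
+    # consumer can ingest the DataFrame through the PyCapsule interface;
+    # pyarrow itself also accepts the frame directly::
+    #
+    #     import pandas as pd
+    #     import pyarrow as pa
+    #
+    #     df = pd.DataFrame({"a": [1, 2, 3]})
+    #     capsule = df.__arrow_c_stream__()  # PyCapsule wrapping an ArrowArrayStream
+    #     table = pa.table(df)  # consumers typically go through pa.table()
+    #     assert table.column_names == ["a"]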
+ """ + width, height = console.get_console_size() + max_columns = get_option("display.max_columns") + nb_columns = len(self.columns) + + # exceed max columns + if (max_columns and nb_columns > max_columns) or ( + width and nb_columns > (width // 2) + ): + return False + + # used by repr_html under IPython notebook or scripts ignore terminal + # dims + if width is None or not console.in_interactive_session(): + return True + + if get_option("display.width") is not None or console.in_ipython_frontend(): + # check at least the column row for excessive width + max_rows = 1 + else: + max_rows = get_option("display.max_rows") + + # when auto-detecting, so width=None and not in ipython front end + # check whether repr fits horizontal by actually checking + # the width of the rendered repr + buf = StringIO() + + # only care about the stuff we'll actually print out + # and to_string on entire frame may be expensive + d = self + + if max_rows is not None: # unlimited rows + # min of two, where one may be None + d = d.iloc[: min(max_rows, len(d))] + else: + return True + + d.to_string(buf=buf) + value = buf.getvalue() + repr_width = max(len(line) for line in value.split("\n")) + + return repr_width < width + + def _info_repr(self) -> bool: + """ + True if the repr should show the info view. + """ + info_repr_option = get_option("display.large_repr") == "info" + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) + + def __repr__(self) -> str: + """ + Return a string representation for a particular DataFrame. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + return buf.getvalue() + + repr_params = fmt.get_dataframe_repr_params() + return self.to_string(**repr_params) + + def _repr_html_(self) -> str | None: + """ + Return a html representation for a particular DataFrame. + + Mainly for IPython notebook. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + # need to escape the , should be the first line. + val = buf.getvalue().replace("<", r"<", 1) + val = val.replace(">", r">", 1) + return f"
{val}
" + + if get_option("display.notebook_repr_html"): + max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") + max_cols = get_option("display.max_columns") + show_dimensions = get_option("display.show_dimensions") + + formatter = fmt.DataFrameFormatter( + self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=".", + ) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) + else: + return None + + @overload + def to_string( + self, + buf: None = ..., + *, + columns: Axes | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> str: ... + + @overload + def to_string( + self, + buf: FilePath | WriteBuffer[str], + *, + columns: Axes | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> None: ... + + @Substitution( + header_type="bool or list of str", + header="Write out the column names. If a list of columns " + "is given, it is assumed to be aliases for the " + "column names", + col_space_type="int, list or dict of int", + col_space="The minimum width of each column. If a list of ints is given " + "every integers corresponds with one column. If a dict is given, the key " + "references the column, while the value defines the space to use.", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_string( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + columns: Axes | None = None, + col_space: int | list[int] | dict[Hashable, int] | None = None, + header: bool | SequenceNotStr[str] = True, + index: bool = True, + na_rep: str = "NaN", + formatters: fmt.FormattersType | None = None, + float_format: fmt.FloatFormatType | None = None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + line_width: int | None = None, + min_rows: int | None = None, + max_colwidth: int | None = None, + encoding: str | None = None, + ) -> str | None: + """ + Render a DataFrame to a console-friendly tabular output. + %(shared_params)s + line_width : int, optional + Width to wrap a line in characters. 
+ min_rows : int, optional + The number of rows to display in the console in a truncated repr + (when number of rows is above `max_rows`). + max_colwidth : int, optional + Max width to truncate each column in characters. By default, no limit. + encoding : str, default "utf-8" + Set character encoding. + %(returns)s + See Also + -------- + to_html : Convert DataFrame to HTML. + + Examples + -------- + >>> d = {"col1": [1, 2, 3], "col2": [4, 5, 6]} + >>> df = pd.DataFrame(d) + >>> print(df.to_string()) + col1 col2 + 0 1 4 + 1 2 5 + 2 3 6 + """ + from pandas import option_context + + with option_context("display.max_colwidth", max_colwidth): + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + min_rows=min_rows, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + ) + return fmt.DataFrameRenderer(formatter).to_string( + buf=buf, + encoding=encoding, + line_width=line_width, + ) + + def _get_values_for_csv( + self, + *, + float_format: FloatFormatType | None, + date_format: str | None, + decimal: str, + na_rep: str, + quoting, # int csv.QUOTE_FOO from stdlib + ) -> DataFrame: + # helper used by to_csv + mgr = self._mgr.get_values_for_csv( + float_format=float_format, + date_format=date_format, + decimal=decimal, + na_rep=na_rep, + quoting=quoting, + ) + return self._constructor_from_mgr(mgr, axes=mgr.axes) + + # ---------------------------------------------------------------------- + + @property + def style(self) -> Styler: + """ + Returns a Styler object. + + Contains methods for building a styled HTML representation of the DataFrame. + + See Also + -------- + io.formats.style.Styler : Helps style a DataFrame or Series according to the + data with HTML and CSS. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3]}) + >>> df.style # doctest: +SKIP + + Please see + `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. + """ + from pandas.io.formats.style import Styler + + return Styler(self) + + _shared_docs["items"] = r""" + Iterate over (column name, Series) pairs. + + Iterates over the DataFrame columns, returning a tuple with + the column name and the content as a Series. + + Yields + ------ + label : object + The column names for the DataFrame being iterated over. + content : Series + The column entries belonging to each label, as a Series. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as + (index, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples + of the values. + + Examples + -------- + >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'], + ... 'population': [1864, 22000, 80000]}, + ... index=['panda', 'polar', 'koala']) + >>> df + species population + panda bear 1864 + polar bear 22000 + koala marsupial 80000 + >>> for label, content in df.items(): + ... print(f'label: {label}') + ... print(f'content: {content}', sep='\n') + ... 
+ label: species + content: + panda bear + polar bear + koala marsupial + Name: species, dtype: object + label: population + content: + panda 1864 + polar 22000 + koala 80000 + Name: population, dtype: int64 + """ + + @Appender(_shared_docs["items"]) + def items(self) -> Iterable[tuple[Hashable, Series]]: + for i, k in enumerate(self.columns): + yield k, self._ixs(i, axis=1) + + def iterrows(self) -> Iterable[tuple[Hashable, Series]]: + """ + Iterate over DataFrame rows as (index, Series) pairs. + + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + + See Also + -------- + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + 1. Because ``iterrows`` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`itertuples` which returns namedtuples of the values + and which is generally faster than ``iterrows``. + + 2. You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect. + + Examples + -------- + + >>> df = pd.DataFrame([[1, 1.5]], columns=["int", "float"]) + >>> row = next(df.iterrows())[1] + >>> row + int 1.0 + float 1.5 + Name: 0, dtype: float64 + >>> print(row["int"].dtype) + float64 + >>> print(df["int"].dtype) + int64 + """ + columns = self.columns + klass = self._constructor_sliced + for k, v in zip(self.index, self.values): + s = klass(v, index=columns, name=k).__finalize__(self) + if self._mgr.is_single_block: + s._mgr.add_references(self._mgr) + yield k, s + + def itertuples( + self, index: bool = True, name: str | None = "Pandas" + ) -> Iterable[tuple[Any, ...]]: + """ + Iterate over DataFrame rows as namedtuples. + + Parameters + ---------- + index : bool, default True + If True, return the index as the first element of the tuple. + name : str or None, default "Pandas" + The name of the returned namedtuples or None to return regular + tuples. + + Returns + ------- + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) + pairs. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + The column names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + + Examples + -------- + >>> df = pd.DataFrame( + ... {"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"] + ... ) + >>> df + num_legs num_wings + dog 4 0 + hawk 2 2 + >>> for row in df.itertuples(): + ... print(row) + Pandas(Index='dog', num_legs=4, num_wings=0) + Pandas(Index='hawk', num_legs=2, num_wings=2) + + By setting the `index` parameter to False we can remove the index + as the first element of the tuple: + + >>> for row in df.itertuples(index=False): + ... print(row) + Pandas(num_legs=4, num_wings=0) + Pandas(num_legs=2, num_wings=2) + + With the `name` parameter set we set a custom name for the yielded + namedtuples: + + >>> for row in df.itertuples(name="Animal"): + ... 
+            print(row)
+        Animal(Index='dog', num_legs=4, num_wings=0)
+        Animal(Index='hawk', num_legs=2, num_wings=2)
+        """
+        arrays = []
+        fields = list(self.columns)
+        if index:
+            arrays.append(self.index)
+            fields.insert(0, "Index")
+
+        # use integer indexing because of possible duplicate column names
+        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
+
+        if name is not None:
+            # https://github.com/python/mypy/issues/9046
+            # error: namedtuple() expects a string literal as the first argument
+            itertuple = collections.namedtuple(  # type: ignore[misc]
+                name, fields, rename=True
+            )
+            return map(itertuple._make, zip(*arrays))
+
+        # fallback to regular tuples
+        return zip(*arrays)
+
+    def __len__(self) -> int:
+        """
+        Returns the length of the info axis, which for a DataFrame is the index.
+        """
+        return len(self.index)
+
+    @overload
+    def dot(self, other: Series) -> Series: ...
+
+    @overload
+    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: ...
+
+    def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
+        """
+        Compute the matrix multiplication between the DataFrame and other.
+
+        This method computes the matrix product between the DataFrame and the
+        values of another Series, DataFrame or a numpy array.
+
+        It can also be called using ``self @ other``.
+
+        Parameters
+        ----------
+        other : Series, DataFrame or array-like
+            The other object to compute the matrix product with.
+
+        Returns
+        -------
+        Series or DataFrame
+            If other is a Series, return the matrix product between self and
+            other as a Series. If other is a DataFrame or a numpy.array, return
+            the matrix product of self and other as a DataFrame.
+
+        See Also
+        --------
+        Series.dot: Similar method for Series.
+
+        Notes
+        -----
+        The dimensions of DataFrame and other must be compatible in order to
+        compute the matrix multiplication. In addition, the column names of
+        DataFrame and the index of other must contain the same values, as they
+        will be aligned prior to the multiplication.
+
+        The dot method for Series computes the inner product, instead of the
+        matrix product here.
+
+        Examples
+        --------
+        Here we multiply a DataFrame with a Series.
+
+        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+        >>> s = pd.Series([1, 1, 2, 1])
+        >>> df.dot(s)
+        0    -4
+        1     5
+        dtype: int64
+
+        Here we multiply a DataFrame with another DataFrame.
+
+        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
+        >>> df.dot(other)
+           0  1
+        0  1  4
+        1  2  2
+
+        Note that the dot method gives the same result as ``@``:
+
+        >>> df @ other
+           0  1
+        0  1  4
+        1  2  2
+
+        The dot method also works if other is an np.array.
+
+        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
+        >>> df.dot(arr)
+           0  1
+        0  1  4
+        1  2  2
+
+        Note how shuffling of the objects does not change the result.
+ + >>> s2 = s.reindex([1, 0, 2, 3]) + >>> df.dot(s2) + 0 -4 + 1 5 + dtype: int64 + """ + if isinstance(other, (Series, DataFrame)): + common = self.columns.union(other.index) + if len(common) > len(self.columns) or len(common) > len(other.index): + raise ValueError("matrices are not aligned") + + left = self.reindex(columns=common) + right = other.reindex(index=common) + lvals = left.values + rvals = right._values + else: + left = self + lvals = self.values + rvals = np.asarray(other) + if lvals.shape[1] != rvals.shape[0]: + raise ValueError( + f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" + ) + + if isinstance(other, DataFrame): + common_type = find_common_type(list(self.dtypes) + list(other.dtypes)) + return self._constructor( + np.dot(lvals, rvals), + index=left.index, + columns=other.columns, + copy=False, + dtype=common_type, + ) + elif isinstance(other, Series): + common_type = find_common_type(list(self.dtypes) + [other.dtypes]) + return self._constructor_sliced( + np.dot(lvals, rvals), index=left.index, copy=False, dtype=common_type + ) + elif isinstance(rvals, (np.ndarray, Index)): + result = np.dot(lvals, rvals) + if result.ndim == 2: + return self._constructor(result, index=left.index, copy=False) + else: + return self._constructor_sliced(result, index=left.index, copy=False) + else: # pragma: no cover + raise TypeError(f"unsupported type: {type(other)}") + + @overload + def __matmul__(self, other: Series) -> Series: ... + + @overload + def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: ... + + def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: + """ + Matrix multiplication using binary `@` operator. + """ + return self.dot(other) + + def __rmatmul__(self, other) -> DataFrame: + """ + Matrix multiplication using binary `@` operator. + """ + try: + return self.T.dot(np.transpose(other)).T + except ValueError as err: + if "shape mismatch" not in str(err): + raise + # GH#21581 give exception message for original shapes + msg = f"shapes {np.shape(other)} and {self.shape} not aligned" + raise ValueError(msg) from err + + # ---------------------------------------------------------------------- + # IO methods (to / from other formats) + + @classmethod + def from_dict( + cls, + data: dict, + orient: FromDictOrient = "columns", + dtype: Dtype | None = None, + columns: Axes | None = None, + ) -> DataFrame: + """ + Construct DataFrame from dict of array-like or dicts. + + Creates DataFrame object from dictionary by columns or by index + allowing dtype specification. + + Parameters + ---------- + data : dict + Of the form {field : array-like} or {field : dict}. + orient : {'columns', 'index', 'tight'}, default 'columns' + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. + If 'tight', assume a dict with keys ['index', 'columns', 'data', + 'index_names', 'column_names']. + + .. versionadded:: 1.4.0 + 'tight' as an allowed value for the ``orient`` argument + + dtype : dtype, default None + Data type to force after DataFrame construction, otherwise infer. + columns : list, default None + Column labels to use when ``orient='index'``. Raises a ValueError + if used with ``orient='columns'`` or ``orient='tight'``. + + Returns + ------- + DataFrame + + See Also + -------- + DataFrame.from_records : DataFrame from structured ndarray, sequence + of tuples or dicts, or DataFrame. 
+        DataFrame : DataFrame object creation using constructor.
+        DataFrame.to_dict : Convert the DataFrame to a dictionary.
+
+        Examples
+        --------
+        By default the keys of the dict become the DataFrame columns:
+
+        >>> data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
+        >>> pd.DataFrame.from_dict(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Specify ``orient='index'`` to create the DataFrame using dictionary
+        keys as rows:
+
+        >>> data = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]}
+        >>> pd.DataFrame.from_dict(data, orient="index")
+               0  1  2  3
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        When using the 'index' orientation, the column names can be
+        specified manually:
+
+        >>> pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
+               A  B  C  D
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        Specify ``orient='tight'`` to create the DataFrame using a 'tight'
+        format:
+
+        >>> data = {
+        ...     "index": [("a", "b"), ("a", "c")],
+        ...     "columns": [("x", 1), ("y", 2)],
+        ...     "data": [[1, 3], [2, 4]],
+        ...     "index_names": ["n1", "n2"],
+        ...     "column_names": ["z1", "z2"],
+        ... }
+        >>> pd.DataFrame.from_dict(data, orient="tight")
+        z1     x  y
+        z2     1  2
+        n1 n2
+        a  b   1  3
+           c   2  4
+        """
+        index: list | Index | None = None
+        orient = orient.lower()  # type: ignore[assignment]
+        if orient == "index":
+            if len(data) > 0:
+                # TODO speed up Series case
+                if isinstance(next(iter(data.values())), (Series, dict)):
+                    data = _from_nested_dict(data)
+                else:
+                    index = list(data.keys())
+                    # error: Incompatible types in assignment (expression has type
+                    # "List[Any]", variable has type "Dict[Any, Any]")
+                    data = list(data.values())  # type: ignore[assignment]
+        elif orient in ("columns", "tight"):
+            if columns is not None:
+                raise ValueError(f"cannot use columns parameter with orient='{orient}'")
+        else:  # pragma: no cover
+            raise ValueError(
+                f"Expected 'index', 'columns' or 'tight' for orient parameter. "
+                f"Got '{orient}' instead"
+            )
+
+        if orient != "tight":
+            return cls(data, index=index, columns=columns, dtype=dtype)
+        else:
+            realdata = data["data"]
+
+            def create_index(indexlist, namelist) -> Index:
+                index: Index
+                if len(namelist) > 1:
+                    index = MultiIndex.from_tuples(indexlist, names=namelist)
+                else:
+                    index = Index(indexlist, name=namelist[0])
+                return index
+
+            index = create_index(data["index"], data["index_names"])
+            columns = create_index(data["columns"], data["column_names"])
+            return cls(realdata, index=index, columns=columns, dtype=dtype)
+
+    def to_numpy(
+        self,
+        dtype: npt.DTypeLike | None = None,
+        copy: bool = False,
+        na_value: object = lib.no_default,
+    ) -> np.ndarray:
+        """
+        Convert the DataFrame to a NumPy array.
+
+        By default, the dtype of the returned array will be the common NumPy
+        dtype of all types in the DataFrame. For example, if the dtypes are
+        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
+        This may require copying data and coercing values, which may be
+        expensive.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to pass to :meth:`numpy.asarray`.
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+            a copy is made, even if not strictly necessary.
+        na_value : Any, optional
+            The value to use for missing values. The default value depends
+            on `dtype` and the dtypes of the DataFrame columns.
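An editorial sketch of the `dtype`/`na_value` interplay described above (the frame and the -1 sentinel are invented for illustration, not taken from the patch):

import pandas as pd

df = pd.DataFrame({"A": pd.array([1, None], dtype="Int64"), "B": [3, 4]})
# Without na_value, the missing entry would force an object-dtype result;
# substituting a sentinel first lets the conversion stay plain int64.
arr = df.to_numpy(dtype="int64", na_value=-1)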
+ + Returns + ------- + numpy.ndarray + The NumPy array representing the values in the DataFrame. + + See Also + -------- + Series.to_numpy : Similar method for Series. + + Examples + -------- + >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + array([[1, 3], + [2, 4]]) + + With heterogeneous data, the lowest common type will have to + be used. + + >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) + >>> df.to_numpy() + array([[1. , 3. ], + [2. , 4.5]]) + + For a mix of numeric and non-numeric types, the output array will + have object dtype. + + >>> df["C"] = pd.date_range("2000", periods=2) + >>> df.to_numpy() + array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], + [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) + """ + if dtype is not None: + dtype = np.dtype(dtype) + result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) + if result.dtype is not dtype: + result = np.asarray(result, dtype=dtype) + + return result + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> MutableMappingT: ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> list[MutableMappingT]: ... + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[dict] = ..., + index: bool = ..., + ) -> dict: ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[dict] = ..., + index: bool = ..., + ) -> list[dict]: ... + + # error: Incompatible default for argument "into" (default has type "type + # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + def to_dict( + self, + orient: Literal[ + "dict", "list", "series", "split", "tight", "records", "index" + ] = "dict", + *, + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] + index: bool = True, + ) -> MutableMappingT | list[MutableMappingT]: + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Parameters + ---------- + orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} + Determines the type of the values of the dictionary. + + - 'dict' (default) : dict like {column -> {index -> value}} + - 'list' : dict like {column -> [values]} + - 'series' : dict like {column -> Series(values)} + - 'split' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} + - 'tight' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values], + 'index_names' -> [index.names], 'column_names' -> [column.names]} + - 'records' : list like + [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + + .. versionadded:: 1.4.0 + 'tight' as an allowed value for the ``orient`` argument + + into : class, default dict + The collections.abc.MutableMapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. 
+            Note that when `orient` is 'records', this parameter does not
+            take effect (index item always not included).
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        dict, list or collections.abc.MutableMapping
+            Return a collections.abc.MutableMapping object representing the
+            DataFrame. The resulting transformation depends on the `orient`
+            parameter.
+
+        See Also
+        --------
+        DataFrame.from_dict: Create a DataFrame from a dictionary.
+        DataFrame.to_json: Convert a DataFrame to JSON format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"col1": [1, 2], "col2": [0.5, 0.75]}, index=["row1", "row2"]
+        ... )
+        >>> df
+              col1  col2
+        row1     1  0.50
+        row2     2  0.75
+        >>> df.to_dict()
+        {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
+
+        You can specify the return orientation.
+
+        >>> df.to_dict("series")
+        {'col1': row1    1
+                 row2    2
+        Name: col1, dtype: int64,
+        'col2': row1    0.50
+                row2    0.75
+        Name: col2, dtype: float64}
+
+        >>> df.to_dict("split")
+        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+         'data': [[1, 0.5], [2, 0.75]]}
+
+        >>> df.to_dict("records")
+        [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
+
+        >>> df.to_dict("index")
+        {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
+
+        >>> df.to_dict("tight")
+        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+         'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
+
+        You can also specify the mapping type.
+
+        >>> from collections import OrderedDict, defaultdict
+        >>> df.to_dict(into=OrderedDict)
+        OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
+                     ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
+
+        If you want a `defaultdict`, you need to initialize it:
+
+        >>> dd = defaultdict(list)
+        >>> df.to_dict("records", into=dd)
+        [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
+         defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
+        """
+        from pandas.core.methods.to_dict import to_dict
+
+        return to_dict(self, orient, into=into, index=index)
+
+    @classmethod
+    def from_records(
+        cls,
+        data,
+        index=None,
+        exclude=None,
+        columns=None,
+        coerce_float: bool = False,
+        nrows: int | None = None,
+    ) -> DataFrame:
+        """
+        Convert structured or record ndarray to DataFrame.
+
+        Creates a DataFrame object from a structured ndarray, sequence of
+        tuples or dicts, or DataFrame.
+
+        Parameters
+        ----------
+        data : structured ndarray, sequence of tuples or dicts
+            Structured input data.
+        index : str, list of fields, array-like
+            Field of array to use as the index, alternately a specific set of
+            input labels to use.
+        exclude : sequence, default None
+            Columns or fields to exclude.
+        columns : sequence, default None
+            Column names to use. If the passed data do not have names
+            associated with them, this argument provides names for the
+            columns. Otherwise this argument indicates the order of the columns
+            in the result (any names not found in the data will become all-NA
+            columns).
+        coerce_float : bool, default False
+            Attempt to convert values of non-string, non-numeric objects (like
+            decimal.Decimal) to floating point, useful for SQL result sets.
+        nrows : int, default None
+            Number of rows to read if data is an iterator.
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        DataFrame.from_dict : DataFrame from dict of array-like or dicts.
+        DataFrame : DataFrame object creation using constructor.
+
+        Examples
+        --------
+        Data can be provided as a structured ndarray:
+
+        >>> data = np.array(
+        ...     [(3, "a"), (2, "b"), (1, "c"), (0, "d")],
+        ...
dtype=[("col_1", "i4"), ("col_2", "U1")], + ... ) + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of dicts: + + >>> data = [ + ... {"col_1": 3, "col_2": "a"}, + ... {"col_1": 2, "col_2": "b"}, + ... {"col_1": 1, "col_2": "c"}, + ... {"col_1": 0, "col_2": "d"}, + ... ] + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of tuples with corresponding columns: + + >>> data = [(3, "a"), (2, "b"), (1, "c"), (0, "d")] + >>> pd.DataFrame.from_records(data, columns=["col_1", "col_2"]) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + """ + if isinstance(data, DataFrame): + raise TypeError( + "Passing a DataFrame to DataFrame.from_records is not supported. Use " + "set_index and/or drop to modify the DataFrame instead.", + ) + + result_index = None + + # Make a copy of the input columns so we can modify it + if columns is not None: + columns = ensure_index(columns) + + def maybe_reorder( + arrays: list[ArrayLike], arr_columns: Index, columns: Index, index + ) -> tuple[list[ArrayLike], Index, Index | None]: + """ + If our desired 'columns' do not match the data's pre-existing 'arr_columns', + we re-order our arrays. This is like a pre-emptive (cheap) reindex. + """ + if len(arrays): + length = len(arrays[0]) + else: + length = 0 + + result_index = None + if len(arrays) == 0 and index is None and length == 0: + result_index = default_index(0) + + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) + return arrays, arr_columns, result_index + + if is_iterator(data): + if nrows == 0: + return cls() + + try: + first_row = next(data) + except StopIteration: + return cls(index=index, columns=columns) + + dtype = None + if hasattr(first_row, "dtype") and first_row.dtype.names: + dtype = first_row.dtype + + values = [first_row] + + if nrows is None: + values += data + else: + values.extend(itertools.islice(data, nrows - 1)) + + if dtype is not None: + data = np.array(values, dtype=dtype) + else: + data = values + + if isinstance(data, dict): + if columns is None: + columns = arr_columns = ensure_index(sorted(data)) + arrays = [data[k] for k in columns] + else: + arrays = [] + arr_columns_list = [] + for k, v in data.items(): + if k in columns: + arr_columns_list.append(k) + arrays.append(v) + + arr_columns = Index(arr_columns_list) + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) + + elif isinstance(data, np.ndarray): + arrays, columns = to_arrays(data, columns) + arr_columns = columns + else: + arrays, arr_columns = to_arrays(data, columns) + if coerce_float: + for i, arr in enumerate(arrays): + if arr.dtype == object: + # error: Argument 1 to "maybe_convert_objects" has + # incompatible type "Union[ExtensionArray, ndarray]"; + # expected "ndarray" + arrays[i] = lib.maybe_convert_objects( + arr, # type: ignore[arg-type] + try_float=True, + ) + + arr_columns = ensure_index(arr_columns) + if columns is None: + columns = arr_columns + else: + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) + + if exclude is None: + exclude = set() + else: + exclude = set(exclude) + + if index is not None: + if isinstance(index, str) or not hasattr(index, "__iter__"): + i = columns.get_loc(index) + exclude.add(index) + if len(arrays) > 0: + result_index = Index(arrays[i], name=index) + else: + result_index = Index([], name=index) + else: + try: + index_data = 
+                    [arrays[arr_columns.get_loc(field)] for field in index]
+                except (KeyError, TypeError):
+                    # raised by get_loc, see GH#29258
+                    result_index = index
+                else:
+                    result_index = ensure_index_from_sequences(index_data, names=index)
+                    exclude.update(index)
+
+        if any(exclude):
+            arr_exclude = [x for x in exclude if x in arr_columns]
+            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
+            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
+
+            columns = columns.drop(exclude)
+
+        mgr = arrays_to_mgr(arrays, columns, result_index)
+        return cls._from_mgr(mgr, axes=mgr.axes)
+
+    def to_records(
+        self, index: bool = True, column_dtypes=None, index_dtypes=None
+    ) -> np.rec.recarray:
+        """
+        Convert DataFrame to a NumPy record array.
+
+        Index will be included as the first field of the record array if
+        requested.
+
+        Parameters
+        ----------
+        index : bool, default True
+            Include index in resulting record array, stored in 'index'
+            field or using the index label, if set.
+        column_dtypes : str, type, dict, default None
+            If a string or type, the data type to store all columns. If
+            a dictionary, a mapping of column names and indices (zero-indexed)
+            to specific data types.
+        index_dtypes : str, type, dict, default None
+            If a string or type, the data type to store all index levels. If
+            a dictionary, a mapping of index level names and indices
+            (zero-indexed) to specific data types.
+
+            This mapping is applied only if `index=True`.
+
+        Returns
+        -------
+        numpy.rec.recarray
+            NumPy ndarray with the DataFrame labels as fields and each row
+            of the DataFrame as entries.
+
+        See Also
+        --------
+        DataFrame.from_records: Convert structured or record ndarray
+            to DataFrame.
+        numpy.rec.recarray: An ndarray that allows field access using
+            attributes, analogous to typed columns in a
+            spreadsheet.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"])
+        >>> df
+           A     B
+        a  1  0.50
+        b  2  0.75
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        If the DataFrame index has no label then the recarray field name
+        is set to 'index'. If the index has a label then this is used as the
+        field name:
+
+        >>> df.index = df.index.rename("I")
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The index can be excluded from the record array:
+
+        >>> df.to_records(index=False)
+        rec.array([(1, 0.5 ), (2, 0.75)],
+                  dtype=[('A', '<i8'), ('B', '<f8')])
+
+        Data types can be specified for the columns:
+
+        >>> df.to_records(column_dtypes={"A": "int32"})
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
+
+        As well as for the index:
+
+        >>> df.to_records(index_dtypes="<S2")
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
+
+        >>> index_dtypes = f"<S{df.index.str.len().max()}"
+        >>> df.to_records(index_dtypes=index_dtypes)
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
+        """
+
+    @classmethod
+    def _from_arrays(
+        cls,
+        arrays,
+        columns,
+        index,
+        dtype: Dtype | None = None,
+        verify_integrity: bool = True,
+    ) -> Self:
+        """
+        Create DataFrame from a list of arrays corresponding to the columns.
+
+        Parameters
+        ----------
+        arrays : list-like of arrays
+            Each array in the list corresponds to one column, in order.
+        columns : list-like, Index
+            The column names for the resulting DataFrame.
+        index : list-like, Index
+            The row labels for the resulting DataFrame.
+        dtype : dtype, optional
+            Optional dtype to enforce for all arrays.
+        verify_integrity : bool, default True
+            Validate and homogenize all input. If set to False, it is assumed
+            that all elements of `arrays` are actual arrays as they will be
+            stored in a block (numpy ndarray or ExtensionArray), have the same
+            length as and are aligned with the index, and that `columns` and
+            `index` are ensured to be an Index object.
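An editorial sketch tying the structured-array input of `from_records` to the `index` field option documented above (the field names are illustrative, not from the patch):

import numpy as np
import pandas as pd

data = np.array(
    [(3, "a"), (2, "b"), (1, "c")],
    dtype=[("col_1", "i4"), ("col_2", "U1")],
)
# The named field becomes the index and is excluded from the columns.
df = pd.DataFrame.from_records(data, index="col_1")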
+ + Returns + ------- + DataFrame + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + + columns = ensure_index(columns) + if len(columns) != len(arrays): + raise ValueError("len(columns) must match len(arrays)") + mgr = arrays_to_mgr( + arrays, + columns, + index, + dtype=dtype, + verify_integrity=verify_integrity, + ) + return cls._from_mgr(mgr, axes=mgr.axes) + + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path", + ) + def to_stata( + self, + path: FilePath | WriteBuffer[bytes], + *, + convert_dates: dict[Hashable, str] | None = None, + write_index: bool = True, + byteorder: ToStataByteorder | None = None, + time_stamp: datetime.datetime | None = None, + data_label: str | None = None, + variable_labels: dict[Hashable, str] | None = None, + version: int | None = 114, + convert_strl: Sequence[Hashable] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + value_labels: dict[Hashable, dict[float, str]] | None = None, + ) -> None: + """ + Export DataFrame object to Stata dta format. + + Writes the DataFrame to a Stata dataset file. + "dta" files contain a Stata dataset. + + Parameters + ---------- + path : str, path object, or buffer + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. + + convert_dates : dict + Dictionary mapping columns containing datetime types to stata + internal format to use when writing the dates. Options are 'tc', + 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer + or a name. Datetime columns that do not have a conversion type + specified will be converted to 'tc'. Raises NotImplementedError if + a datetime column has timezone information. + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder`. + time_stamp : datetime + A datetime to use as file creation date. Default is the current + time. + data_label : str, optional + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. + version : {{114, 117, 118, 119, None}}, default 114 + Version to use in the output dta file. Set to None to let pandas + decide between 118 or 119 formats depending on the number of + columns in the frame. Version 114 can be read by Stata 10 and + later. Version 117 can be read by Stata 13 or later. Version 118 + is supported in Stata 14 and later. Version 119 is supported in + Stata 15 and later. Version 114 limits string variables to 244 + characters or fewer while versions 117 and later allow strings + with lengths up to 2,000,000 characters. Versions 118 and 119 + support Unicode characters, and version 119 supports more than + 32,767 variables. + + Version 119 should usually only be used when the number of + variables exceeds the capacity of dta format 118. Exporting + smaller datasets in format 119 may have unintended consequences, + and, as of November 2020, Stata SE cannot read version 119 files. + + convert_strl : list, optional + List of column names to convert to string columns to Stata StrL + format. Only available if version is 117. Storing strings in the + StrL format can produce smaller dta files if strings have more than + 8 characters and values are repeated. + {compression_options} + + .. 
versionchanged:: 1.4.0 Zstandard support.
+
+        {storage_options}
+
+        value_labels : dict of dicts
+            Dictionary containing columns as keys and dictionaries of column value
+            to labels as values. Labels for a single variable must be 32,000
+            characters or smaller.
+
+            .. versionadded:: 1.4.0
+
+        Raises
+        ------
+        NotImplementedError
+            * If datetimes contain timezone information
+            * Column dtype is not representable in Stata
+        ValueError
+            * Columns listed in convert_dates are neither datetime64[ns]
+              nor datetime.datetime
+            * Column listed in convert_dates is not in DataFrame
+            * Categorical label contains more than 32,000 characters
+
+        See Also
+        --------
+        read_stata : Import Stata data files.
+        io.stata.StataWriter : Low-level writer for Stata data files.
+        io.stata.StataWriter117 : Low-level writer for version 117 files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [["falcon", 350], ["parrot", 18]], columns=["animal", "speed"]
+        ... )
+        >>> df.to_stata("animals.dta")  # doctest: +SKIP
+        """
+        if version not in (114, 117, 118, 119, None):
+            raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
+        if version == 114:
+            if convert_strl is not None:
+                raise ValueError("strl is not supported in format 114")
+            from pandas.io.stata import StataWriter as statawriter
+        elif version == 117:
+            # Incompatible import of "statawriter" (imported name has type
+            # "Type[StataWriter117]", local name has type "Type[StataWriter]")
+            from pandas.io.stata import (  # type: ignore[assignment]
+                StataWriter117 as statawriter,
+            )
+        else:  # versions 118 and 119
+            # Incompatible import of "statawriter" (imported name has type
+            # "Type[StataWriter117]", local name has type "Type[StataWriter]")
+            from pandas.io.stata import (  # type: ignore[assignment]
+                StataWriterUTF8 as statawriter,
+            )
+
+        kwargs: dict[str, Any] = {}
+        if version is None or version >= 117:
+            # strl conversion is only supported >= 117
+            kwargs["convert_strl"] = convert_strl
+        if version is None or version >= 118:
+            # Specifying the version is only supported for UTF8 (118 or 119)
+            kwargs["version"] = version
+
+        writer = statawriter(
+            path,
+            self,
+            convert_dates=convert_dates,
+            byteorder=byteorder,
+            time_stamp=time_stamp,
+            data_label=data_label,
+            write_index=write_index,
+            variable_labels=variable_labels,
+            compression=compression,
+            storage_options=storage_options,
+            value_labels=value_labels,
+            **kwargs,
+        )
+        writer.write_file()
+
+    def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
+        """
+        Write a DataFrame to the binary Feather format.
+
+        Parameters
+        ----------
+        path : str, path object, file-like object
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function. If a string or a path,
+            it will be used as Root Directory path when writing a partitioned dataset.
+        **kwargs :
+            Additional keywords passed to :func:`pyarrow.feather.write_feather`.
+            This includes the `compression`, `compression_level`, `chunksize`
+            and `version` keywords.
+
+        Notes
+        -----
+        This function writes the dataframe as a `feather file
+        <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
+        index. For saving the DataFrame with your custom index use a method that
+        supports custom indices e.g. `to_parquet`.
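A minimal sketch of the note above: surface a custom index as a column before writing Feather (the file name is a placeholder):

import pandas as pd

df = pd.DataFrame({"a": [1, 2]}, index=pd.Index(["x", "y"], name="key"))
# to_feather expects a default RangeIndex, so move the index into a column.
df.reset_index().to_feather("data.feather")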
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+        >>> df.to_feather("file.feather")  # doctest: +SKIP
+        """
+        from pandas.io.feather_format import to_feather
+
+        to_feather(self, path, **kwargs)
+
+    @overload
+    def to_markdown(
+        self,
+        buf: None = ...,
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> str: ...
+
+    @overload
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> None: ...
+
+    @overload
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str] | None,
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> str | None: ...
+
+    @doc(
+        Series.to_markdown,
+        klass=_shared_doc_kwargs["klass"],
+        storage_options=_shared_docs["storage_options"],
+        examples="""Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
+        ... )
+        >>> print(df.to_markdown())
+        |    | animal_1   | animal_2   |
+        |---:|:-----------|:-----------|
+        |  0 | elk        | dog        |
+        |  1 | pig        | quetzal    |
+
+        Output markdown with a tabulate option.
+
+        >>> print(df.to_markdown(tablefmt="grid"))
+        +----+------------+------------+
+        |    | animal_1   | animal_2   |
+        +====+============+============+
+        |  0 | elk        | dog        |
+        +----+------------+------------+
+        |  1 | pig        | quetzal    |
+        +----+------------+------------+""",
+    )
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        *,
+        mode: str = "wt",
+        index: bool = True,
+        storage_options: StorageOptions | None = None,
+        **kwargs,
+    ) -> str | None:
+        if "showindex" in kwargs:
+            raise ValueError("Pass 'index' instead of 'showindex'")
+
+        kwargs.setdefault("headers", "keys")
+        kwargs.setdefault("tablefmt", "pipe")
+        kwargs.setdefault("showindex", index)
+        tabulate = import_optional_dependency("tabulate")
+        result = tabulate.tabulate(self, **kwargs)
+        if buf is None:
+            return result
+
+        with get_handle(buf, mode, storage_options=storage_options) as handles:
+            handles.handle.write(result)
+        return None
+
+    @overload
+    def to_parquet(
+        self,
+        path: FilePath | WriteBuffer[bytes],
+        *,
+        engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
+        compression: str | None = ...,
+        index: bool | None = ...,
+        partition_cols: list[str] | None = ...,
+        storage_options: StorageOptions = ...,
+        **kwargs,
+    ) -> None: ...
+
+    @doc(storage_options=_shared_docs["storage_options"])
+    def to_parquet(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None = None,
+        *,
+        engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
+        compression: str | None = "snappy",
+        index: bool | None = None,
+        partition_cols: list[str] | None = None,
+        storage_options: StorageOptions | None = None,
+        **kwargs,
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the binary parquet format.
+
+        This function writes the dataframe as a `parquet file
+        <https://parquet.apache.org/>`_. You can choose different parquet
+        backends, and have the option of compression. See
+        :ref:`the user guide <io.parquet>` for more details.
+
+        Parameters
+        ----------
+        path : str, path object, file-like object, or None, default None
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function. If None, the result is
+            returned as bytes. If a string or path, it will be used as Root Directory
+            path when writing a partitioned dataset.
+        engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
+            Parquet library to use. If 'auto', then the option
+            ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+            behavior is to try 'pyarrow', falling back to 'fastparquet' if
+            'pyarrow' is unavailable.
+        compression : str or None, default 'snappy'
+            Name of the compression to use. Use ``None`` for no compression.
+            Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'.
+        index : bool, default None
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file.
+            If ``None``, similar to ``True`` the dataframe's index(es)
+            will be saved. However, instead of being saved as values,
+            the RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        partition_cols : list, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+            Must be None if path is not a string.
+        {storage_options}
+
+        **kwargs
+            Additional arguments passed to the parquet library. See
+            :ref:`pandas io <io.parquet>` for more details.
+
+        Returns
+        -------
+        bytes if no path argument is provided else None
+
+        See Also
+        --------
+        read_parquet : Read a parquet file.
+        DataFrame.to_orc : Write an orc file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        This function requires either the `fastparquet
+        <https://github.com/dask/fastparquet>`_ or `pyarrow
+        <https://arrow.apache.org/docs/python/>`_ library.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={{"col1": [1, 2], "col2": [3, 4]}})
+        >>> df.to_parquet("df.parquet.gzip", compression="gzip")  # doctest: +SKIP
+        >>> pd.read_parquet("df.parquet.gzip")  # doctest: +SKIP
+           col1  col2
+        0     1     3
+        1     2     4
+
+        If you want to get a buffer to the parquet content you can use a io.BytesIO
+        object, as long as you don't use partition_cols, which creates multiple files.
+
+        >>> import io
+        >>> f = io.BytesIO()
+        >>> df.to_parquet(f)
+        >>> f.seek(0)
+        0
+        >>> content = f.read()
+        """
+        from pandas.io.parquet import to_parquet
+
+        return to_parquet(
+            self,
+            path,
+            engine,
+            compression=compression,
+            index=index,
+            partition_cols=partition_cols,
+            storage_options=storage_options,
+            **kwargs,
+        )
+
+    @overload
+    def to_orc(
+        self,
+        path: None = ...,
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> bytes: ...
+
+    @overload
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes],
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> None: ...
+
+    @overload
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None,
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> bytes | None: ...
+
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None = None,
+        *,
+        engine: Literal["pyarrow"] = "pyarrow",
+        index: bool | None = None,
+        engine_kwargs: dict[str, Any] | None = None,
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the Optimized Row Columnar (ORC) format.
+
+        .. versionadded:: 1.5.0
+
+        Parameters
+        ----------
+        path : str, file-like object or None, default None
+            If a string, it will be used as Root Directory path
+            when writing a partitioned dataset.
+            By file-like object, we refer to objects with a write() method,
+            such as a file handle (e.g. via builtin open function).
+            If path is None, a bytes object is returned.
+        engine : {'pyarrow'}, default 'pyarrow'
+            ORC library to use.
+        index : bool, optional
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file.
+            If ``None``, similar to ``infer`` the dataframe's index(es)
+            will be saved. However, instead of being saved as values,
+            the RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        engine_kwargs : dict[str, Any] or None, default None
+            Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
+
+        Returns
+        -------
+        bytes if no ``path`` argument is provided else None
+            Bytes object with DataFrame data if ``path`` is not specified else None.
+
+        Raises
+        ------
+        NotImplementedError
+            Dtype of one or more columns is category, unsigned integers, interval,
+            period or sparse.
+        ValueError
+            engine is not pyarrow.
+
+        See Also
+        --------
+        read_orc : Read an ORC file.
+        DataFrame.to_parquet : Write a parquet file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        * Find more information on ORC
+          `here <https://en.wikipedia.org/wiki/Apache_ORC>`__.
+        * Before using this function you should read the :ref:`user guide about
+          ORC <io.orc>` and :ref:`install optional dependencies <install.optional_dependencies>`.
+        * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
+          library.
+        * For supported dtypes please refer to `supported ORC features in Arrow
+          <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+        * Currently timezones in datetime columns are not preserved when a
+          dataframe is converted into ORC files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> df.to_orc("df.orc")  # doctest: +SKIP
+        >>> pd.read_orc("df.orc")  # doctest: +SKIP
+           col1  col2
+        0     1     4
+        1     2     3
+
+        If you want to get a buffer to the orc content you can write it to io.BytesIO
+
+        >>> import io
+        >>> b = io.BytesIO(df.to_orc())  # doctest: +SKIP
+        >>> b.seek(0)  # doctest: +SKIP
+        0
+        >>> content = b.read()  # doctest: +SKIP
+        """
+        from pandas.io.orc import to_orc
+
+        return to_orc(
+            self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
+        )
+
+    @overload
+    def to_html(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        columns: Axes | None = ...,
+        col_space: ColspaceArgType | None = ...,
+        header: bool = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        justify: str | None = ...,
+        max_rows: int | None = ...,
+        max_cols: int | None = ...,
+        show_dimensions: bool | str = ...,
+        decimal: str = ...,
+        bold_rows: bool = ...,
+        classes: str | list | tuple | None = ...,
+        escape: bool = ...,
+        notebook: bool = ...,
+        border: int | bool | None = ...,
+        table_id: str | None = ...,
+        render_links: bool = ...,
+        encoding: str | None = ...,
+    ) -> None: ...
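An editorial sketch of the bytes-returning path of `to_orc` documented above (assumes pyarrow is installed; names are illustrative):

import io

import pandas as pd

df = pd.DataFrame({"col1": [1, 2], "col2": [4, 3]})
# With path=None, to_orc returns the ORC payload as bytes.
buf = io.BytesIO(df.to_orc())
roundtrip = pd.read_orc(buf)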
+
+    @overload
+    def to_html(
+        self,
+        buf: None = ...,
+        *,
+        columns: Axes | None = ...,
+        col_space: ColspaceArgType | None = ...,
+        header: bool = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        justify: str | None = ...,
+        max_rows: int | None = ...,
+        max_cols: int | None = ...,
+        show_dimensions: bool | str = ...,
+        decimal: str = ...,
+        bold_rows: bool = ...,
+        classes: str | list | tuple | None = ...,
+        escape: bool = ...,
+        notebook: bool = ...,
+        border: int | bool | None = ...,
+        table_id: str | None = ...,
+        render_links: bool = ...,
+        encoding: str | None = ...,
+    ) -> str: ...
+
+    @Substitution(
+        header_type="bool",
+        header="Whether to print column labels, default True",
+        col_space_type="str or int, list or dict of int or str",
+        col_space="The minimum width of each column in CSS length "
+        "units. An int is assumed to be px units.",
+    )
+    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+    def to_html(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        *,
+        columns: Axes | None = None,
+        col_space: ColspaceArgType | None = None,
+        header: bool = True,
+        index: bool = True,
+        na_rep: str = "NaN",
+        formatters: FormattersType | None = None,
+        float_format: FloatFormatType | None = None,
+        sparsify: bool | None = None,
+        index_names: bool = True,
+        justify: str | None = None,
+        max_rows: int | None = None,
+        max_cols: int | None = None,
+        show_dimensions: bool | str = False,
+        decimal: str = ".",
+        bold_rows: bool = True,
+        classes: str | list | tuple | None = None,
+        escape: bool = True,
+        notebook: bool = False,
+        border: int | bool | None = None,
+        table_id: str | None = None,
+        render_links: bool = False,
+        encoding: str | None = None,
+    ) -> str | None:
+        """
+        Render a DataFrame as an HTML table.
+        %(shared_params)s
+        bold_rows : bool, default True
+            Make the row labels bold in the output.
+        classes : str or list or tuple, default None
+            CSS class(es) to apply to the resulting html table.
+        escape : bool, default True
+            Convert the characters <, >, and & to HTML-safe sequences.
+        notebook : {True, False}, default False
+            Whether the generated HTML is for IPython Notebook.
+        border : int
+            A ``border=border`` attribute is included in the opening
+            ``<table>`` tag. Default ``pd.options.display.html.border``.
+        table_id : str, optional
+            A css id is included in the opening ``<table>`` tag if specified.
+        render_links : bool, default False
+            Convert URLs to HTML links.
+        encoding : str, default "utf-8"
+            Set character encoding.
+        %(returns)s
+        See Also
+        --------
+        to_string : Convert DataFrame to a string.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> html_string = '''<table border="1" class="dataframe">
+        ...   <thead>
+        ...     <tr style="text-align: right;">
+        ...       <th></th>
+        ...       <th>col1</th>
+        ...       <th>col2</th>
+        ...     </tr>
+        ...   </thead>
+        ...   <tbody>
+        ...     <tr>
+        ...       <th>0</th>
+        ...       <td>1</td>
+        ...       <td>4</td>
+        ...     </tr>
+        ...     <tr>
+        ...       <th>1</th>
+        ...       <td>2</td>
+        ...       <td>3</td>
+        ...     </tr>
+        ...   </tbody>
+        ... </table>'''
+        >>> assert html_string == df.to_html()
+        """
+        if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS:
+            raise ValueError("Invalid value for justify parameter")
+
+        formatter = fmt.DataFrameFormatter(
+            self,
+            columns=columns,
+            col_space=col_space,
+            na_rep=na_rep,
+            header=header,
+            index=index,
+            formatters=formatters,
+            float_format=float_format,
+            bold_rows=bold_rows,
+            sparsify=sparsify,
+            justify=justify,
+            index_names=index_names,
+            escape=escape,
+            decimal=decimal,
+            max_rows=max_rows,
+            max_cols=max_cols,
+            show_dimensions=show_dimensions,
+        )
+        # TODO: a generic formatter would belong in DataFrameFormatter
+        return fmt.DataFrameRenderer(formatter).to_html(
+            buf=buf,
+            classes=classes,
+            notebook=notebook,
+            border=border,
+            encoding=encoding,
+            table_id=table_id,
+            render_links=render_links,
+        )
+
+    @overload
+    def to_xml(
+        self,
+        path_or_buffer: None = ...,
+        *,
+        index: bool = ...,
+        root_name: str | None = ...,
+        row_name: str | None = ...,
+        na_rep: str | None = ...,
+        attr_cols: list[str] | None = ...,
+        elem_cols: list[str] | None = ...,
+        namespaces: dict[str | None, str] | None = ...,
+        prefix: str | None = ...,
+        encoding: str = ...,
+        xml_declaration: bool | None = ...,
+        pretty_print: bool | None = ...,
+        parser: XMLParsers | None = ...,
+        stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
+        compression: CompressionOptions = ...,
+        storage_options: StorageOptions | None = ...,
+    ) -> str: ...
+
+    @overload
+    def to_xml(
+        self,
+        path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
+        *,
+        index: bool = ...,
+        root_name: str | None = ...,
+        row_name: str | None = ...,
+        na_rep: str | None = ...,
+        attr_cols: list[str] | None = ...,
+        elem_cols: list[str] | None = ...,
+        namespaces: dict[str | None, str] | None = ...,
+        prefix: str | None = ...,
+        encoding: str = ...,
+        xml_declaration: bool | None = ...,
+        pretty_print: bool | None = ...,
+        parser: XMLParsers | None = ...,
+        stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
+        compression: CompressionOptions = ...,
+        storage_options: StorageOptions | None = ...,
+    ) -> None: ...
+
+    @doc(
+        storage_options=_shared_docs["storage_options"],
+        compression_options=_shared_docs["compression_options"] % "path_or_buffer",
+    )
+    def to_xml(
+        self,
+        path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
+        *,
+        index: bool = True,
+        root_name: str | None = "data",
+        row_name: str | None = "row",
+        na_rep: str | None = None,
+        attr_cols: list[str] | None = None,
+        elem_cols: list[str] | None = None,
+        namespaces: dict[str | None, str] | None = None,
+        prefix: str | None = None,
+        encoding: str = "utf-8",
+        xml_declaration: bool | None = True,
+        pretty_print: bool | None = True,
+        parser: XMLParsers | None = "lxml",
+        stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
+        compression: CompressionOptions = "infer",
+        storage_options: StorageOptions | None = None,
+    ) -> str | None:
+        """
+        Render a DataFrame to an XML document.
+
+        .. versionadded:: 1.3.0
+
+        Parameters
+        ----------
+        path_or_buffer : str, path object, file-like object, or None, default None
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a ``write()`` function. If None, the result is returned
+            as a string.
+        index : bool, default True
+            Whether to include index in XML document.
+        root_name : str, default 'data'
+            The name of the root element in the XML document.
+        row_name : str, default 'row'
+            The name of the row element in the XML document.
+        na_rep : str, optional
+            Missing data representation.
+        attr_cols : list-like, optional
+            List of columns to write as attributes in row element.
+            Hierarchical columns will be flattened with underscore
+            delimiting the different levels.
+        elem_cols : list-like, optional
+            List of columns to write as children in row element. By default,
+            all columns output as children of row element. Hierarchical
+            columns will be flattened with underscore delimiting the
+            different levels.
+        namespaces : dict, optional
+            All namespaces to be defined in root element. Keys of dict
+            should be prefix names and values of dict corresponding URIs.
+            Default namespaces should be given empty string key. For
+            example, ::
+
+                namespaces = {{"": "https://example.com"}}
+
+        prefix : str, optional
+            Namespace prefix to be used for every element and/or attribute
+            in document. This should be one of the keys in ``namespaces``
+            dict.
+        encoding : str, default 'utf-8'
+            Encoding of the resulting document.
+        xml_declaration : bool, default True
+            Whether to include the XML declaration at start of document.
+        pretty_print : bool, default True
+            Whether output should be pretty printed with indentation and
+            line breaks.
+        parser : {{'lxml','etree'}}, default 'lxml'
+            Parser module to use for building of tree. Only 'lxml' and
+            'etree' are supported. With 'lxml', the ability to use XSLT
+            stylesheet is supported.
+        stylesheet : str, path object or file-like object, optional
+            A URL, file-like object, or a raw string containing an XSLT
+            script used to transform the raw XML output. Script should use
+            layout of elements and attributes from original output. This
+            argument requires ``lxml`` to be installed. Only XSLT 1.0
+            scripts and not later versions are currently supported.
+        {compression_options}
+
+            .. versionchanged:: 1.4.0 Zstandard support.
+
+        {storage_options}
+
+        Returns
+        -------
+        None or str
+            If ``io`` is None, returns the resulting XML format as a
+            string. Otherwise returns None.
+
+        See Also
+        --------
+        to_json : Convert the pandas object to a JSON string.
+        to_html : Convert DataFrame to HTML.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [["square", 360, 4], ["circle", 360, np.nan], ["triangle", 180, 3]],
+        ...     columns=["shape", "degrees", "sides"],
+        ... )
+
+        >>> df.to_xml()  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <data>
+          <row>
+            <index>0</index>
+            <shape>square</shape>
+            <degrees>360</degrees>
+            <sides>4.0</sides>
+          </row>
+          <row>
+            <index>1</index>
+            <shape>circle</shape>
+            <degrees>360</degrees>
+            <sides/>
+          </row>
+          <row>
+            <index>2</index>
+            <shape>triangle</shape>
+            <degrees>180</degrees>
+            <sides>3.0</sides>
+          </row>
+        </data>
+
+        >>> df.to_xml(
+        ...     attr_cols=["index", "shape", "degrees", "sides"]
+        ... )  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <data>
+          <row index="0" shape="square" degrees="360" sides="4.0"/>
+          <row index="1" shape="circle" degrees="360"/>
+          <row index="2" shape="triangle" degrees="180" sides="3.0"/>
+        </data>
+
+        >>> df.to_xml(
+        ...     namespaces={{"doc": "https://example.com"}}, prefix="doc"
+        ... )  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <doc:data xmlns:doc="https://example.com">
+          <doc:row>
+            <doc:index>0</doc:index>
+            <doc:shape>square</doc:shape>
+            <doc:degrees>360</doc:degrees>
+            <doc:sides>4.0</doc:sides>
+          </doc:row>
+          <doc:row>
+            <doc:index>1</doc:index>
+            <doc:shape>circle</doc:shape>
+            <doc:degrees>360</doc:degrees>
+            <doc:sides/>
+          </doc:row>
+          <doc:row>
+            <doc:index>2</doc:index>
+            <doc:shape>triangle</doc:shape>
+            <doc:degrees>180</doc:degrees>
+            <doc:sides>3.0</doc:sides>
+          </doc:row>
+        </doc:data>
+        """
+
+        from pandas.io.formats.xml import (
+            EtreeXMLFormatter,
+            LxmlXMLFormatter,
+        )
+
+        lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
+        TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter]
+
+        if parser == "lxml":
+            if lxml is not None:
+                TreeBuilder = LxmlXMLFormatter
+            else:
+                raise ImportError(
+                    "lxml not found, please install or use the etree parser."
+                )
+
+        elif parser == "etree":
+            TreeBuilder = EtreeXMLFormatter
+
+        else:
+            raise ValueError("Values for parser can only be lxml or etree.")
+
+        xml_formatter = TreeBuilder(
+            self,
+            path_or_buffer=path_or_buffer,
+            index=index,
+            root_name=root_name,
+            row_name=row_name,
+            na_rep=na_rep,
+            attr_cols=attr_cols,
+            elem_cols=elem_cols,
+            namespaces=namespaces,
+            prefix=prefix,
+            encoding=encoding,
+            xml_declaration=xml_declaration,
+            pretty_print=pretty_print,
+            stylesheet=stylesheet,
+            compression=compression,
+            storage_options=storage_options,
+        )
+
+        return xml_formatter.write_output()
+
+    # ----------------------------------------------------------------------
+    @doc(INFO_DOCSTRING, **frame_sub_kwargs)
+    def info(
+        self,
+        verbose: bool | None = None,
+        buf: WriteBuffer[str] | None = None,
+        max_cols: int | None = None,
+        memory_usage: bool | str | None = None,
+        show_counts: bool | None = None,
+    ) -> None:
+        info = DataFrameInfo(
+            data=self,
+            memory_usage=memory_usage,
+        )
+        info.render(
+            buf=buf,
+            max_cols=max_cols,
+            verbose=verbose,
+            show_counts=show_counts,
+        )
+
+    def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
+        """
+        Return the memory usage of each column in bytes.
+
+        The memory usage can optionally include the contribution of
+        the index and elements of `object` dtype.
+
+        This value is displayed in `DataFrame.info` by default. This can be
+        suppressed by setting ``pandas.options.display.memory_usage`` to False.
+
+        Parameters
+        ----------
+        index : bool, default True
+            Specifies whether to include the memory usage of the DataFrame's
+            index in returned Series. If ``index=True``, the memory usage of
+            the index is the first item in the output.
+        deep : bool, default False
+            If True, introspect the data deeply by interrogating
+            `object` dtypes for system-level memory consumption, and include
+            it in the returned values.
+
+        Returns
+        -------
+        Series
+            A Series whose index is the original column names and whose values
+            are the memory usage of each column in bytes.
+
+        See Also
+        --------
+        numpy.ndarray.nbytes : Total bytes consumed by the elements of an
+            ndarray.
+        Series.memory_usage : Bytes consumed by a Series.
+        Categorical : Memory-efficient array for string values with
+            many repeated values.
+        DataFrame.info : Concise summary of a DataFrame.
+
+        Notes
+        -----
+        See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
+        details.
+
+        Examples
+        --------
+        >>> dtypes = ["int64", "float64", "complex128", "object", "bool"]
+        >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) for t in dtypes])
+        >>> df = pd.DataFrame(data)
+        >>> df.head()
+           int64  float64  complex128 object  bool
+        0      1      1.0    1.0+0.0j      1  True
+        1      1      1.0    1.0+0.0j      1  True
+        2      1      1.0    1.0+0.0j      1  True
+        3      1      1.0    1.0+0.0j      1  True
+        4      1      1.0    1.0+0.0j      1  True
+
+        >>> df.memory_usage()
+        Index           128
+        int64         40000
+        float64       40000
+        complex128    80000
+        object        40000
+        bool           5000
+        dtype: int64
+
+        >>> df.memory_usage(index=False)
+        int64         40000
+        float64       40000
+        complex128    80000
+        object        40000
+        bool           5000
+        dtype: int64
+
+        The memory footprint of `object` dtype columns is ignored by default:
+
+        >>> df.memory_usage(deep=True)
+        Index            128
+        int64          40000
+        float64        40000
+        complex128     80000
+        object        180000
+        bool            5000
+        dtype: int64
+
+        Use a Categorical for efficient storage of an object-dtype column with
+        many repeated values.
+
+        >>> df["object"].astype("category").memory_usage(deep=True)
+        5136
+        """
+        result = self._constructor_sliced(
+            [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
+            index=self.columns,
+            dtype=np.intp,
+        )
+        if index:
+            index_memory_usage = self._constructor_sliced(
+                self.index.memory_usage(deep=deep), index=["Index"]
+            )
+            result = index_memory_usage._append(result)
+        return result
+
+    def transpose(self, *args, copy: bool = False) -> DataFrame:
+        """
+        Transpose index and columns.
+
+        Reflect the DataFrame over its main diagonal by writing rows as columns
+        and vice-versa. The property :attr:`.T` is an accessor to the method
+        :meth:`transpose`.
+
+        Parameters
+        ----------
+        *args : tuple, optional
+            Accepted for compatibility with NumPy.
+        copy : bool, default False
+            Whether to copy the data after transposing, even for DataFrames
+            with a single dtype.
+
+            Note that a copy is always required for mixed dtype DataFrames,
+            or for DataFrames with any extension types.
+
+            .. note::
+                The `copy` keyword will change behavior in pandas 3.0.
+                `Copy-on-Write
+                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
+                will be enabled by default, which means that all methods with a
+                `copy` keyword will use a lazy copy mechanism to defer the copy and
+                ignore the `copy` keyword. The `copy` keyword will be removed in a
+                future version of pandas.
+
+                You can already get the future behavior and improvements through
+                enabling copy on write ``pd.options.mode.copy_on_write = True``
+
+        Returns
+        -------
+        DataFrame
+            The transposed DataFrame.
+
+        See Also
+        --------
+        numpy.transpose : Permute the dimensions of a given array.
+
+        Notes
+        -----
+        Transposing a DataFrame with mixed dtypes will result in a homogeneous
+        DataFrame with the `object` dtype. In such a case, a copy of the data
+        is always made.
+
+        Examples
+        --------
+        **Square DataFrame with homogeneous dtype**
+
+        >>> d1 = {"col1": [1, 2], "col2": [3, 4]}
+        >>> df1 = pd.DataFrame(data=d1)
+        >>> df1
+           col1  col2
+        0     1     3
+        1     2     4
+
+        >>> df1_transposed = df1.T  # or df1.transpose()
+        >>> df1_transposed
+              0  1
+        col1  1  2
+        col2  3  4
+
+        When the dtype is homogeneous in the original DataFrame, we get a
+        transposed DataFrame with the same dtype:
+
+        >>> df1.dtypes
+        col1    int64
+        col2    int64
+        dtype: object
+        >>> df1_transposed.dtypes
+        0    int64
+        1    int64
+        dtype: object
+
+        **Non-square DataFrame with mixed dtypes**
+
+        >>> d2 = {
+        ...     "name": ["Alice", "Bob"],
+        ...     "score": [9.5, 8],
+        ...     "employed": [False, True],
+        ...     "kids": [0, 0],
+        ... }
+        >>> df2 = pd.DataFrame(data=d2)
+        >>> df2
+            name  score  employed  kids
+        0  Alice    9.5     False     0
+        1    Bob    8.0      True     0
+
+        >>> df2_transposed = df2.T  # or df2.transpose()
+        >>> df2_transposed
+                      0     1
+        name      Alice   Bob
+        score       9.5   8.0
+        employed  False  True
+        kids          0     0
+
+        When the DataFrame has mixed dtypes, we get a transposed DataFrame with
+        the `object` dtype:
+
+        >>> df2.dtypes
+        name         object
+        score       float64
+        employed       bool
+        kids          int64
+        dtype: object
+        >>> df2_transposed.dtypes
+        0    object
+        1    object
+        dtype: object
+        """
+        nv.validate_transpose(args, {})
+        # construct the args
+
+        dtypes = list(self.dtypes)
+
+        if self._can_fast_transpose:
+            # Note: tests pass without this, but this improves perf quite a bit.
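An editorial aside on the fast path that follows: a single-dtype frame is backed by one 2-D block, so transposing flips that block without per-element conversion, while mixed dtypes fall back to an object-dtype copy. A minimal sketch (the frames are invented for illustration):

import pandas as pd

homogeneous = pd.DataFrame({"a": [1, 2], "b": [3, 4]})  # one int64 block
fast = homogeneous.T                                    # dtype preserved
mixed = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
slow = mixed.T                                          # coerced to object dtype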
+ new_vals = self._values.T + + result = self._constructor( + new_vals, + index=self.columns, + columns=self.index, + copy=False, + dtype=new_vals.dtype, + ) + if len(self) > 0: + result._mgr.add_references(self._mgr) + + elif ( + self._is_homogeneous_type + and dtypes + and isinstance(dtypes[0], ExtensionDtype) + ): + new_values: list + if isinstance(dtypes[0], BaseMaskedDtype): + # We have masked arrays with the same dtype. We can transpose faster. + from pandas.core.arrays.masked import ( + transpose_homogeneous_masked_arrays, + ) + + new_values = transpose_homogeneous_masked_arrays( + cast(Sequence[BaseMaskedArray], self._iter_column_arrays()) + ) + elif isinstance(dtypes[0], ArrowDtype): + # We have arrow EAs with the same dtype. We can transpose faster. + from pandas.core.arrays.arrow.array import ( + ArrowExtensionArray, + transpose_homogeneous_pyarrow, + ) + + new_values = transpose_homogeneous_pyarrow( + cast(Sequence[ArrowExtensionArray], self._iter_column_arrays()) + ) + else: + # We have other EAs with the same dtype. We preserve dtype in transpose. + dtyp = dtypes[0] + arr_typ = dtyp.construct_array_type() + values = self.values + new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] + + result = type(self)._from_arrays( + new_values, + index=self.columns, + columns=self.index, + verify_integrity=False, + ) + + else: + new_arr = self.values.T + result = self._constructor( + new_arr, + index=self.columns, + columns=self.index, + dtype=new_arr.dtype, + # We already made a copy (more than one block) + copy=False, + ) + + return result.__finalize__(self, method="transpose") + + @property + def T(self) -> DataFrame: + """ + The transpose of the DataFrame. + + Returns + ------- + DataFrame + The transposed DataFrame. + + See Also + -------- + DataFrame.transpose : Transpose index and columns. + + Examples + -------- + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + >>> df.T + 0 1 + col1 1 2 + col2 3 4 + """ + return self.transpose() + + # ---------------------------------------------------------------------- + # Indexing Methods + + def _ixs(self, i: int, axis: AxisInt = 0) -> Series: + """ + Parameters + ---------- + i : int + axis : int + + Returns + ------- + Series + """ + # irow + if axis == 0: + new_mgr = self._mgr.fast_xs(i) + + result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes) + result._name = self.index[i] + return result.__finalize__(self) + + # icol + else: + col_mgr = self._mgr.iget(i) + return self._box_col_values(col_mgr, i) + + def _get_column_array(self, i: int) -> ArrayLike: + """ + Get the values of the i'th column (ndarray or ExtensionArray, as stored + in the Block) + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). + """ + return self._mgr.iget_values(i) + + def _iter_column_arrays(self) -> Iterator[ArrayLike]: + """ + Iterate over the arrays of all columns in order. + This returns the values as stored in the Block (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). + """ + for i in range(len(self.columns)): + yield self._get_column_array(i) + + def __getitem__(self, key): + check_dict_or_set_indexers(key) + key = lib.item_from_zerodim(key) + key = com.apply_if_callable(key, self) + + if is_hashable(key) and not is_iterator(key): + # is_iterator to exclude generator e.g. 
test_getitem_listlike + # shortcut if the key is in columns + is_mi = isinstance(self.columns, MultiIndex) + # GH#45316 Return view if key is not duplicated + # Only use drop_duplicates with duplicates for performance + if not is_mi and ( + self.columns.is_unique + and key in self.columns + or key in self.columns.drop_duplicates(keep=False) + ): + return self._get_item(key) + + elif is_mi and self.columns.is_unique and key in self.columns: + return self._getitem_multilevel(key) + + # Do we have a slicer (on rows)? + if isinstance(key, slice): + return self._getitem_slice(key) + + # Do we have a (boolean) DataFrame? + if isinstance(key, DataFrame): + return self.where(key) + + # Do we have a (boolean) 1d indexer? + if com.is_bool_indexer(key): + return self._getitem_bool_array(key) + + # We are left with two options: a single key, and a collection of keys, + # We interpret tuples as collections only for non-MultiIndex + is_single_key = isinstance(key, tuple) or not is_list_like(key) + + if is_single_key: + if self.columns.nlevels > 1: + return self._getitem_multilevel(key) + indexer = self.columns.get_loc(key) + if is_integer(indexer): + indexer = [indexer] + else: + if is_iterator(key): + key = list(key) + indexer = self.columns._get_indexer_strict(key, "columns")[1] + + # take() does not accept boolean indexers + if getattr(indexer, "dtype", None) == bool: + indexer = np.where(indexer)[0] + + if isinstance(indexer, slice): + return self._slice(indexer, axis=1) + + data = self.take(indexer, axis=1) + + if is_single_key: + # What does looking for a single key in a non-unique index return? + # The behavior is inconsistent. It returns a Series, except when + # - the key itself is repeated (test on data.shape, #9519), or + # - we have a MultiIndex on columns (test on self.columns, #21309) + if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): + # GH#26490 using data[key] can cause RecursionError + return data._get_item(key) + + return data + + def _getitem_bool_array(self, key): + # also raises Exception if object array with NA values + # warning here just in case -- previously __setitem__ was + # reindexing but __getitem__ was not; it seems more reasonable to + # go with the __setitem__ behavior since that is more consistent + # with all other indexing behavior + if isinstance(key, Series) and not key.index.equals(self.index): + warnings.warn( + "Boolean Series key will be reindexed to match DataFrame index.", + UserWarning, + stacklevel=find_stack_level(), + ) + elif len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}." + ) + + # check_bool_indexer will throw exception if Series key cannot + # be reindexed to match DataFrame rows + key = check_bool_indexer(self.index, key) + + if key.all(): + return self.copy(deep=False) + + indexer = key.nonzero()[0] + return self.take(indexer, axis=0) + + def _getitem_multilevel(self, key): + # self.columns is a MultiIndex + loc = self.columns.get_loc(key) + if isinstance(loc, (slice, np.ndarray)): + new_columns = self.columns[loc] + result_columns = maybe_droplevels(new_columns, key) + result = self.iloc[:, loc] + result.columns = result_columns + + # If there is only one column being returned, and its name is + # either an empty string, or a tuple with an empty string as its + # first element, then treat the empty string as a placeholder + # and return the column as if the user had provided that empty + # string in the key. 
If the result is a Series, exclude the + # implied empty string from its name. + if len(result.columns) == 1: + # e.g. test_frame_getitem_multicolumn_empty_level, + # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice + top = result.columns[0] + if isinstance(top, tuple): + top = top[0] + if top == "": + result = result[""] + if isinstance(result, Series): + result = self._constructor_sliced( + result, index=self.index, name=key + ) + + return result + else: + # loc is neither a slice nor ndarray, so must be an int + return self._ixs(loc, axis=1) + + def _get_value(self, index, col, takeable: bool = False) -> Scalar: + """ + Quickly retrieve single value at passed column and index. + + Parameters + ---------- + index : row label + col : column label + takeable : interpret the index/col as indexers, default False + + Returns + ------- + scalar + + Notes + ----- + Assumes that both `self.index._index_as_unique` and + `self.columns._index_as_unique`; Caller is responsible for checking. + """ + if takeable: + series = self._ixs(col, axis=1) + return series._values[index] + + series = self._get_item(col) + engine = self.index._engine + + if not isinstance(self.index, MultiIndex): + # CategoricalIndex: Trying to use the engine fastpath may give incorrect + # results if our categories are integers that dont match our codes + # IntervalIndex: IntervalTree has no get_loc + row = self.index.get_loc(index) + return series._values[row] + + # For MultiIndex going through engine effectively restricts us to + # same-length tuples; see test_get_set_value_no_partial_indexing + loc = engine.get_loc(index) + return series._values[loc] + + def isetitem(self, loc, value) -> None: + """ + Set the given value in the column with position `loc`. + + This is a positional analogue to ``__setitem__``. + + Parameters + ---------- + loc : int or sequence of ints + Index position for the column. + value : scalar or arraylike + Value(s) for the column. + + See Also + -------- + DataFrame.iloc : Purely integer-location based indexing for selection by + position. + + Notes + ----- + ``frame.isetitem(loc, value)`` is an in-place method as it will + modify the DataFrame in place (not returning a new object). In contrast to + ``frame.iloc[:, i] = value`` which will try to update the existing values in + place, ``frame.isetitem(loc, value)`` will not update the values of the column + itself in place, it will instead insert a new array. + + In cases where ``frame.columns`` is unique, this is equivalent to + ``frame[frame.columns[i]] = value``. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> df.isetitem(1, [5, 6]) + >>> df + A B + 0 1 5 + 1 2 6 + """ + if isinstance(value, DataFrame): + if is_integer(loc): + loc = [loc] + + if len(loc) != len(value.columns): + raise ValueError( + f"Got {len(loc)} positions but value has {len(value.columns)} " + f"columns." 
+ ) + + for i, idx in enumerate(loc): + arraylike, refs = self._sanitize_column(value.iloc[:, i]) + self._iset_item_mgr(idx, arraylike, inplace=False, refs=refs) + return + + arraylike, refs = self._sanitize_column(value) + self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs) + + def __setitem__(self, key, value) -> None: + if not PYPY: + if sys.getrefcount(self) <= 3: + warnings.warn( + _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 + ) + + key = com.apply_if_callable(key, self) + + # see if we can slice the rows + if isinstance(key, slice): + slc = self.index._convert_slice_indexer(key, kind="getitem") + return self._setitem_slice(slc, value) + + if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: + self._setitem_frame(key, value) + elif isinstance(key, (Series, np.ndarray, list, Index)): + self._setitem_array(key, value) + elif isinstance(value, DataFrame): + self._set_item_frame_value(key, value) + elif ( + is_list_like(value) + and not self.columns.is_unique + and 1 < len(self.columns.get_indexer_for([key])) == len(value) + ): + # Column to set is duplicated + self._setitem_array([key], value) + else: + # set column + self._set_item(key, value) + + def _setitem_slice(self, key: slice, value) -> None: + # NB: we can't just use self.loc[key] = value because that + # operates on labels and we need to operate positional for + # backwards-compat, xref GH#31469 + self.iloc[key] = value + + def _setitem_array(self, key, value) -> None: + # also raises Exception if object array with NA values + if com.is_bool_indexer(key): + # bool indexer is indexing along rows + if len(key) != len(self.index): + raise ValueError( + f"Item wrong length {len(key)} instead of {len(self.index)}!" + ) + key = check_bool_indexer(self.index, key) + indexer = key.nonzero()[0] + if isinstance(value, DataFrame): + # GH#39931 reindex since iloc does not align + value = value.reindex(self.index.take(indexer)) + self.iloc[indexer] = value + + else: + # Note: unlike self.iloc[:, indexer] = value, this will + # never try to overwrite values inplace + + if isinstance(value, DataFrame): + check_key_length(self.columns, key, value) + for k1, k2 in zip(key, value.columns): + self[k1] = value[k2] + + elif not is_list_like(value): + for col in key: + self[col] = value + + elif isinstance(value, np.ndarray) and value.ndim == 2: + self._iset_not_inplace(key, value) + + elif np.ndim(value) > 1: + # list of lists + value = DataFrame(value).values + self._setitem_array(key, value) + + else: + self._iset_not_inplace(key, value) + + def _iset_not_inplace(self, key, value) -> None: + # GH#39510 when setting with df[key] = obj with a list-like key and + # list-like value, we iterate over those listlikes and set columns + # one at a time. This is different from dispatching to + # `self.loc[:, key]= value` because loc.__setitem__ may overwrite + # data inplace, whereas this will insert new arrays. 
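+ # Illustrative example (sketch): with df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}),
+ # df[["a", "b"]] = np.zeros((2, 2)) dispatches here via _setitem_array and
+ # inserts two new arrays rather than writing into the existing ones.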
+ + def igetitem(obj, i: int): + # Note: we catch DataFrame obj before getting here, but + # hypothetically would return obj.iloc[:, i] + if isinstance(obj, np.ndarray): + return obj[..., i] + else: + return obj[i] + + if self.columns.is_unique: + if np.shape(value)[-1] != len(key): + raise ValueError("Columns must be same length as key") + + for i, col in enumerate(key): + self[col] = igetitem(value, i) + + else: + ilocs = self.columns.get_indexer_non_unique(key)[0] + if (ilocs < 0).any(): + # key entries not in self.columns + raise NotImplementedError + + if np.shape(value)[-1] != len(ilocs): + raise ValueError("Columns must be same length as key") + + assert np.ndim(value) <= 2 + + orig_columns = self.columns + + # Using self.iloc[:, i] = ... may set values inplace, which + # by convention we do not do in __setitem__ + try: + self.columns = Index(range(len(self.columns))) + for i, iloc in enumerate(ilocs): + self[iloc] = igetitem(value, i) + finally: + self.columns = orig_columns + + def _setitem_frame(self, key, value) -> None: + # support boolean setting with DataFrame input, e.g. + # df[df > df2] = 0 + if isinstance(key, np.ndarray): + if key.shape != self.shape: + raise ValueError("Array conditional must be same shape as self") + key = self._constructor(key, **self._construct_axes_dict(), copy=False) + + if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): + raise TypeError( + "Must pass DataFrame or 2-d ndarray with boolean values only" + ) + + self._where(-key, value, inplace=True) + + def _set_item_frame_value(self, key, value: DataFrame) -> None: + self._ensure_valid_index(value) + + # align columns + if key in self.columns: + loc = self.columns.get_loc(key) + cols = self.columns[loc] + len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols) + if len_cols != len(value.columns): + raise ValueError("Columns must be same length as key") + + # align right-hand-side columns if self.columns + # is multi-index and self[key] is a sub-frame + if isinstance(self.columns, MultiIndex) and isinstance( + loc, (slice, Series, np.ndarray, Index) + ): + cols_droplevel = maybe_droplevels(cols, key) + if len(cols_droplevel) and not cols_droplevel.equals(value.columns): + value = value.reindex(cols_droplevel, axis=1) + + for col, col_droplevel in zip(cols, cols_droplevel): + self[col] = value[col_droplevel] + return + + if is_scalar(cols): + self[cols] = value[value.columns[0]] + return + + locs: np.ndarray | list + if isinstance(loc, slice): + locs = np.arange(loc.start, loc.stop, loc.step) + elif is_scalar(loc): + locs = [loc] + else: + locs = loc.nonzero()[0] + + return self.isetitem(locs, value) + + if len(value.columns) > 1: + raise ValueError( + "Cannot set a DataFrame with multiple columns to the single " + f"column {key}" + ) + elif len(value.columns) == 0: + raise ValueError( + f"Cannot set a DataFrame without columns to the column {key}" + ) + + self[key] = value[value.columns[0]] + + def _iset_item_mgr( + self, + loc: int | slice | np.ndarray, + value, + inplace: bool = False, + refs: BlockValuesRefs | None = None, + ) -> None: + # when called from _set_item_mgr loc can be anything returned from get_loc + self._mgr.iset(loc, value, inplace=inplace, refs=refs) + + def _set_item_mgr( + self, key, value: ArrayLike, refs: BlockValuesRefs | None = None + ) -> None: + try: + loc = self._info_axis.get_loc(key) + except KeyError: + # This item wasn't present, just insert at end + self._mgr.insert(len(self._info_axis), key, value, refs) + else: + 
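+ # the label already exists: overwrite at the location(s) returned by
+ # get_loc (an int, or a slice/ndarray when the label is duplicated)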
self._iset_item_mgr(loc, value, refs=refs) + + def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None: + # We are only called from _replace_columnwise which guarantees that + # no reindex is necessary + self._iset_item_mgr(loc, value._values, inplace=inplace, refs=value._references) + + def _set_item(self, key, value) -> None: + """ + Add series to DataFrame in specified column. + + If series is a numpy-array (not a Series/TimeSeries), it must be the + same length as the DataFrames index or an error will be thrown. + + Series/TimeSeries will be conformed to the DataFrames index to + ensure homogeneity. + """ + value, refs = self._sanitize_column(value) + + if ( + key in self.columns + and value.ndim == 1 + and not isinstance(value.dtype, ExtensionDtype) + ): + # broadcast across multiple columns if necessary + if not self.columns.is_unique or isinstance(self.columns, MultiIndex): + existing_piece = self[key] + if isinstance(existing_piece, DataFrame): + value = np.tile(value, (len(existing_piece.columns), 1)).T + refs = None + + self._set_item_mgr(key, value, refs) + + def _set_value( + self, index: IndexLabel, col, value: Scalar, takeable: bool = False + ) -> None: + """ + Put single value at passed column and index. + + Parameters + ---------- + index : Label + row label + col : Label + column label + value : scalar + takeable : bool, default False + Sets whether or not index/col interpreted as indexers + """ + try: + if takeable: + icol = col + iindex = cast(int, index) + else: + icol = self.columns.get_loc(col) + iindex = self.index.get_loc(index) + self._mgr.column_setitem(icol, iindex, value, inplace_only=True) + + except (KeyError, TypeError, ValueError, LossySetitemError): + # get_loc might raise a KeyError for missing labels (falling back + # to (i)loc will do expansion of the index) + # column_setitem will do validation that may raise TypeError, + # ValueError, or LossySetitemError + # set using a non-recursive method & reset the cache + if takeable: + self.iloc[index, col] = value + else: + self.loc[index, col] = value + + except InvalidIndexError as ii_err: + # GH48729: Seems like you are trying to assign a value to a + # row when only scalar options are permitted + raise InvalidIndexError( + f"You can only assign a scalar value not a {type(value)}" + ) from ii_err + + def _ensure_valid_index(self, value) -> None: + """ + Ensure that if we don't have an index, that we can create one from the + passed value. + """ + # GH5632, make sure that we are a Series convertible + if not len(self.index) and is_list_like(value) and len(value): + if not isinstance(value, DataFrame): + try: + value = Series(value) + except (ValueError, NotImplementedError, TypeError) as err: + raise ValueError( + "Cannot set a frame with no defined index " + "and a value that cannot be converted to a Series" + ) from err + + # GH31368 preserve name of index + index_copy = value.index.copy() + if self.index.name is not None: + index_copy.name = self.index.name + + self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) + + def _box_col_values(self, values: SingleBlockManager, loc: int) -> Series: + """ + Provide boxed values for a column. + """ + # Lookup in columns so that if e.g. a str datetime was passed + # we attach the Timestamp object as the name. 
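+ # values is a SingleBlockManager that already carries the row index, so
+ # the Series built below shares data with the parent frame; __finalize__
+ # then propagates metadata from the parent.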
+ name = self.columns[loc] + # We get index=self.index bc values is a SingleBlockManager + obj = self._constructor_sliced_from_mgr(values, axes=values.axes) + obj._name = name + return obj.__finalize__(self) + + def _get_item(self, item: Hashable) -> Series: + loc = self.columns.get_loc(item) + return self._ixs(loc, axis=1) + +""" +DataFrame +--------- +An efficient 2D container for potentially mixed-type time series or other +labeled data series. + +Similar to its R counterpart, data.frame, except providing automatic data +alignment and a host of useful data manipulation methods having to do with the +labeling information +""" + +import collections +from collections import abc +from collections.abc import ( + Hashable, + Iterable, + Iterator, + Mapping, + Sequence, +) +import functools +from inspect import signature +from io import StringIO +import itertools +import operator +import sys +from textwrap import dedent +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Literal, + cast, + overload, +) +import warnings + +import numpy as np +from numpy import ma + +from pandas._config import get_option + +from pandas._libs import ( + algos as libalgos, + lib, + properties, +) +from pandas._libs.hashtable import duplicated +from pandas._libs.lib import is_range_indexer +from pandas.compat import PYPY +from pandas.compat._constants import REF_COUNT +from pandas.compat._optional import import_optional_dependency +from pandas.compat.numpy import function as nv +from pandas.errors import ( + ChainedAssignmentError, + InvalidIndexError, +) +from pandas.errors.cow import ( + _chained_assignment_method_msg, + _chained_assignment_msg, +) +from pandas.util._decorators import ( + Appender, + Substitution, + doc, + set_module, +) +from pandas.util._exceptions import ( + find_stack_level, + rewrite_warning, +) +from pandas.util._validators import ( + validate_ascending, + validate_bool_kwarg, + validate_percentile, +) + +from pandas.core.dtypes.cast import ( + LossySetitemError, + can_hold_element, + construct_1d_arraylike_from_scalar, + construct_2d_arraylike_from_scalar, + find_common_type, + infer_dtype_from_scalar, + invalidate_string_dtypes, + maybe_downcast_to_dtype, +) +from pandas.core.dtypes.common import ( + infer_dtype_from_object, + is_1d_only_ea_dtype, + is_array_like, + is_bool_dtype, + is_dataclass, + is_dict_like, + is_float, + is_float_dtype, + is_hashable, + is_integer, + is_integer_dtype, + is_iterator, + is_list_like, + is_scalar, + is_sequence, + needs_i8_conversion, + pandas_dtype, +) +from pandas.core.dtypes.concat import concat_compat +from pandas.core.dtypes.dtypes import ( + ArrowDtype, + BaseMaskedDtype, + ExtensionDtype, +) +from pandas.core.dtypes.missing import ( + isna, + notna, +) + +from pandas.core import ( + algorithms, + common as com, + nanops, + ops, + roperator, +) +from pandas.core.accessor import CachedAccessor +from pandas.core.apply import reconstruct_and_relabel_result +from pandas.core.array_algos.take import take_2d_multi +from pandas.core.arraylike import OpsMixin +from pandas.core.arrays import ( + BaseMaskedArray, + DatetimeArray, + ExtensionArray, + PeriodArray, + TimedeltaArray, +) +from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + sanitize_array, + sanitize_masked_array, +) +from pandas.core.generic import ( + NDFrame, + make_doc, +) +from pandas.core.indexers import check_key_length +from pandas.core.indexes.api import ( + DatetimeIndex, + Index, + PeriodIndex, + 
default_index, + ensure_index, + ensure_index_from_sequences, +) +from pandas.core.indexes.multi import ( + MultiIndex, + maybe_droplevels, +) +from pandas.core.indexing import ( + check_bool_indexer, + check_dict_or_set_indexers, +) +from pandas.core.internals import BlockManager +from pandas.core.internals.construction import ( + arrays_to_mgr, + dataclasses_to_dicts, + dict_to_mgr, + ndarray_to_mgr, + nested_data_to_arrays, + rec_array_to_mgr, + reorder_arrays, + to_arrays, + treat_as_nested, +) +from pandas.core.methods import selectn +from pandas.core.reshape.melt import melt +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import ( + get_group_index, + lexsort_indexer, + nargsort, +) + +from pandas.io.common import get_handle +from pandas.io.formats import ( + console, + format as fmt, +) +from pandas.io.formats.info import ( + INFO_DOCSTRING, + DataFrameInfo, + frame_sub_kwargs, +) +import pandas.plotting + +if TYPE_CHECKING: + import datetime + + from pandas._libs.internals import BlockValuesRefs + from pandas._typing import ( + AggFuncType, + AnyAll, + AnyArrayLike, + ArrayLike, + Axes, + Axis, + AxisInt, + ColspaceArgType, + CompressionOptions, + CorrelationMethod, + DropKeep, + Dtype, + DtypeObj, + FilePath, + FloatFormatType, + FormattersType, + Frequency, + FromDictOrient, + HashableT, + HashableT2, + IgnoreRaise, + IndexKeyFunc, + IndexLabel, + JoinValidate, + Level, + ListLike, + MergeHow, + MergeValidate, + MutableMappingT, + NaPosition, + NsmallestNlargestKeep, + PythonFuncType, + QuantileInterpolation, + ReadBuffer, + ReindexMethod, + Renamer, + Scalar, + Self, + SequenceNotStr, + SortKind, + StorageOptions, + Suffixes, + T, + ToStataByteorder, + ToTimestampHow, + UpdateJoin, + ValueKeyFunc, + WriteBuffer, + XMLParsers, + npt, + ) + + from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg + from pandas.core.internals.managers import SingleBlockManager + + from pandas.io.formats.style import Styler + +# --------------------------------------------------------------------- +# Docstring templates + +_shared_doc_kwargs = { + "axes": "index, columns", + "klass": "DataFrame", + "axes_single_arg": "{0 or 'index', 1 or 'columns'}", + "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 + If 0 or 'index': apply function to each column. + If 1 or 'columns': apply function to each row.""", + "inplace": """ + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one.""", + "optional_by": """ +by : str or list of str + Name or list of names to sort by. + + - if `axis` is 0 or `'index'` then `by` may contain index + levels and/or column labels. + - if `axis` is 1 or `'columns'` then `by` may contain column + levels and/or index labels.""", + "optional_reindex": """ +labels : array-like, optional + New labels / index to conform the axis specified by 'axis' to. +index : array-like, optional + New labels for the index. Preferably an Index object to avoid + duplicating data. +columns : array-like, optional + New labels for the columns. Preferably an Index object to avoid + duplicating data. +axis : int or str, optional + Axis to target. Can be either the axis name ('index', 'columns') + or number (0, 1).""", +} + +_merge_doc = """ +Merge DataFrame or named Series objects with a database-style join. + +A named Series object is treated as a DataFrame with a single named column. 
+ +The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes +on indexes or indexes on a column or columns, the index will be passed on. +When performing a cross merge, no column specifications to merge on are +allowed. + +.. warning:: + + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + +Parameters +----------%s +right : DataFrame or named Series + Object to merge with. +how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' + Type of merge to be performed. + + * left: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + * right: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + * outer: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. + * inner: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + * cross: creates the cartesian product from both frames, preserves the order + of the left keys. +on : label or list + Column or index level names to join on. These must be found in both + DataFrames. If `on` is None and not merging on indexes then this defaults + to the intersection of the columns in both DataFrames. +left_on : label or list, or array-like + Column or index level names to join on in the left DataFrame. Can also + be an array or list of arrays of the length of the left DataFrame. + These arrays are treated as if they are columns. +right_on : label or list, or array-like + Column or index level names to join on in the right DataFrame. Can also + be an array or list of arrays of the length of the right DataFrame. + These arrays are treated as if they are columns. +left_index : bool, default False + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels. +right_index : bool, default False + Use the index from the right DataFrame as the join key. Same caveats as + left_index. +sort : bool, default False + Sort the join keys lexicographically in the result DataFrame. If False, + the order of the join keys depends on the join type (how keyword). +suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. +copy : bool, default True + If False, avoid copy if possible. + + .. note:: + The `copy` keyword will change behavior in pandas 3.0. + `Copy-on-Write + `__ + will be enabled by default, which means that all methods with a + `copy` keyword will use a lazy copy mechanism to defer the copy and + ignore the `copy` keyword. The `copy` keyword will be removed in a + future version of pandas. + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` +indicator : bool or str, default False + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. 
The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. + +validate : str, optional + If specified, checks if merge is of specified type. + + * "one_to_one" or "1:1": check if merge keys are unique in both + left and right datasets. + * "one_to_many" or "1:m": check if merge keys are unique in left + dataset. + * "many_to_one" or "m:1": check if merge keys are unique in right + dataset. + * "many_to_many" or "m:m": allowed, but does not result in checks. + +Returns +------- +DataFrame + A DataFrame of the two merged objects. + +See Also +-------- +merge_ordered : Merge with optional filling/interpolation. +merge_asof : Merge on nearest keys. +DataFrame.join : Similar method using indices. + +Examples +-------- +>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [1, 2, 3, 5]}) +>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], +... 'value': [5, 6, 7, 8]}) +>>> df1 + lkey value +0 foo 1 +1 bar 2 +2 baz 3 +3 foo 5 +>>> df2 + rkey value +0 foo 5 +1 bar 6 +2 baz 7 +3 foo 8 + +Merge df1 and df2 on the lkey and rkey columns. The value columns have +the default suffixes, _x and _y, appended. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey') + lkey value_x rkey value_y +0 foo 1 foo 5 +1 foo 1 foo 8 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 + +Merge DataFrames df1 and df2 with specified left and right suffixes +appended to any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', +... suffixes=('_left', '_right')) + lkey value_left rkey value_right +0 foo 1 foo 5 +1 foo 1 foo 8 +2 bar 2 bar 6 +3 baz 3 baz 7 +4 foo 5 foo 5 +5 foo 5 foo 8 + +Merge DataFrames df1 and df2, but raise an exception if the DataFrames have +any overlapping columns. + +>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) +Traceback (most recent call last): +... +ValueError: columns overlap but no suffix specified: + Index(['value'], dtype='object') + +>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) +>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) +>>> df1 + a b +0 foo 1 +1 bar 2 +>>> df2 + a c +0 foo 3 +1 baz 4 + +>>> df1.merge(df2, how='inner', on='a') + a b c +0 foo 1 3 + +>>> df1.merge(df2, how='left', on='a') + a b c +0 foo 1 3.0 +1 bar 2 NaN + +>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) +>>> df2 = pd.DataFrame({'right': [7, 8]}) +>>> df1 + left +0 foo +1 bar +>>> df2 + right +0 7 +1 8 + +>>> df1.merge(df2, how='cross') + left right +0 foo 7 +1 foo 8 +2 bar 7 +3 bar 8 +""" + + +# ----------------------------------------------------------------------- +# DataFrame class + + +@set_module("pandas") +class DataFrame(NDFrame, OpsMixin): + """ + Two-dimensional, size-mutable, potentially heterogeneous tabular data. + + Data structure also contains labeled axes (rows and columns). + Arithmetic operations align on both row and column labels. Can be + thought of as a dict-like container for Series objects. The primary + pandas data structure. + + Parameters + ---------- + data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame + Dict can contain Series, arrays, constants, dataclass or list-like objects. If + data is a dict, column order follows insertion-order. 
If a dict contains Series + which have an index defined, it is aligned by its index. This alignment also + occurs if data is a Series or a DataFrame itself. Alignment is done on + Series/DataFrame inputs. + + If data is a list of dicts, column order follows insertion-order. + + index : Index or array-like + Index to use for resulting frame. Will default to RangeIndex if + no indexing information part of input data and no index provided. + columns : Index or array-like + Column labels to use for resulting frame when data does not have them, + defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels, + will perform column selection instead. + dtype : dtype, default None + Data type to force. Only a single dtype is allowed. If None, infer. + copy : bool or None, default None + Copy data from inputs. + For dict data, the default of None behaves like ``copy=True``. For DataFrame + or 2d ndarray input, the default of None behaves like ``copy=False``. + If data is a dict containing one or more Series (possibly of different dtypes), + ``copy=False`` will ensure that these inputs are not copied. + + .. versionchanged:: 1.3.0 + + See Also + -------- + DataFrame.from_records : Constructor from tuples, also record arrays. + DataFrame.from_dict : From dicts of Series, arrays, or dicts. + read_csv : Read a comma-separated values (csv) file into DataFrame. + read_table : Read general delimited file into DataFrame. + read_clipboard : Read text from clipboard into DataFrame. + + Notes + ----- + Please reference the :ref:`User Guide ` for more information. + + Examples + -------- + Constructing DataFrame from a dictionary. + + >>> d = {"col1": [1, 2], "col2": [3, 4]} + >>> df = pd.DataFrame(data=d) + >>> df + col1 col2 + 0 1 3 + 1 2 4 + + Notice that the inferred dtype is int64. + + >>> df.dtypes + col1 int64 + col2 int64 + dtype: object + + To enforce a single dtype: + + >>> df = pd.DataFrame(data=d, dtype=np.int8) + >>> df.dtypes + col1 int8 + col2 int8 + dtype: object + + Constructing DataFrame from a dictionary including Series: + + >>> d = {"col1": [0, 1, 2, 3], "col2": pd.Series([2, 3], index=[2, 3])} + >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) + col1 col2 + 0 0 NaN + 1 1 NaN + 2 2 2.0 + 3 3 3.0 + + Constructing DataFrame from numpy ndarray: + + >>> df2 = pd.DataFrame( + ... np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), columns=["a", "b", "c"] + ... ) + >>> df2 + a b c + 0 1 2 3 + 1 4 5 6 + 2 7 8 9 + + Constructing DataFrame from a numpy ndarray that has labeled columns: + + >>> data = np.array( + ... [(1, 2, 3), (4, 5, 6), (7, 8, 9)], + ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")], + ... 
) + >>> df3 = pd.DataFrame(data, columns=["c", "a"]) + >>> df3 + c a + 0 3 1 + 1 6 4 + 2 9 7 + + Constructing DataFrame from dataclass: + + >>> from dataclasses import make_dataclass + >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) + >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) + x y + 0 0 0 + 1 0 3 + 2 2 3 + + Constructing DataFrame from Series/DataFrame: + + >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"]) + >>> df = pd.DataFrame(data=ser, index=["a", "c"]) + >>> df + 0 + a 1 + c 3 + + >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"]) + >>> df2 = pd.DataFrame(data=df1, index=["a", "c"]) + >>> df2 + x + a 1 + c 3 + """ + + _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set + _typ = "dataframe" + _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) + _accessors: set[str] = {"sparse"} + _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) + _mgr: BlockManager + + # similar to __array_priority__, positions DataFrame before Series, Index, + # and ExtensionArray. Should NOT be overridden by subclasses. + __pandas_priority__ = 4000 + + @property + def _constructor(self) -> type[DataFrame]: + return DataFrame + + def _constructor_from_mgr(self, mgr, axes) -> DataFrame: + if self._constructor is DataFrame: + # we are pandas.DataFrame (or a subclass that doesn't override _constructor) + return DataFrame._from_mgr(mgr, axes=axes) + else: + assert axes is mgr.axes + return self._constructor(mgr) + + _constructor_sliced: Callable[..., Series] = Series + + def _sliced_from_mgr(self, mgr, axes) -> Series: + return Series._from_mgr(mgr, axes) + + def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: + if self._constructor_sliced is Series: + ser = self._sliced_from_mgr(mgr, axes) + ser._name = None # caller is responsible for setting real name + return ser + assert axes is mgr.axes + return self._constructor_sliced(mgr) + + # ---------------------------------------------------------------------- + # Constructors + + def __init__( + self, + data=None, + index: Axes | None = None, + columns: Axes | None = None, + dtype: Dtype | None = None, + copy: bool | None = None, + ) -> None: + allow_mgr = False + if dtype is not None: + dtype = self._validate_dtype(dtype) + + if isinstance(data, DataFrame): + data = data._mgr + allow_mgr = True + if not copy: + # if not copying data, ensure to still return a shallow copy + # to avoid the result sharing the same Manager + data = data.copy(deep=False) + + if isinstance(data, BlockManager): + if not allow_mgr: + # GH#52419 + warnings.warn( + f"Passing a {type(data).__name__} to {type(self).__name__} " + "is deprecated and will raise in a future version. 
" + "Use public APIs instead.", + DeprecationWarning, + stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix + ) + + data = data.copy(deep=False) + # first check if a Manager is passed without any other arguments + # -> use fastpath (without checking Manager type) + if index is None and columns is None and dtype is None and not copy: + # GH#33357 fastpath + NDFrame.__init__(self, data) + return + + is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) + data_dtype = getattr(data, "dtype", None) + original_dtype = dtype + + # GH47215 + if isinstance(index, set): + raise ValueError("index cannot be a set") + if isinstance(columns, set): + raise ValueError("columns cannot be a set") + + if copy is None: + if isinstance(data, dict): + # retain pre-GH#38939 default behavior + copy = True + elif not isinstance(data, (Index, DataFrame, Series)): + copy = True + else: + copy = False + + if data is None: + index = index if index is not None else default_index(0) + columns = columns if columns is not None else default_index(0) + dtype = dtype if dtype is not None else pandas_dtype(object) + data = [] + + if isinstance(data, BlockManager): + mgr = self._init_mgr( + data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy + ) + + elif isinstance(data, dict): + # GH#38939 de facto copy defaults to False only in non-dict cases + mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy) + elif isinstance(data, ma.MaskedArray): + from numpy.ma import mrecords + + # masked recarray + if isinstance(data, mrecords.MaskedRecords): + raise TypeError( + "MaskedRecords are not supported. Pass " + "{name: data[name] for name in data.dtype.names} " + "instead" + ) + + # a masked array + data = sanitize_masked_array(data) + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + ) + + elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): + if data.dtype.names: + # i.e. numpy structured array + data = cast(np.ndarray, data) + mgr = rec_array_to_mgr( + data, + index, + columns, + dtype, + copy, + ) + elif getattr(data, "name", None) is not None: + # i.e. Series/Index with non-None name + mgr = dict_to_mgr( + # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no + # attribute "name" + {data.name: data}, # type: ignore[union-attr] + index, + columns, + dtype=dtype, + copy=copy, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + ) + + # For data is list-like, or Iterable (will consume into list) + elif is_list_like(data): + if not isinstance(data, abc.Sequence): + if hasattr(data, "__array__"): + # GH#44616 big perf improvement for e.g. 
pytorch tensor + data = np.asarray(data) + else: + data = list(data) + if len(data) > 0: + if is_dataclass(data[0]): + data = dataclasses_to_dicts(data) + if not isinstance(data, np.ndarray) and treat_as_nested(data): + # exclude ndarray as we may have cast it a few lines above + if columns is not None: + columns = ensure_index(columns) + arrays, columns, index = nested_data_to_arrays( + # error: Argument 3 to "nested_data_to_arrays" has incompatible + # type "Optional[Collection[Any]]"; expected "Optional[Index]" + data, + columns, + index, # type: ignore[arg-type] + dtype, + ) + mgr = arrays_to_mgr( + arrays, + columns, + index, + dtype=dtype, + ) + else: + mgr = ndarray_to_mgr( + data, + index, + columns, + dtype=dtype, + copy=copy, + ) + else: + mgr = dict_to_mgr( + {}, + index, + columns if columns is not None else default_index(0), + dtype=dtype, + ) + # For data is scalar + else: + if index is None or columns is None: + raise ValueError("DataFrame constructor not properly called!") + + index = ensure_index(index) + columns = ensure_index(columns) + + if not dtype: + dtype, _ = infer_dtype_from_scalar(data) + + # For data is a scalar extension dtype + if isinstance(dtype, ExtensionDtype): + # TODO(EA2D): special case not needed with 2D EAs + + values = [ + construct_1d_arraylike_from_scalar(data, len(index), dtype) + for _ in range(len(columns)) + ] + mgr = arrays_to_mgr(values, columns, index, dtype=None) + else: + arr2d = construct_2d_arraylike_from_scalar( + data, + len(index), + len(columns), + dtype, + copy, + ) + + mgr = ndarray_to_mgr( + arr2d, + index, + columns, + dtype=arr2d.dtype, + copy=False, + ) + + NDFrame.__init__(self, mgr) + + if original_dtype is None and is_pandas_object and data_dtype == np.object_: + if self.dtypes.iloc[0] != data_dtype: + warnings.warn( + "Dtype inference on a pandas object " + "(Series, Index, ExtensionArray) is deprecated. The DataFrame " + "constructor will keep the original dtype in the future. " + "Call `infer_objects` on the result to get the old " + "behavior.", + FutureWarning, + stacklevel=2, + ) + + @classmethod + def _from_data(cls, data, index=None, columns=None, dtype=None, copy=False): + """ + Internal constructor that bypasses the deprecated BlockManager warning. + + This method is intended for internal use only. + + Parameters + ---------- + data : BlockManager, array-like, Iterable, dict, or DataFrame + Data to be stored in DataFrame. + index : Index or array-like, optional + Index for the DataFrame. + columns : Index or array-like, optional + Column labels for the DataFrame. + dtype : dtype, optional + Data type for the DataFrame. + copy : bool, default False + Whether to copy the data. + + Returns + ------- + DataFrame + """ + if isinstance(data, BlockManager): + # Skip the deprecation warning for internal use + obj = cls.__new__(cls) + NDFrame.__init__(obj, data) + return obj + + # Use the regular constructor for other data types + return cls(data=data, index=index, columns=columns, dtype=dtype, copy=copy) + + @classmethod + def _create_with_data(cls, data, index=None, columns=None, dtype=None, copy=False): + """ + Create a DataFrame from data for subclasses. + + This is a recommended helper method for subclasses to create new instances + from data without triggering deprecation warnings. + + Parameters + ---------- + data : array-like, Iterable, dict, or DataFrame + Data to be stored in DataFrame. + index : Index or array-like, optional + Index for the DataFrame. 
+ columns : Index or array-like, optional + Column labels for the DataFrame. + dtype : dtype, optional + Data type for the DataFrame. + copy : bool, default False + Whether to copy the data. + + Returns + ------- + DataFrame or subclass + DataFrame of the subclass type. + + Notes + ----- + This method is primarily intended for subclass authors to create + instances of their subclass from array-like data. + """ + # Prepare the data safely avoiding internal manager passing + if isinstance(data, DataFrame): + if index is None: + index = data.index + if columns is None: + columns = data.columns + + # Create a dataframe using public APIs + df = cls(data, index=index, columns=columns, dtype=dtype, copy=copy) + return df + + # ---------------------------------------------------------------------- + + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> DataFrameXchg: + """ + Return the dataframe interchange object implementing the interchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + `nan_as_null` is DEPRECATED and has no effect. Please avoid using + it; it will be removed in a future release. + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + + Examples + -------- + >>> df_not_necessarily_pandas = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) + >>> interchange_object = df_not_necessarily_pandas.__dataframe__() + >>> interchange_object.column_names() + Index(['A', 'B'], dtype='object') + >>> df_pandas = pd.api.interchange.from_dataframe( + ... interchange_object.select_columns_by_name(["A"]) + ... ) + >>> df_pandas + A + 0 1 + 1 2 + + These methods (``column_names``, ``select_columns_by_name``) should work + for any dataframe library which implements the interchange protocol. + """ + + from pandas.core.interchange.dataframe import PandasDataFrameXchg + + return PandasDataFrameXchg(self, allow_copy=allow_copy) + + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the pandas DataFrame as an Arrow C stream PyCapsule. + + This relies on pyarrow to convert the pandas DataFrame to the Arrow + format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` + in its handling of the index, i.e. store the index as a column except + for RangeIndex). + This conversion is not necessarily zero-copy. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the dataframe should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + """ + pa = import_optional_dependency("pyarrow", min_version="14.0.0") + if requested_schema is not None: + requested_schema = pa.Schema._import_from_c_capsule(requested_schema) + table = pa.Table.from_pandas(self, schema=requested_schema) + return table.__arrow_c_stream__() + + # ---------------------------------------------------------------------- + + @property + def axes(self) -> list[Index]: + """ + Return a list representing the axes of the DataFrame. + + It has the row axis labels and column axis labels as the only members. + They are returned in that order. 
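+
+ See Also
+ --------
+ DataFrame.index : The index (row labels) of the DataFrame.
+ DataFrame.columns : The column labels of the DataFrame.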
+ + Examples + -------- + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df.axes + [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], + dtype='object')] + """ + return [self.index, self.columns] + + @property + def shape(self) -> tuple[int, int]: + """ + Return a tuple representing the dimensionality of the DataFrame. + + See Also + -------- + ndarray.shape : Tuple of array dimensions. + + Examples + -------- + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + >>> df.shape + (2, 2) + + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4], "col3": [5, 6]}) + >>> df.shape + (2, 3) + """ + return len(self.index), len(self.columns) + + @property + def _is_homogeneous_type(self) -> bool: + """ + Whether all the columns in a DataFrame have the same type. + + Returns + ------- + bool + + Examples + -------- + >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type + True + >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type + False + + Items with the same type but different sizes are considered + different types. + + >>> DataFrame( + ... { + ... "A": np.array([1, 2], dtype=np.int32), + ... "B": np.array([1, 2], dtype=np.int64), + ... } + ... )._is_homogeneous_type + False + """ + # The "<" part of "<=" here is for empty DataFrame cases + return len({arr.dtype for arr in self._mgr.arrays}) <= 1 + + @property + def _can_fast_transpose(self) -> bool: + """ + Can we transpose this DataFrame without creating any new array objects. + """ + blocks = self._mgr.blocks + if len(blocks) != 1: + return False + + dtype = blocks[0].dtype + # TODO(EA2D) special case would be unnecessary with 2D EAs + return not is_1d_only_ea_dtype(dtype) + + @property + def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: + """ + Analogue to ._values that may return a 2D ExtensionArray. + """ + mgr = self._mgr + + blocks = mgr.blocks + if len(blocks) != 1: + return ensure_wrapped_if_datetimelike(self.values) + + arr = blocks[0].values + if arr.ndim == 1: + # non-2D ExtensionArray + return self.values + + # more generally, whatever we allow in NDArrayBackedExtensionBlock + arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr) + return arr.T + + # ---------------------------------------------------------------------- + # Rendering Methods + + def _repr_fits_vertical_(self) -> bool: + """ + Check length against max_rows. + """ + max_rows = get_option("display.max_rows") + return len(self) <= max_rows + + def _repr_fits_horizontal_(self) -> bool: + """ + Check if full repr fits in horizontal boundaries imposed by the display + options width and max_columns. 
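+
+ Returns True outright when no terminal width can be detected (e.g. in a
+ non-interactive session), provided the column limit is not exceeded.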
+ """ + width, height = console.get_console_size() + max_columns = get_option("display.max_columns") + nb_columns = len(self.columns) + + # exceed max columns + if (max_columns and nb_columns > max_columns) or ( + width and nb_columns > (width // 2) + ): + return False + + # used by repr_html under IPython notebook or scripts ignore terminal + # dims + if width is None or not console.in_interactive_session(): + return True + + if get_option("display.width") is not None or console.in_ipython_frontend(): + # check at least the column row for excessive width + max_rows = 1 + else: + max_rows = get_option("display.max_rows") + + # when auto-detecting, so width=None and not in ipython front end + # check whether repr fits horizontal by actually checking + # the width of the rendered repr + buf = StringIO() + + # only care about the stuff we'll actually print out + # and to_string on entire frame may be expensive + d = self + + if max_rows is not None: # unlimited rows + # min of two, where one may be None + d = d.iloc[: min(max_rows, len(d))] + else: + return True + + d.to_string(buf=buf) + value = buf.getvalue() + repr_width = max(len(line) for line in value.split("\n")) + + return repr_width < width + + def _info_repr(self) -> bool: + """ + True if the repr should show the info view. + """ + info_repr_option = get_option("display.large_repr") == "info" + return info_repr_option and not ( + self._repr_fits_horizontal_() and self._repr_fits_vertical_() + ) + + def __repr__(self) -> str: + """ + Return a string representation for a particular DataFrame. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + return buf.getvalue() + + repr_params = fmt.get_dataframe_repr_params() + return self.to_string(**repr_params) + + def _repr_html_(self) -> str | None: + """ + Return a html representation for a particular DataFrame. + + Mainly for IPython notebook. + """ + if self._info_repr(): + buf = StringIO() + self.info(buf=buf) + # need to escape the , should be the first line. + val = buf.getvalue().replace("<", r"<", 1) + val = val.replace(">", r">", 1) + return f"
{val}
" + + if get_option("display.notebook_repr_html"): + max_rows = get_option("display.max_rows") + min_rows = get_option("display.min_rows") + max_cols = get_option("display.max_columns") + show_dimensions = get_option("display.show_dimensions") + + formatter = fmt.DataFrameFormatter( + self, + columns=None, + col_space=None, + na_rep="NaN", + formatters=None, + float_format=None, + sparsify=None, + justify=None, + index_names=True, + header=True, + index=True, + bold_rows=True, + escape=True, + max_rows=max_rows, + min_rows=min_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=".", + ) + return fmt.DataFrameRenderer(formatter).to_html(notebook=True) + else: + return None + + @overload + def to_string( + self, + buf: None = ..., + *, + columns: Axes | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> str: ... + + @overload + def to_string( + self, + buf: FilePath | WriteBuffer[str], + *, + columns: Axes | None = ..., + col_space: int | list[int] | dict[Hashable, int] | None = ..., + header: bool | SequenceNotStr[str] = ..., + index: bool = ..., + na_rep: str = ..., + formatters: fmt.FormattersType | None = ..., + float_format: fmt.FloatFormatType | None = ..., + sparsify: bool | None = ..., + index_names: bool = ..., + justify: str | None = ..., + max_rows: int | None = ..., + max_cols: int | None = ..., + show_dimensions: bool = ..., + decimal: str = ..., + line_width: int | None = ..., + min_rows: int | None = ..., + max_colwidth: int | None = ..., + encoding: str | None = ..., + ) -> None: ... + + @Substitution( + header_type="bool or list of str", + header="Write out the column names. If a list of columns " + "is given, it is assumed to be aliases for the " + "column names", + col_space_type="int, list or dict of int", + col_space="The minimum width of each column. If a list of ints is given " + "every integers corresponds with one column. If a dict is given, the key " + "references the column, while the value defines the space to use.", + ) + @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) + def to_string( + self, + buf: FilePath | WriteBuffer[str] | None = None, + *, + columns: Axes | None = None, + col_space: int | list[int] | dict[Hashable, int] | None = None, + header: bool | SequenceNotStr[str] = True, + index: bool = True, + na_rep: str = "NaN", + formatters: fmt.FormattersType | None = None, + float_format: fmt.FloatFormatType | None = None, + sparsify: bool | None = None, + index_names: bool = True, + justify: str | None = None, + max_rows: int | None = None, + max_cols: int | None = None, + show_dimensions: bool = False, + decimal: str = ".", + line_width: int | None = None, + min_rows: int | None = None, + max_colwidth: int | None = None, + encoding: str | None = None, + ) -> str | None: + """ + Render a DataFrame to a console-friendly tabular output. + %(shared_params)s + line_width : int, optional + Width to wrap a line in characters. 
+ min_rows : int, optional + The number of rows to display in the console in a truncated repr + (when number of rows is above `max_rows`). + max_colwidth : int, optional + Max width to truncate each column in characters. By default, no limit. + encoding : str, default "utf-8" + Set character encoding. + %(returns)s + See Also + -------- + to_html : Convert DataFrame to HTML. + + Examples + -------- + >>> d = {"col1": [1, 2, 3], "col2": [4, 5, 6]} + >>> df = pd.DataFrame(d) + >>> print(df.to_string()) + col1 col2 + 0 1 4 + 1 2 5 + 2 3 6 + """ + from pandas import option_context + + with option_context("display.max_colwidth", max_colwidth): + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, + index=index, + min_rows=min_rows, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + ) + return fmt.DataFrameRenderer(formatter).to_string( + buf=buf, + encoding=encoding, + line_width=line_width, + ) + + def _get_values_for_csv( + self, + *, + float_format: FloatFormatType | None, + date_format: str | None, + decimal: str, + na_rep: str, + quoting, # int csv.QUOTE_FOO from stdlib + ) -> DataFrame: + # helper used by to_csv + mgr = self._mgr.get_values_for_csv( + float_format=float_format, + date_format=date_format, + decimal=decimal, + na_rep=na_rep, + quoting=quoting, + ) + return self._constructor_from_mgr(mgr, axes=mgr.axes) + + # ---------------------------------------------------------------------- + + @property + def style(self) -> Styler: + """ + Returns a Styler object. + + Contains methods for building a styled HTML representation of the DataFrame. + + See Also + -------- + io.formats.style.Styler : Helps style a DataFrame or Series according to the + data with HTML and CSS. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2, 3]}) + >>> df.style # doctest: +SKIP + + Please see + `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. + """ + from pandas.io.formats.style import Styler + + return Styler(self) + + _shared_docs["items"] = r""" + Iterate over (column name, Series) pairs. + + Iterates over the DataFrame columns, returning a tuple with + the column name and the content as a Series. + + Yields + ------ + label : object + The column names for the DataFrame being iterated over. + content : Series + The column entries belonging to each label, as a Series. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as + (index, Series) pairs. + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples + of the values. + + Examples + -------- + >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'], + ... 'population': [1864, 22000, 80000]}, + ... index=['panda', 'polar', 'koala']) + >>> df + species population + panda bear 1864 + polar bear 22000 + koala marsupial 80000 + >>> for label, content in df.items(): + ... print(f'label: {label}') + ... print(f'content: {content}', sep='\n') + ... 
+ label: species + content: + panda bear + polar bear + koala marsupial + Name: species, dtype: object + label: population + content: + panda 1864 + polar 22000 + koala 80000 + Name: population, dtype: int64 + """ + + @Appender(_shared_docs["items"]) + def items(self) -> Iterable[tuple[Hashable, Series]]: + for i, k in enumerate(self.columns): + yield k, self._ixs(i, axis=1) + + def iterrows(self) -> Iterable[tuple[Hashable, Series]]: + """ + Iterate over DataFrame rows as (index, Series) pairs. + + Yields + ------ + index : label or tuple of label + The index of the row. A tuple for a `MultiIndex`. + data : Series + The data of the row as a Series. + + See Also + -------- + DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + 1. Because ``iterrows`` returns a Series for each row, + it does **not** preserve dtypes across the rows (dtypes are + preserved across columns for DataFrames). + + To preserve dtypes while iterating over the rows, it is better + to use :meth:`itertuples` which returns namedtuples of the values + and which is generally faster than ``iterrows``. + + 2. You should **never modify** something you are iterating over. + This is not guaranteed to work in all cases. Depending on the + data types, the iterator returns a copy and not a view, and writing + to it will have no effect. + + Examples + -------- + + >>> df = pd.DataFrame([[1, 1.5]], columns=["int", "float"]) + >>> row = next(df.iterrows())[1] + >>> row + int 1.0 + float 1.5 + Name: 0, dtype: float64 + >>> print(row["int"].dtype) + float64 + >>> print(df["int"].dtype) + int64 + """ + columns = self.columns + klass = self._constructor_sliced + for k, v in zip(self.index, self.values): + s = klass(v, index=columns, name=k).__finalize__(self) + if self._mgr.is_single_block: + s._mgr.add_references(self._mgr) + yield k, s + + def itertuples( + self, index: bool = True, name: str | None = "Pandas" + ) -> Iterable[tuple[Any, ...]]: + """ + Iterate over DataFrame rows as namedtuples. + + Parameters + ---------- + index : bool, default True + If True, return the index as the first element of the tuple. + name : str or None, default "Pandas" + The name of the returned namedtuples or None to return regular + tuples. + + Returns + ------- + iterator + An object to iterate over namedtuples for each row in the + DataFrame with the first field possibly being the index and + following fields being the column values. + + See Also + -------- + DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) + pairs. + DataFrame.items : Iterate over (column name, Series) pairs. + + Notes + ----- + The column names will be renamed to positional names if they are + invalid Python identifiers, repeated, or start with an underscore. + + Examples + -------- + >>> df = pd.DataFrame( + ... {"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"] + ... ) + >>> df + num_legs num_wings + dog 4 0 + hawk 2 2 + >>> for row in df.itertuples(): + ... print(row) + Pandas(Index='dog', num_legs=4, num_wings=0) + Pandas(Index='hawk', num_legs=2, num_wings=2) + + By setting the `index` parameter to False we can remove the index + as the first element of the tuple: + + >>> for row in df.itertuples(index=False): + ... print(row) + Pandas(num_legs=4, num_wings=0) + Pandas(num_legs=2, num_wings=2) + + With the `name` parameter set we set a custom name for the yielded + namedtuples: + + >>> for row in df.itertuples(name="Animal"): + ... 
print(row)
+        Animal(Index='dog', num_legs=4, num_wings=0)
+        Animal(Index='hawk', num_legs=2, num_wings=2)
+        """
+        arrays = []
+        fields = list(self.columns)
+        if index:
+            arrays.append(self.index)
+            fields.insert(0, "Index")
+
+        # use integer indexing because of possible duplicate column names
+        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
+
+        if name is not None:
+            # https://github.com/python/mypy/issues/9046
+            # error: namedtuple() expects a string literal as the first argument
+            itertuple = collections.namedtuple(  # type: ignore[misc]
+                name, fields, rename=True
+            )
+            return map(itertuple._make, zip(*arrays))
+
+        # fallback to regular tuples
+        return zip(*arrays)
+
+    def __len__(self) -> int:
+        """
+        Returns length of info axis, but here we use the index.
+        """
+        return len(self.index)
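The dtype caveat called out in the ``iterrows`` notes above is easy to demonstrate. A small sketch (stock pandas, a mixed int/float frame; printed values are indicative):

    import pandas as pd

    df = pd.DataFrame({"i": [1, 2], "f": [1.5, 2.5]})

    # iterrows() boxes each row into a Series, so both values share one dtype:
    row = next(df.iterrows())[1]
    print(row["i"])        # 1.0 -- the int was upcast to float64

    # itertuples() keeps each column's own type and is generally faster:
    tup = next(df.itertuples(index=False))
    print(tup.i, tup.f)    # 1 1.5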
+    @overload
+    def dot(self, other: Series) -> Series: ...
+
+    @overload
+    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: ...
+
+    def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
+        """
+        Compute the matrix multiplication between the DataFrame and other.
+
+        This method computes the matrix product between the DataFrame and the
+        values of another Series, DataFrame or a numpy array.
+
+        It can also be called using ``self @ other``.
+
+        Parameters
+        ----------
+        other : Series, DataFrame or array-like
+            The other object to compute the matrix product with.
+
+        Returns
+        -------
+        Series or DataFrame
+            If other is a Series, return the matrix product between self and
+            other as a Series. If other is a DataFrame or a numpy.array, return
+            the matrix product of self and other in a DataFrame or a np.array.
+
+        See Also
+        --------
+        Series.dot: Similar method for Series.
+
+        Notes
+        -----
+        The dimensions of DataFrame and other must be compatible in order to
+        compute the matrix multiplication. In addition, the column names of
+        DataFrame and the index of other must contain the same values, as they
+        will be aligned prior to the multiplication.
+
+        The dot method for Series computes the inner product, instead of the
+        matrix product here.
+
+        Examples
+        --------
+        Here we multiply a DataFrame with a Series.
+
+        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
+        >>> s = pd.Series([1, 1, 2, 1])
+        >>> df.dot(s)
+        0    -4
+        1     5
+        dtype: int64
+
+        Here we multiply a DataFrame with another DataFrame.
+
+        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
+        >>> df.dot(other)
+           0  1
+        0  1  4
+        1  2  2
+
+        Note that the dot method gives the same result as @
+
+        >>> df @ other
+           0  1
+        0  1  4
+        1  2  2
+
+        The dot method also works if other is a np.array.
+
+        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
+        >>> df.dot(arr)
+           0  1
+        0  1  4
+        1  2  2
+
+        Note how shuffling of the objects does not change the result.
+
+        >>> s2 = s.reindex([1, 0, 2, 3])
+        >>> df.dot(s2)
+        0    -4
+        1     5
+        dtype: int64
+        """
+        if isinstance(other, (Series, DataFrame)):
+            common = self.columns.union(other.index)
+            if len(common) > len(self.columns) or len(common) > len(other.index):
+                raise ValueError("matrices are not aligned")
+
+            left = self.reindex(columns=common)
+            right = other.reindex(index=common)
+            lvals = left.values
+            rvals = right._values
+        else:
+            left = self
+            lvals = self.values
+            rvals = np.asarray(other)
+            if lvals.shape[1] != rvals.shape[0]:
+                raise ValueError(
+                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
+                )
+
+        if isinstance(other, DataFrame):
+            common_type = find_common_type(list(self.dtypes) + list(other.dtypes))
+            return self._constructor(
+                np.dot(lvals, rvals),
+                index=left.index,
+                columns=other.columns,
+                copy=False,
+                dtype=common_type,
+            )
+        elif isinstance(other, Series):
+            common_type = find_common_type(list(self.dtypes) + [other.dtypes])
+            return self._constructor_sliced(
+                np.dot(lvals, rvals), index=left.index, copy=False, dtype=common_type
+            )
+        elif isinstance(rvals, (np.ndarray, Index)):
+            result = np.dot(lvals, rvals)
+            if result.ndim == 2:
+                return self._constructor(result, index=left.index, copy=False)
+            else:
+                return self._constructor_sliced(result, index=left.index, copy=False)
+        else:  # pragma: no cover
+            raise TypeError(f"unsupported type: {type(other)}")
+
+    @overload
+    def __matmul__(self, other: Series) -> Series: ...
+
+    @overload
+    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: ...
+
+    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
+        """
+        Matrix multiplication using binary `@` operator.
+        """
+        return self.dot(other)
+
+    def __rmatmul__(self, other) -> DataFrame:
+        """
+        Matrix multiplication using binary `@` operator.
+        """
+        try:
+            return self.T.dot(np.transpose(other)).T
+        except ValueError as err:
+            if "shape mismatch" not in str(err):
+                raise
+            # GH#21581 give exception message for original shapes
+            msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
+            raise ValueError(msg) from err
+
+    # ----------------------------------------------------------------------
+    # IO methods (to / from other formats)
+
+    @classmethod
+    def from_dict(
+        cls,
+        data: dict,
+        orient: FromDictOrient = "columns",
+        dtype: Dtype | None = None,
+        columns: Axes | None = None,
+    ) -> DataFrame:
+        """
+        Construct DataFrame from dict of array-like or dicts.
+
+        Creates DataFrame object from dictionary by columns or by index
+        allowing dtype specification.
+
+        Parameters
+        ----------
+        data : dict
+            Of the form {field : array-like} or {field : dict}.
+        orient : {'columns', 'index', 'tight'}, default 'columns'
+            The "orientation" of the data. If the keys of the passed dict
+            should be the columns of the resulting DataFrame, pass 'columns'
+            (default). Otherwise if the keys should be rows, pass 'index'.
+            If 'tight', assume a dict with keys ['index', 'columns', 'data',
+            'index_names', 'column_names'].
+
+            .. versionadded:: 1.4.0
+               'tight' as an allowed value for the ``orient`` argument
+
+        dtype : dtype, default None
+            Data type to force after DataFrame construction, otherwise infer.
+        columns : list, default None
+            Column labels to use when ``orient='index'``. Raises a ValueError
+            if used with ``orient='columns'`` or ``orient='tight'``.
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        DataFrame.from_records : DataFrame from structured ndarray, sequence
+            of tuples or dicts, or DataFrame.
+        DataFrame : DataFrame object creation using constructor.
+        DataFrame.to_dict : Convert the DataFrame to a dictionary.
+
+        Examples
+        --------
+        By default the keys of the dict become the DataFrame columns:
+
+        >>> data = {"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}
+        >>> pd.DataFrame.from_dict(data)
+           col_1 col_2
+        0      3     a
+        1      2     b
+        2      1     c
+        3      0     d
+
+        Specify ``orient='index'`` to create the DataFrame using dictionary
+        keys as rows:
+
+        >>> data = {"row_1": [3, 2, 1, 0], "row_2": ["a", "b", "c", "d"]}
+        >>> pd.DataFrame.from_dict(data, orient="index")
+               0  1  2  3
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        When using the 'index' orientation, the column names can be
+        specified manually:
+
+        >>> pd.DataFrame.from_dict(data, orient="index", columns=["A", "B", "C", "D"])
+               A  B  C  D
+        row_1  3  2  1  0
+        row_2  a  b  c  d
+
+        Specify ``orient='tight'`` to create the DataFrame using a 'tight'
+        format:
+
+        >>> data = {
+        ...     "index": [("a", "b"), ("a", "c")],
+        ...     "columns": [("x", 1), ("y", 2)],
+        ...     "data": [[1, 3], [2, 4]],
+        ...     "index_names": ["n1", "n2"],
+        ...     "column_names": ["z1", "z2"],
+        ... }
+        >>> pd.DataFrame.from_dict(data, orient="tight")
+        z1     x  y
+        z2     1  2
+        n1 n2
+        a  b   1  3
+           c   2  4
+        """
+        index: list | Index | None = None
+        orient = orient.lower()  # type: ignore[assignment]
+        if orient == "index":
+            if len(data) > 0:
+                # TODO speed up Series case
+                if isinstance(next(iter(data.values())), (Series, dict)):
+                    data = _from_nested_dict(data)
+                else:
+                    index = list(data.keys())
+                    # error: Incompatible types in assignment (expression has type
+                    # "List[Any]", variable has type "Dict[Any, Any]")
+                    data = list(data.values())  # type: ignore[assignment]
+        elif orient in ("columns", "tight"):
+            if columns is not None:
+                raise ValueError(f"cannot use columns parameter with orient='{orient}'")
+        else:  # pragma: no cover
+            raise ValueError(
+                f"Expected 'index', 'columns' or 'tight' for orient parameter. "
+                f"Got '{orient}' instead"
+            )
+
+        if orient != "tight":
+            return cls(data, index=index, columns=columns, dtype=dtype)
+        else:
+            realdata = data["data"]
+
+            def create_index(indexlist, namelist) -> Index:
+                index: Index
+                if len(namelist) > 1:
+                    index = MultiIndex.from_tuples(indexlist, names=namelist)
+                else:
+                    index = Index(indexlist, name=namelist[0])
+                return index
+
+            index = create_index(data["index"], data["index_names"])
+            columns = create_index(data["columns"], data["column_names"])
+            return cls(realdata, index=index, columns=columns, dtype=dtype)
+
+    def to_numpy(
+        self,
+        dtype: npt.DTypeLike | None = None,
+        copy: bool = False,
+        na_value: object = lib.no_default,
+    ) -> np.ndarray:
+        """
+        Convert the DataFrame to a NumPy array.
+
+        By default, the dtype of the returned array will be the common NumPy
+        dtype of all types in the DataFrame. For example, if the dtypes are
+        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
+        This may require copying data and coercing values, which may be
+        expensive.
+
+        Parameters
+        ----------
+        dtype : str or numpy.dtype, optional
+            The dtype to pass to :meth:`numpy.asarray`.
+        copy : bool, default False
+            Whether to ensure that the returned value is not a view on
+            another array. Note that ``copy=False`` does not *ensure* that
+            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
+            a copy is made, even if not strictly necessary.
+        na_value : Any, optional
+            The value to use for missing values. The default value depends
+            on `dtype` and the dtypes of the DataFrame columns.
+ + Returns + ------- + numpy.ndarray + The NumPy array representing the values in the DataFrame. + + See Also + -------- + Series.to_numpy : Similar method for Series. + + Examples + -------- + >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + array([[1, 3], + [2, 4]]) + + With heterogeneous data, the lowest common type will have to + be used. + + >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) + >>> df.to_numpy() + array([[1. , 3. ], + [2. , 4.5]]) + + For a mix of numeric and non-numeric types, the output array will + have object dtype. + + >>> df["C"] = pd.date_range("2000", periods=2) + >>> df.to_numpy() + array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], + [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) + """ + if dtype is not None: + dtype = np.dtype(dtype) + result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) + if result.dtype is not dtype: + result = np.asarray(result, dtype=dtype) + + return result + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> MutableMappingT: ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[MutableMappingT] | MutableMappingT, + index: bool = ..., + ) -> list[MutableMappingT]: ... + + @overload + def to_dict( + self, + orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., + *, + into: type[dict] = ..., + index: bool = ..., + ) -> dict: ... + + @overload + def to_dict( + self, + orient: Literal["records"], + *, + into: type[dict] = ..., + index: bool = ..., + ) -> list[dict]: ... + + # error: Incompatible default for argument "into" (default has type "type + # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") + def to_dict( + self, + orient: Literal[ + "dict", "list", "series", "split", "tight", "records", "index" + ] = "dict", + *, + into: type[MutableMappingT] | MutableMappingT = dict, # type: ignore[assignment] + index: bool = True, + ) -> MutableMappingT | list[MutableMappingT]: + """ + Convert the DataFrame to a dictionary. + + The type of the key-value pairs can be customized with the parameters + (see below). + + Parameters + ---------- + orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} + Determines the type of the values of the dictionary. + + - 'dict' (default) : dict like {column -> {index -> value}} + - 'list' : dict like {column -> [values]} + - 'series' : dict like {column -> Series(values)} + - 'split' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} + - 'tight' : dict like + {'index' -> [index], 'columns' -> [columns], 'data' -> [values], + 'index_names' -> [index.names], 'column_names' -> [column.names]} + - 'records' : list like + [{column -> value}, ... , {column -> value}] + - 'index' : dict like {index -> {column -> value}} + + .. versionadded:: 1.4.0 + 'tight' as an allowed value for the ``orient`` argument + + into : class, default dict + The collections.abc.MutableMapping subclass used for all Mappings + in the return value. Can be the actual class or an empty + instance of the mapping type you want. If you want a + collections.defaultdict, you must pass it initialized. + + index : bool, default True + Whether to include the index item (and index_names item if `orient` + is 'tight') in the returned dictionary. Can only be ``False`` + when `orient` is 'split' or 'tight'. 
Note that when `orient` is
+            'records', this parameter does not take effect (index item always
+            not included).
+
+            .. versionadded:: 2.0.0
+
+        Returns
+        -------
+        dict, list or collections.abc.MutableMapping
+            Return a collections.abc.MutableMapping object representing the
+            DataFrame. The resulting transformation depends on the `orient`
+            parameter.
+
+        See Also
+        --------
+        DataFrame.from_dict: Create a DataFrame from a dictionary.
+        DataFrame.to_json: Convert a DataFrame to JSON format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"col1": [1, 2], "col2": [0.5, 0.75]}, index=["row1", "row2"]
+        ... )
+        >>> df
+              col1  col2
+        row1     1  0.50
+        row2     2  0.75
+        >>> df.to_dict()
+        {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
+
+        You can specify the return orientation.
+
+        >>> df.to_dict("series")
+        {'col1': row1    1
+                 row2    2
+        Name: col1, dtype: int64,
+        'col2': row1    0.50
+                row2    0.75
+        Name: col2, dtype: float64}
+
+        >>> df.to_dict("split")
+        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+         'data': [[1, 0.5], [2, 0.75]]}
+
+        >>> df.to_dict("records")
+        [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
+
+        >>> df.to_dict("index")
+        {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
+
+        >>> df.to_dict("tight")
+        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
+         'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
+
+        You can also specify the mapping type.
+
+        >>> from collections import OrderedDict, defaultdict
+        >>> df.to_dict(into=OrderedDict)
+        OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
+                     ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
+
+        If you want a `defaultdict`, you need to initialize it:
+
+        >>> dd = defaultdict(list)
+        >>> df.to_dict("records", into=dd)
+        [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
+         defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
+        """
+        from pandas.core.methods.to_dict import to_dict
+
+        return to_dict(self, orient, into=into, index=index)
+
+    @classmethod
+    def from_records(
+        cls,
+        data,
+        index=None,
+        exclude=None,
+        columns=None,
+        coerce_float: bool = False,
+        nrows: int | None = None,
+    ) -> DataFrame:
+        """
+        Convert structured or record ndarray to DataFrame.
+
+        Creates a DataFrame object from a structured ndarray, sequence of
+        tuples or dicts, or DataFrame.
+
+        Parameters
+        ----------
+        data : structured ndarray, sequence of tuples or dicts
+            Structured input data.
+        index : str, list of fields, array-like
+            Field of array to use as the index, alternately a specific set of
+            input labels to use.
+        exclude : sequence, default None
+            Columns or fields to exclude.
+        columns : sequence, default None
+            Column names to use. If the passed data do not have names
+            associated with them, this argument provides names for the
+            columns. Otherwise this argument indicates the order of the columns
+            in the result (any names not found in the data will become all-NA
+            columns).
+        coerce_float : bool, default False
+            Attempt to convert values of non-string, non-numeric objects (like
+            decimal.Decimal) to floating point, useful for SQL result sets.
+        nrows : int, default None
+            Number of rows to read if data is an iterator.
+
+        Returns
+        -------
+        DataFrame
+
+        See Also
+        --------
+        DataFrame.from_dict : DataFrame from dict of array-like or dicts.
+        DataFrame : DataFrame object creation using constructor.
+
+        Examples
+        --------
+        Data can be provided as a structured ndarray:
+
+        >>> data = np.array(
+        ...     [(3, "a"), (2, "b"), (1, "c"), (0, "d")],
+        ...
dtype=[("col_1", "i4"), ("col_2", "U1")], + ... ) + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of dicts: + + >>> data = [ + ... {"col_1": 3, "col_2": "a"}, + ... {"col_1": 2, "col_2": "b"}, + ... {"col_1": 1, "col_2": "c"}, + ... {"col_1": 0, "col_2": "d"}, + ... ] + >>> pd.DataFrame.from_records(data) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + + Data can be provided as a list of tuples with corresponding columns: + + >>> data = [(3, "a"), (2, "b"), (1, "c"), (0, "d")] + >>> pd.DataFrame.from_records(data, columns=["col_1", "col_2"]) + col_1 col_2 + 0 3 a + 1 2 b + 2 1 c + 3 0 d + """ + if isinstance(data, DataFrame): + raise TypeError( + "Passing a DataFrame to DataFrame.from_records is not supported. Use " + "set_index and/or drop to modify the DataFrame instead.", + ) + + result_index = None + + # Make a copy of the input columns so we can modify it + if columns is not None: + columns = ensure_index(columns) + + def maybe_reorder( + arrays: list[ArrayLike], arr_columns: Index, columns: Index, index + ) -> tuple[list[ArrayLike], Index, Index | None]: + """ + If our desired 'columns' do not match the data's pre-existing 'arr_columns', + we re-order our arrays. This is like a pre-emptive (cheap) reindex. + """ + if len(arrays): + length = len(arrays[0]) + else: + length = 0 + + result_index = None + if len(arrays) == 0 and index is None and length == 0: + result_index = default_index(0) + + arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) + return arrays, arr_columns, result_index + + if is_iterator(data): + if nrows == 0: + return cls() + + try: + first_row = next(data) + except StopIteration: + return cls(index=index, columns=columns) + + dtype = None + if hasattr(first_row, "dtype") and first_row.dtype.names: + dtype = first_row.dtype + + values = [first_row] + + if nrows is None: + values += data + else: + values.extend(itertools.islice(data, nrows - 1)) + + if dtype is not None: + data = np.array(values, dtype=dtype) + else: + data = values + + if isinstance(data, dict): + if columns is None: + columns = arr_columns = ensure_index(sorted(data)) + arrays = [data[k] for k in columns] + else: + arrays = [] + arr_columns_list = [] + for k, v in data.items(): + if k in columns: + arr_columns_list.append(k) + arrays.append(v) + + arr_columns = Index(arr_columns_list) + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) + + elif isinstance(data, np.ndarray): + arrays, columns = to_arrays(data, columns) + arr_columns = columns + else: + arrays, arr_columns = to_arrays(data, columns) + if coerce_float: + for i, arr in enumerate(arrays): + if arr.dtype == object: + # error: Argument 1 to "maybe_convert_objects" has + # incompatible type "Union[ExtensionArray, ndarray]"; + # expected "ndarray" + arrays[i] = lib.maybe_convert_objects( + arr, # type: ignore[arg-type] + try_float=True, + ) + + arr_columns = ensure_index(arr_columns) + if columns is None: + columns = arr_columns + else: + arrays, arr_columns, result_index = maybe_reorder( + arrays, arr_columns, columns, index + ) + + if exclude is None: + exclude = set() + else: + exclude = set(exclude) + + if index is not None: + if isinstance(index, str) or not hasattr(index, "__iter__"): + i = columns.get_loc(index) + exclude.add(index) + if len(arrays) > 0: + result_index = Index(arrays[i], name=index) + else: + result_index = Index([], name=index) + else: + try: + index_data = 
[arrays[arr_columns.get_loc(field)] for field in index]
+                except (KeyError, TypeError):
+                    # raised by get_loc, see GH#29258
+                    result_index = index
+                else:
+                    result_index = ensure_index_from_sequences(index_data, names=index)
+                    exclude.update(index)
+
+        if any(exclude):
+            arr_exclude = [x for x in exclude if x in arr_columns]
+            to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
+            arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
+
+            columns = columns.drop(exclude)
+
+        mgr = arrays_to_mgr(arrays, columns, result_index)
+        return cls._from_mgr(mgr, axes=mgr.axes)
+
+    def to_records(
+        self, index: bool = True, column_dtypes=None, index_dtypes=None
+    ) -> np.rec.recarray:
+        """
+        Convert DataFrame to a NumPy record array.
+
+        Index will be included as the first field of the record array if
+        requested.
+
+        Parameters
+        ----------
+        index : bool, default True
+            Include index in resulting record array, stored in 'index'
+            field or using the index label, if set.
+        column_dtypes : str, type, dict, default None
+            If a string or type, the data type to store all columns. If
+            a dictionary, a mapping of column names and indices (zero-indexed)
+            to specific data types.
+        index_dtypes : str, type, dict, default None
+            If a string or type, the data type to store all index levels. If
+            a dictionary, a mapping of index level names and indices
+            (zero-indexed) to specific data types.
+
+            This mapping is applied only if `index=True`.
+
+        Returns
+        -------
+        numpy.rec.recarray
+            NumPy ndarray with the DataFrame labels as fields and each row
+            of the DataFrame as entries.
+
+        See Also
+        --------
+        DataFrame.from_records: Convert structured or record ndarray
+            to DataFrame.
+        numpy.rec.recarray: An ndarray that allows field access using
+            attributes, analogous to typed columns in a
+            spreadsheet.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({"A": [1, 2], "B": [0.5, 0.75]}, index=["a", "b"])
         >>> df
-           A  B   C   D
-        0  0  1   2   3
-        1  4  5   6   7
-        2  8  9  10  11
+           A     B
+        a  1  0.50
+        b  2  0.75
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        If the DataFrame index has no label then the recarray field name
+        is set to 'index'. If the index has a label then this is used as the
+        field name:
+
+        >>> df.index = df.index.rename("I")
+        >>> df.to_records()
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
+
+        The index can be excluded from the record array:
+
+        >>> df.to_records(index=False)
+        rec.array([(1, 0.5 ), (2, 0.75)],
+                  dtype=[('A', '<i8'), ('B', '<f8')])
+
+        Data types can be specified for the columns:
+
+        >>> df.to_records(column_dtypes={"A": "int32"})
+        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
+                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
+
+        As well as for the index:
+
+        >>> df.to_records(index_dtypes="<S2")
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
+
+        >>> index_dtypes = f"<S{df.index.str.len().max()}"
+        >>> df.to_records(index_dtypes=index_dtypes)
+        rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
+                  dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
+        """
+        if index:
+            ix_vals = [
+                np.asarray(self.index.get_level_values(i))
+                for i in range(self.index.nlevels)
+            ]
+
+            arrays = ix_vals + [
+                np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
+            ]
+
+            index_names = list(self.index.names)
+
+            if isinstance(self.index, MultiIndex):
+                index_names = com.fill_missing_names(index_names)
+            elif index_names[0] is None:
+                index_names = ["index"]
+
+            names = [str(name) for name in itertools.chain(index_names, self.columns)]
+        else:
+            arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
+            names = [str(c) for c in self.columns]
+            index_names = []
+
+        index_len = len(index_names)
+        formats = []
+
+        for i, v in enumerate(arrays):
+            index_int = i
+
+            # When the names and arrays are collected, we first collect those
+            # in the DataFrame's index, followed by those in its columns. This
+            # check tells us whether we are handling an index or column entry.
+            if index_int < index_len:
+                dtype_mapping = index_dtypes
+                name = index_names[index_int]
+            else:
+                index_int -= index_len
+                dtype_mapping = column_dtypes
+                name = self.columns[index_int]
+
+            # Resolve a dict-like mapping by name first, then by position.
+            if is_dict_like(dtype_mapping):
+                if name in dtype_mapping:
+                    dtype_mapping = dtype_mapping[name]
+                elif index_int in dtype_mapping:
+                    dtype_mapping = dtype_mapping[index_int]
+                else:
+                    dtype_mapping = None
+
+            # If no mapping can be found, use the array's dtype attribute.
+            if dtype_mapping is None:
+                formats.append(v.dtype)
+            elif isinstance(dtype_mapping, (type, np.dtype, str)):
+                formats.append(dtype_mapping)
+            else:
+                element = "row" if i < index_len else "column"
+                msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
+                raise ValueError(msg)
+
+        return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
+
+    @classmethod
+    def _from_arrays(
+        cls,
+        arrays,
+        columns,
+        index,
+        dtype: Dtype | None = None,
+        verify_integrity: bool = True,
+    ) -> Self:
+        """
+        Create DataFrame from a list of arrays corresponding to the columns.
+
+        Parameters
+        ----------
+        arrays : list-like of arrays
+            Each array in the list corresponds to one column, in order.
+        columns : list-like, Index
+            The column names for the resulting DataFrame.
+        index : list-like, Index
+            The rows labels for the resulting DataFrame.
+        dtype : dtype, optional
+            Optional dtype to enforce for all arrays.
+        verify_integrity : bool, default True
+            Validate and homogenize all input. If set to False, it is assumed
+            that all elements of `arrays` are actual arrays as they will be
+            stored in a block (numpy ndarray or ExtensionArray), have the same
+            length as and are aligned with the index, and that `columns` and
+            `index` are ensured to be an Index object.
+ + Returns + ------- + DataFrame + """ + if dtype is not None: + dtype = pandas_dtype(dtype) + + columns = ensure_index(columns) + if len(columns) != len(arrays): + raise ValueError("len(columns) must match len(arrays)") + mgr = arrays_to_mgr( + arrays, + columns, + index, + dtype=dtype, + verify_integrity=verify_integrity, + ) + return cls._from_mgr(mgr, axes=mgr.axes) + + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path", + ) + def to_stata( + self, + path: FilePath | WriteBuffer[bytes], + *, + convert_dates: dict[Hashable, str] | None = None, + write_index: bool = True, + byteorder: ToStataByteorder | None = None, + time_stamp: datetime.datetime | None = None, + data_label: str | None = None, + variable_labels: dict[Hashable, str] | None = None, + version: int | None = 114, + convert_strl: Sequence[Hashable] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + value_labels: dict[Hashable, dict[float, str]] | None = None, + ) -> None: + """ + Export DataFrame object to Stata dta format. + + Writes the DataFrame to a Stata dataset file. + "dta" files contain a Stata dataset. + + Parameters + ---------- + path : str, path object, or buffer + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a binary ``write()`` function. + + convert_dates : dict + Dictionary mapping columns containing datetime types to stata + internal format to use when writing the dates. Options are 'tc', + 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer + or a name. Datetime columns that do not have a conversion type + specified will be converted to 'tc'. Raises NotImplementedError if + a datetime column has timezone information. + write_index : bool + Write the index to Stata dataset. + byteorder : str + Can be ">", "<", "little", or "big". default is `sys.byteorder`. + time_stamp : datetime + A datetime to use as file creation date. Default is the current + time. + data_label : str, optional + A label for the data set. Must be 80 characters or smaller. + variable_labels : dict + Dictionary containing columns as keys and variable labels as + values. Each label must be 80 characters or smaller. + version : {{114, 117, 118, 119, None}}, default 114 + Version to use in the output dta file. Set to None to let pandas + decide between 118 or 119 formats depending on the number of + columns in the frame. Version 114 can be read by Stata 10 and + later. Version 117 can be read by Stata 13 or later. Version 118 + is supported in Stata 14 and later. Version 119 is supported in + Stata 15 and later. Version 114 limits string variables to 244 + characters or fewer while versions 117 and later allow strings + with lengths up to 2,000,000 characters. Versions 118 and 119 + support Unicode characters, and version 119 supports more than + 32,767 variables. + + Version 119 should usually only be used when the number of + variables exceeds the capacity of dta format 118. Exporting + smaller datasets in format 119 may have unintended consequences, + and, as of November 2020, Stata SE cannot read version 119 files. + + convert_strl : list, optional + List of column names to convert to string columns to Stata StrL + format. Only available if version is 117. Storing strings in the + StrL format can produce smaller dta files if strings have more than + 8 characters and values are repeated. + {compression_options} + + .. 
versionchanged:: 1.4.0 Zstandard support.
+
+        {storage_options}
+
+        value_labels : dict of dicts
+            Dictionary containing columns as keys and dictionaries of column value
+            to labels as values. Labels for a single variable must be 32,000
+            characters or smaller.
+
+            .. versionadded:: 1.4.0
+
+        Raises
+        ------
+        NotImplementedError
+            * If datetimes contain timezone information
+            * Column dtype is not representable in Stata
+        ValueError
+            * Columns listed in convert_dates are neither datetime64[ns]
+              nor datetime.datetime
+            * Column listed in convert_dates is not in DataFrame
+            * Categorical label contains more than 32,000 characters
+
+        See Also
+        --------
+        read_stata : Import Stata data files.
+        io.stata.StataWriter : Low-level writer for Stata data files.
+        io.stata.StataWriter117 : Low-level writer for version 117 files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [["falcon", 350], ["parrot", 18]], columns=["animal", "parrot"]
+        ... )
+        >>> df.to_stata("animals.dta")  # doctest: +SKIP
+        """
+        if version not in (114, 117, 118, 119, None):
+            raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
+        if version == 114:
+            if convert_strl is not None:
+                raise ValueError("strl is not supported in format 114")
+            from pandas.io.stata import StataWriter as statawriter
+        elif version == 117:
+            # Incompatible import of "statawriter" (imported name has type
+            # "Type[StataWriter117]", local name has type "Type[StataWriter]")
+            from pandas.io.stata import (  # type: ignore[assignment]
+                StataWriter117 as statawriter,
+            )
+        else:  # versions 118 and 119
+            # Incompatible import of "statawriter" (imported name has type
+            # "Type[StataWriter117]", local name has type "Type[StataWriter]")
+            from pandas.io.stata import (  # type: ignore[assignment]
+                StataWriterUTF8 as statawriter,
+            )
+
+        kwargs: dict[str, Any] = {}
+        if version is None or version >= 117:
+            # strl conversion is only supported >= 117
+            kwargs["convert_strl"] = convert_strl
+        if version is None or version >= 118:
+            # Specifying the version is only supported for UTF8 (118 or 119)
+            kwargs["version"] = version
+
+        writer = statawriter(
+            path,
+            self,
+            convert_dates=convert_dates,
+            byteorder=byteorder,
+            time_stamp=time_stamp,
+            data_label=data_label,
+            write_index=write_index,
+            variable_labels=variable_labels,
+            compression=compression,
+            storage_options=storage_options,
+            value_labels=value_labels,
+            **kwargs,
+        )
+        writer.write_file()
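For a sense of how ``convert_dates`` is used in practice, here is a minimal round-trip sketch (file name and data are illustrative; assumes a pandas build with Stata support):

    import datetime
    import pandas as pd

    df = pd.DataFrame(
        {
            "animal": ["falcon", "parrot"],
            "speed": [350.0, 18.0],
            "seen": [datetime.datetime(2024, 1, 1), datetime.datetime(2024, 6, 1)],
        }
    )
    # 'tc' keeps the full timestamp; 'td' would keep only the date part.
    df.to_stata("animals.dta", convert_dates={"seen": "tc"}, write_index=False)
    back = pd.read_stata("animals.dta")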
+
+    def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
+        """
+        Write a DataFrame to the binary Feather format.
+
+        Parameters
+        ----------
+        path : str, path object, file-like object
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function. If a string or a path,
+            it will be used as Root Directory path when writing a partitioned dataset.
+        **kwargs :
+            Additional keywords passed to :func:`pyarrow.feather.write_feather`.
+            This includes the `compression`, `compression_level`, `chunksize`
+            and `version` keywords.
+
+        Notes
+        -----
+        This function writes the dataframe as a `feather file
+        <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
+        index. For saving the DataFrame with your custom index use a method that
+        supports custom indices e.g. `to_parquet`.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
+        >>> df.to_feather("file.feather")  # doctest: +SKIP
+        """
+        from pandas.io.feather_format import to_feather
+
+        to_feather(self, path, **kwargs)
+
+    @overload
+    def to_markdown(
+        self,
+        buf: None = ...,
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> str: ...
+
+    @overload
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> None: ...
+
+    @overload
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str] | None,
+        *,
+        mode: str = ...,
+        index: bool = ...,
+        storage_options: StorageOptions | None = ...,
+        **kwargs,
+    ) -> str | None: ...
+
+    @doc(
+        Series.to_markdown,
+        klass=_shared_doc_kwargs["klass"],
+        storage_options=_shared_docs["storage_options"],
+        examples="""Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
+        ... )
+        >>> print(df.to_markdown())
+        |    | animal_1   | animal_2   |
+        |---:|:-----------|:-----------|
+        |  0 | elk        | dog        |
+        |  1 | pig        | quetzal    |
+
+        Output markdown with a tabulate option.
+
+        >>> print(df.to_markdown(tablefmt="grid"))
+        +----+------------+------------+
+        |    | animal_1   | animal_2   |
+        +====+============+============+
+        |  0 | elk        | dog        |
+        +----+------------+------------+
+        |  1 | pig        | quetzal    |
+        +----+------------+------------+""",
+    )
+    def to_markdown(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        *,
+        mode: str = "wt",
+        index: bool = True,
+        storage_options: StorageOptions | None = None,
+        **kwargs,
+    ) -> str | None:
+        if "showindex" in kwargs:
+            raise ValueError("Pass 'index' instead of 'showindex'")
+
+        kwargs.setdefault("headers", "keys")
+        kwargs.setdefault("tablefmt", "pipe")
+        kwargs.setdefault("showindex", index)
+        tabulate = import_optional_dependency("tabulate")
+        result = tabulate.tabulate(self, **kwargs)
+        if buf is None:
+            return result
+
+        with get_handle(buf, mode, storage_options=storage_options) as handles:
+            handles.handle.write(result)
+        return None
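The str-versus-None return split in ``to_markdown`` mirrors the other writers above; a quick sketch (requires the optional ``tabulate`` dependency; the output path is illustrative):

    import pandas as pd

    df = pd.DataFrame({"x": [1, 2]})

    text = df.to_markdown()      # buf=None: the table comes back as a str
    df.to_markdown("table.md")   # path: written to disk, returns None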
+
+    @overload
+    def to_parquet(
+        self,
+        path: FilePath | WriteBuffer[bytes],
+        *,
+        engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
+        compression: str | None = ...,
+        index: bool | None = ...,
+        partition_cols: list[str] | None = ...,
+        storage_options: StorageOptions = ...,
+        **kwargs,
+    ) -> None: ...
+
+    @doc(storage_options=_shared_docs["storage_options"])
+    def to_parquet(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None = None,
+        *,
+        engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
+        compression: str | None = "snappy",
+        index: bool | None = None,
+        partition_cols: list[str] | None = None,
+        storage_options: StorageOptions | None = None,
+        **kwargs,
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the binary parquet format.
+
+        This function writes the dataframe as a `parquet file
+        <https://parquet.apache.org/>`_. You can choose different parquet
+        backends, and have the option of compression. See
+        :ref:`the user guide <io.parquet>` for more details.
+
+        Parameters
+        ----------
+        path : str, path object, file-like object, or None, default None
+            String, path object (implementing ``os.PathLike[str]``), or file-like
+            object implementing a binary ``write()`` function. If None, the result is
+            returned as bytes. If a string or path, it will be used as Root Directory
+            path when writing a partitioned dataset.
+        engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
+            Parquet library to use. If 'auto', then the option
+            ``io.parquet.engine`` is used. The default ``io.parquet.engine``
+            behavior is to try 'pyarrow', falling back to 'fastparquet' if
+            'pyarrow' is unavailable.
+        compression : str or None, default 'snappy'
+            Name of the compression to use. Use ``None`` for no compression.
+            Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'.
+        index : bool, default None
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file.
+            If ``None``, similar to ``True`` the dataframe's index(es)
+            will be saved. However, instead of being saved as values,
+            the RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        partition_cols : list, optional, default None
+            Column names by which to partition the dataset.
+            Columns are partitioned in the order they are given.
+            Must be None if path is not a string.
+        {storage_options}
+
+        **kwargs
+            Additional arguments passed to the parquet library. See
+            :ref:`pandas io <io.parquet>` for more details.
+
+        Returns
+        -------
+        bytes if no path argument is provided else None
+
+        See Also
+        --------
+        read_parquet : Read a parquet file.
+        DataFrame.to_orc : Write an orc file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        This function requires either the `fastparquet
+        <https://github.com/dask/fastparquet>`_ or `pyarrow
+        <https://arrow.apache.org/docs/python/>`_ library.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={{"col1": [1, 2], "col2": [3, 4]}})
+        >>> df.to_parquet("df.parquet.gzip", compression="gzip")  # doctest: +SKIP
+        >>> pd.read_parquet("df.parquet.gzip")  # doctest: +SKIP
+           col1  col2
+        0     1     3
+        1     2     4
+
+        If you want to get a buffer to the parquet content you can use a io.BytesIO
+        object, as long as you don't use partition_cols, which creates multiple files.
+
+        >>> import io
+        >>> f = io.BytesIO()
+        >>> df.to_parquet(f)
+        >>> f.seek(0)
+        0
+        >>> content = f.read()
+        """
+        from pandas.io.parquet import to_parquet
+
+        return to_parquet(
+            self,
+            path,
+            engine,
+            compression=compression,
+            index=index,
+            partition_cols=partition_cols,
+            storage_options=storage_options,
+            **kwargs,
+        )
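``partition_cols`` is the one option that changes the on-disk shape: instead of a single file, a directory tree keyed by the partition values is written. A hedged sketch (requires pyarrow or fastparquet; paths are illustrative):

    import pandas as pd

    df = pd.DataFrame({"year": [2023, 2023, 2024], "value": [1, 2, 3]})

    # Writes dataset/year=2023/... and dataset/year=2024/... sub-directories.
    df.to_parquet("dataset", partition_cols=["year"])

    # Reading the root directory reassembles the partitions.
    back = pd.read_parquet("dataset")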
+
+    @overload
+    def to_orc(
+        self,
+        path: None = ...,
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> bytes: ...
+
+    @overload
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes],
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> None: ...
+
+    @overload
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None,
+        *,
+        engine: Literal["pyarrow"] = ...,
+        index: bool | None = ...,
+        engine_kwargs: dict[str, Any] | None = ...,
+    ) -> bytes | None: ...
+
+    def to_orc(
+        self,
+        path: FilePath | WriteBuffer[bytes] | None = None,
+        *,
+        engine: Literal["pyarrow"] = "pyarrow",
+        index: bool | None = None,
+        engine_kwargs: dict[str, Any] | None = None,
+    ) -> bytes | None:
+        """
+        Write a DataFrame to the Optimized Row Columnar (ORC) format.
+
+        .. versionadded:: 1.5.0
+
+        Parameters
+        ----------
+        path : str, file-like object or None, default None
+            If a string, it will be used as Root Directory path
+            when writing a partitioned dataset. By file-like object,
+            we refer to objects with a write() method, such as a file handle
+            (e.g. via builtin open function). If path is None,
+            a bytes object is returned.
+        engine : {'pyarrow'}, default 'pyarrow'
+            ORC library to use.
+        index : bool, optional
+            If ``True``, include the dataframe's index(es) in the file output.
+            If ``False``, they will not be written to the file.
+            If ``None``, similar to ``infer`` the dataframe's index(es)
+            will be saved. However, instead of being saved as values,
+            the RangeIndex will be stored as a range in the metadata so it
+            doesn't require much space and is faster. Other indexes will
+            be included as columns in the file output.
+        engine_kwargs : dict[str, Any] or None, default None
+            Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
+
+        Returns
+        -------
+        bytes if no ``path`` argument is provided else None
+            Bytes object with DataFrame data if ``path`` is not specified else None.
+
+        Raises
+        ------
+        NotImplementedError
+            Dtype of one or more columns is category, unsigned integers, interval,
+            period or sparse.
+        ValueError
+            If ``engine`` is not 'pyarrow'.
+
+        See Also
+        --------
+        read_orc : Read a ORC file.
+        DataFrame.to_parquet : Write a parquet file.
+        DataFrame.to_csv : Write a csv file.
+        DataFrame.to_sql : Write to a sql table.
+        DataFrame.to_hdf : Write to hdf.
+
+        Notes
+        -----
+        * Find more information on ORC
+          `here <https://en.wikipedia.org/wiki/Apache_ORC>`__.
+        * Before using this function you should read the :ref:`user guide about
+          ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
+        * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
+          library.
+        * For supported dtypes please refer to `supported ORC features in Arrow
+          <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+        * Currently timezones in datetime columns are not preserved when a
+          dataframe is converted into ORC files.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> df.to_orc("df.orc")  # doctest: +SKIP
+        >>> pd.read_orc("df.orc")  # doctest: +SKIP
+           col1  col2
+        0     1     4
+        1     2     3
+
+        If you want to get a buffer to the orc content you can write it to io.BytesIO
+
+        >>> import io
+        >>> b = io.BytesIO(df.to_orc())  # doctest: +SKIP
+        >>> b.seek(0)  # doctest: +SKIP
+        0
+        >>> content = b.read()  # doctest: +SKIP
+        """
+        from pandas.io.orc import to_orc
+
+        return to_orc(
+            self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
+        )
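As with ``to_parquet``, passing ``path=None`` makes ``to_orc`` hand back the serialized bytes, which round-trip through any binary buffer. A minimal sketch (requires pyarrow):

    import io
    import pandas as pd

    df = pd.DataFrame({"col1": [1, 2], "col2": [4, 3]})

    payload = df.to_orc()                    # path=None: bytes come back
    back = pd.read_orc(io.BytesIO(payload))  # round-trip through a buffer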
+
+    @overload
+    def to_html(
+        self,
+        buf: FilePath | WriteBuffer[str],
+        *,
+        columns: Axes | None = ...,
+        col_space: ColspaceArgType | None = ...,
+        header: bool = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        justify: str | None = ...,
+        max_rows: int | None = ...,
+        max_cols: int | None = ...,
+        show_dimensions: bool | str = ...,
+        decimal: str = ...,
+        bold_rows: bool = ...,
+        classes: str | list | tuple | None = ...,
+        escape: bool = ...,
+        notebook: bool = ...,
+        border: int | bool | None = ...,
+        table_id: str | None = ...,
+        render_links: bool = ...,
+        encoding: str | None = ...,
+    ) -> None: ...
+
+    @overload
+    def to_html(
+        self,
+        buf: None = ...,
+        *,
+        columns: Axes | None = ...,
+        col_space: ColspaceArgType | None = ...,
+        header: bool = ...,
+        index: bool = ...,
+        na_rep: str = ...,
+        formatters: FormattersType | None = ...,
+        float_format: FloatFormatType | None = ...,
+        sparsify: bool | None = ...,
+        index_names: bool = ...,
+        justify: str | None = ...,
+        max_rows: int | None = ...,
+        max_cols: int | None = ...,
+        show_dimensions: bool | str = ...,
+        decimal: str = ...,
+        bold_rows: bool = ...,
+        classes: str | list | tuple | None = ...,
+        escape: bool = ...,
+        notebook: bool = ...,
+        border: int | bool | None = ...,
+        table_id: str | None = ...,
+        render_links: bool = ...,
+        encoding: str | None = ...,
+    ) -> str: ...
+
+    @Substitution(
+        header_type="bool",
+        header="Whether to print column labels, default True",
+        col_space_type="str or int, list or dict of int or str",
+        col_space="The minimum width of each column in CSS length "
+        "units. An int is assumed to be px units.",
+    )
+    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
+    def to_html(
+        self,
+        buf: FilePath | WriteBuffer[str] | None = None,
+        *,
+        columns: Axes | None = None,
+        col_space: ColspaceArgType | None = None,
+        header: bool = True,
+        index: bool = True,
+        na_rep: str = "NaN",
+        formatters: FormattersType | None = None,
+        float_format: FloatFormatType | None = None,
+        sparsify: bool | None = None,
+        index_names: bool = True,
+        justify: str | None = None,
+        max_rows: int | None = None,
+        max_cols: int | None = None,
+        show_dimensions: bool | str = False,
+        decimal: str = ".",
+        bold_rows: bool = True,
+        classes: str | list | tuple | None = None,
+        escape: bool = True,
+        notebook: bool = False,
+        border: int | bool | None = None,
+        table_id: str | None = None,
+        render_links: bool = False,
+        encoding: str | None = None,
+    ) -> str | None:
+        """
+        Render a DataFrame as an HTML table.
+        %(shared_params)s
+        bold_rows : bool, default True
+            Make the row labels bold in the output.
+        classes : str or list or tuple, default None
+            CSS class(es) to apply to the resulting html table.
+        escape : bool, default True
+            Convert the characters <, >, and & to HTML-safe sequences.
+        notebook : {True, False}, default False
+            Whether the generated HTML is for IPython Notebook.
+        border : int
+            A ``border=border`` attribute is included in the opening
+            ``<table>`` tag. Default ``pd.options.display.html.border``.
+        table_id : str, optional
+            A css id is included in the opening ``<table>`` tag if specified.
+        render_links : bool, default False
+            Convert URLs to HTML links.
+        encoding : str, default "utf-8"
+            Set character encoding.
+        %(returns)s
+        See Also
+        --------
+        to_string : Convert DataFrame to a string.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> html_string = '''<table border="1" class="dataframe">
+        ...   <thead>
+        ...     <tr style="text-align: right;">
+        ...       <th></th>
+        ...       <th>col1</th>
+        ...       <th>col2</th>
+        ...     </tr>
+        ...   </thead>
+        ...   <tbody>
+        ...     <tr>
+        ...       <th>0</th>
+        ...       <td>1</td>
+        ...       <td>4</td>
+        ...     </tr>
+        ...     <tr>
+        ...       <th>1</th>
+        ...       <td>2</td>
+        ...       <td>3</td>
+        ...     </tr>
+        ...   </tbody>
+        ... </table>
''' + >>> assert html_string == df.to_html() + """ + if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS: + raise ValueError("Invalid value for justify parameter") + + formatter = fmt.DataFrameFormatter( + self, + columns=columns, + col_space=col_space, + na_rep=na_rep, + header=header, + index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + justify=justify, + index_names=index_names, + escape=escape, + decimal=decimal, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + ) + # TODO: a generic formatter wld b in DataFrameFormatter + return fmt.DataFrameRenderer(formatter).to_html( + buf=buf, + classes=classes, + notebook=notebook, + border=border, + encoding=encoding, + table_id=table_id, + render_links=render_links, + ) + + @overload + def to_xml( + self, + path_or_buffer: None = ..., + *, + index: bool = ..., + root_name: str | None = ..., + row_name: str | None = ..., + na_rep: str | None = ..., + attr_cols: list[str] | None = ..., + elem_cols: list[str] | None = ..., + namespaces: dict[str | None, str] | None = ..., + prefix: str | None = ..., + encoding: str = ..., + xml_declaration: bool | None = ..., + pretty_print: bool | None = ..., + parser: XMLParsers | None = ..., + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., + compression: CompressionOptions = ..., + storage_options: StorageOptions | None = ..., + ) -> str: ... + + @overload + def to_xml( + self, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str], + *, + index: bool = ..., + root_name: str | None = ..., + row_name: str | None = ..., + na_rep: str | None = ..., + attr_cols: list[str] | None = ..., + elem_cols: list[str] | None = ..., + namespaces: dict[str | None, str] | None = ..., + prefix: str | None = ..., + encoding: str = ..., + xml_declaration: bool | None = ..., + pretty_print: bool | None = ..., + parser: XMLParsers | None = ..., + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., + compression: CompressionOptions = ..., + storage_options: StorageOptions | None = ..., + ) -> None: ... + + @doc( + storage_options=_shared_docs["storage_options"], + compression_options=_shared_docs["compression_options"] % "path_or_buffer", + ) + def to_xml( + self, + path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + *, + index: bool = True, + root_name: str | None = "data", + row_name: str | None = "row", + na_rep: str | None = None, + attr_cols: list[str] | None = None, + elem_cols: list[str] | None = None, + namespaces: dict[str | None, str] | None = None, + prefix: str | None = None, + encoding: str = "utf-8", + xml_declaration: bool | None = True, + pretty_print: bool | None = True, + parser: XMLParsers | None = "lxml", + stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, + compression: CompressionOptions = "infer", + storage_options: StorageOptions | None = None, + ) -> str | None: + """ + Render a DataFrame to an XML document. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + path_or_buffer : str, path object, file-like object, or None, default None + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a ``write()`` function. If None, the result is returned + as a string. + index : bool, default True + Whether to include index in XML document. + root_name : str, default 'data' + The name of root element in XML document. 
+        row_name : str, default 'row'
+            The name of row element in XML document.
+        na_rep : str, optional
+            Missing data representation.
+        attr_cols : list-like, optional
+            List of columns to write as attributes in row element.
+            Hierarchical columns will be flattened with underscore
+            delimiting the different levels.
+        elem_cols : list-like, optional
+            List of columns to write as children in row element. By default,
+            all columns output as children of row element. Hierarchical
+            columns will be flattened with underscore delimiting the
+            different levels.
+        namespaces : dict, optional
+            All namespaces to be defined in root element. Keys of dict
+            should be prefix names and values of dict corresponding URIs.
+            Default namespaces should be given empty string key. For
+            example, ::
+
+                namespaces = {{"": "https://example.com"}}
+
+        prefix : str, optional
+            Namespace prefix to be used for every element and/or attribute
+            in document. This should be one of the keys in ``namespaces``
+            dict.
+        encoding : str, default 'utf-8'
+            Encoding of the resulting document.
+        xml_declaration : bool, default True
+            Whether to include the XML declaration at start of document.
+        pretty_print : bool, default True
+            Whether output should be pretty printed with indentation and
+            line breaks.
+        parser : {{'lxml','etree'}}, default 'lxml'
+            Parser module to use for building of tree. Only 'lxml' and
+            'etree' are supported. With 'lxml', the ability to use XSLT
+            stylesheet is supported.
+        stylesheet : str, path object or file-like object, optional
+            A URL, file-like object, or a raw string containing an XSLT
+            script used to transform the raw XML output. Script should use
+            layout of elements and attributes from original output. This
+            argument requires ``lxml`` to be installed. Only XSLT 1.0
+            scripts, and not later versions, are currently supported.
+        {compression_options}
+
+            .. versionchanged:: 1.4.0 Zstandard support.
+
+        {storage_options}
+
+        Returns
+        -------
+        None or str
+            If ``path_or_buffer`` is None, returns the resulting XML format as a
+            string. Otherwise returns None.
+
+        See Also
+        --------
+        to_json : Convert the pandas object to a JSON string.
+        to_html : Convert DataFrame to HTML.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [["square", 360, 4], ["circle", 360, np.nan], ["triangle", 180, 3]],
+        ...     columns=["shape", "degrees", "sides"],
+        ... )
+
+        >>> df.to_xml()  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <data>
+          <row>
+            <index>0</index>
+            <shape>square</shape>
+            <degrees>360</degrees>
+            <sides>4.0</sides>
+          </row>
+          <row>
+            <index>1</index>
+            <shape>circle</shape>
+            <degrees>360</degrees>
+            <sides/>
+          </row>
+          <row>
+            <index>2</index>
+            <shape>triangle</shape>
+            <degrees>180</degrees>
+            <sides>3.0</sides>
+          </row>
+        </data>
+
+        >>> df.to_xml(
+        ...     attr_cols=["index", "shape", "degrees", "sides"]
+        ... )  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <data>
+          <row index="0" shape="square" degrees="360" sides="4.0"/>
+          <row index="1" shape="circle" degrees="360"/>
+          <row index="2" shape="triangle" degrees="180" sides="3.0"/>
+        </data>
+
+        >>> df.to_xml(
+        ...     namespaces={{"doc": "https://example.com"}}, prefix="doc"
+        ...
)  # doctest: +SKIP
+        <?xml version='1.0' encoding='utf-8'?>
+        <doc:data xmlns:doc="https://example.com">
+          <doc:row>
+            <doc:index>0</doc:index>
+            <doc:shape>square</doc:shape>
+            <doc:degrees>360</doc:degrees>
+            <doc:sides>4.0</doc:sides>
+          </doc:row>
+          <doc:row>
+            <doc:index>1</doc:index>
+            <doc:shape>circle</doc:shape>
+            <doc:degrees>360</doc:degrees>
+            <doc:sides/>
+          </doc:row>
+          <doc:row>
+            <doc:index>2</doc:index>
+            <doc:shape>triangle</doc:shape>
+            <doc:degrees>180</doc:degrees>
+            <doc:sides>3.0</doc:sides>
+          </doc:row>
+        </doc:data>
+        """
+        from pandas.io.formats.xml import (
+            EtreeXMLFormatter,
+            LxmlXMLFormatter,
+        )
+
+        lxml = import_optional_dependency("lxml.etree", errors="ignore")
+
+        TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter]
+
+        if parser == "lxml":
+            if lxml is not None:
+                TreeBuilder = LxmlXMLFormatter
+            else:
+                raise ImportError(
+                    "lxml not found, please install or use the etree parser."
+                )
+
+        elif parser == "etree":
+            TreeBuilder = EtreeXMLFormatter
+
+        else:
+            raise ValueError("Values for parser can only be lxml or etree.")
+
+        xml_formatter = TreeBuilder(
+            self,
+            path_or_buffer=path_or_buffer,
+            index=index,
+            root_name=root_name,
+            row_name=row_name,
+            na_rep=na_rep,
+            attr_cols=attr_cols,
+            elem_cols=elem_cols,
+            namespaces=namespaces,
+            prefix=prefix,
+            encoding=encoding,
+            xml_declaration=xml_declaration,
+            pretty_print=pretty_print,
+            stylesheet=stylesheet,
+            compression=compression,
+            storage_options=storage_options,
+        )
+
+        return xml_formatter.write_output()
+
+    # ----------------------------------------------------------------------
+    @doc(INFO_DOCSTRING, **frame_sub_kwargs)
+    def info(
+        self,
+        verbose: bool | None = None,
+        buf: WriteBuffer[str] | None = None,
+        max_cols: int | None = None,
+        memory_usage: bool | str | None = None,
+        show_counts: bool | None = None,
+    ) -> None:
+        info = DataFrameInfo(
+            data=self,
+            memory_usage=memory_usage,
+        )
+        info.render(
+            buf=buf,
+            max_cols=max_cols,
+            verbose=verbose,
+            show_counts=show_counts,
+        )
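Because ``info`` writes to ``buf`` rather than returning a string, capturing the summary programmatically goes through a StringIO, just as the notebook repr path earlier in this diff does. A minimal sketch:

    import io
    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, None], "b": ["x", "y", "z"]})

    buf = io.StringIO()
    df.info(buf=buf, memory_usage="deep")   # render into the buffer, not stdout
    summary = buf.getvalue()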
+
+ # ----------------------------------------------------------------------
+ @doc(INFO_DOCSTRING, **frame_sub_kwargs)
+ def info(
+ self,
+ verbose: bool | None = None,
+ buf: WriteBuffer[str] | None = None,
+ max_cols: int | None = None,
+ memory_usage: bool | str | None = None,
+ show_counts: bool | None = None,
+ ) -> None:
+ info = DataFrameInfo(
+ data=self,
+ memory_usage=memory_usage,
+ )
+ info.render(
+ buf=buf,
+ max_cols=max_cols,
+ verbose=verbose,
+ show_counts=show_counts,
+ )

- Drop columns

+ def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
+ """
+ Return the memory usage of each column in bytes.

- >>> df.drop(["B", "C"], axis=1)
- A D
- 0 0 3
- 1 4 7
- 2 8 11

+ The memory usage can optionally include the contribution of
+ the index and elements of `object` dtype.

- >>> df.drop(columns=["B", "C"])
- A D
- 0 0 3
- 1 4 7
- 2 8 11

+ This value is displayed in `DataFrame.info` by default. This can be
+ suppressed by setting ``pandas.options.display.memory_usage`` to False.

- Drop a row by index

+ Parameters
+ ----------
+ index : bool, default True
+ Specifies whether to include the memory usage of the DataFrame's
+ index in returned Series. If ``index=True``, the memory usage of
+ the index is the first item in the output.
+ deep : bool, default False
+ If True, introspect the data deeply by interrogating
+ ``object`` dtypes for system-level memory consumption, and include
+ it in the returned values.

- >>> df.drop([0, 1])
- A B C D
- 2 8 9 10 11

- Drop columns and/or rows of MultiIndex DataFrame

- >>> midx = pd.MultiIndex(
- ... levels=[["llama", "cow", "falcon"], ["speed", "weight", "length"]],
- ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
- ... )
- >>> df = pd.DataFrame(
- ... index=midx,
- ... columns=["big", "small"],
- ... data=[
- ... [45, 30],
- ... [200, 100],
- ... [1.5, 1],
- ... [30, 20],
- ... [250, 150],
- ... [1.5, 0.8],
- ... [320, 250],
- ... [1, 0.8],
- ... [0.3, 0.2],
- ... ],
- ... )
- >>> df
- big small
- llama speed 45.0 30.0
- weight 200.0 100.0
- length 1.5 1.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- length 1.5 0.8
- falcon speed 320.0 250.0
- weight 1.0 0.8
- length 0.3 0.2

- Drop a specific index combination from the MultiIndex
- DataFrame, i.e., drop the combination ``'falcon'`` and
- ``'weight'``, which deletes only the corresponding row

- >>> df.drop(index=("falcon", "weight"))
- big small
- llama speed 45.0 30.0
- weight 200.0 100.0
- length 1.5 1.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- length 1.5 0.8
- falcon speed 320.0 250.0
- length 0.3 0.2

+ Returns
+ -------
+ Series
+ A Series whose index is the original column names and whose values
+ are the memory usage of each column in bytes.

- >>> df.drop(index="cow", columns="small")
- big
- llama speed 45.0
- weight 200.0
- length 1.5
- falcon speed 320.0
- weight 1.0
- length 0.3

+ See Also
+ --------
+ numpy.ndarray.nbytes : Total bytes consumed by the elements of an
+ ndarray.
+ Series.memory_usage : Bytes consumed by a Series.
+ Categorical : Memory-efficient array for string values with
+ many repeated values.
+ DataFrame.info : Concise summary of a DataFrame.

- >>> df.drop(index="length", level=1)
- big small
- llama speed 45.0 30.0
- weight 200.0 100.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- falcon speed 320.0 250.0
- weight 1.0 0.8
- """
- return super().drop(
- labels=labels,
- axis=axis,
- index=index,
- columns=columns,
- level=level,
- inplace=inplace,
- errors=errors,
- )

+ Notes
+ -----
+ See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
+ details.

- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | lib.NoDefault = lib.no_default,
- inplace: Literal[True],
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> None: ...

+ Examples
+ --------
+ >>> dtypes = ["int64", "float64", "complex128", "object", "bool"]
+ >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) for t in dtypes])
+ >>> df = pd.DataFrame(data)
+ >>> df.head()
+ int64 float64 complex128 object bool
+ 0 1 1.0 1.0+0.0j 1 True
+ 1 1 1.0 1.0+0.0j 1 True
+ 2 1 1.0 1.0+0.0j 1 True
+ 3 1 1.0 1.0+0.0j 1 True
+ 4 1 1.0 1.0+0.0j 1 True

- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | lib.NoDefault = lib.no_default,
- inplace: Literal[False] = ...,
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame: ...
+ >>> df.memory_usage()
+ Index 128
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 40000
+ bool 5000
+ dtype: int64

- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | lib.NoDefault = lib.no_default,
- inplace: bool = ...,
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame | None: ...

+ >>> df.memory_usage(index=False)
+ int64 40000
+ float64 40000
+ complex128 80000
+ object 40000
+ bool 5000
+ dtype: int64

- def rename(
- self,
- mapper: Renamer | None = None,
- *,
- index: Renamer | None = None,
- columns: Renamer | None = None,
- axis: Axis | None = None,
- copy: bool | lib.NoDefault = lib.no_default,
- inplace: bool = False,
- level: Level | None = None,
- errors: IgnoreRaise = "ignore",
- ) -> DataFrame | None:
 """
- Rename columns or index labels.
+ result = self._constructor_sliced(
+ [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
+ index=self.columns,
+ dtype=np.intp,
+ )
+ if index:
+ index_memory_usage = self._constructor_sliced(
+ self.index.memory_usage(deep=deep), index=["Index"]
+ )
+ result = index_memory_usage._append(result)
+ return result
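# A quick illustration of the ``deep`` flag handled above (hypothetical
# throwaway frame): the shallow report counts only pointer-sized slots for
# object columns, while ``deep=True`` interrogates each Python object.
import pandas as pd

df = pd.DataFrame({"s": ["a" * 100] * 1_000})
shallow = df.memory_usage()
deep = df.memory_usage(deep=True)
assert deep["s"] > shallow["s"]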
- Function / dict values must be unique (1-to-1). Labels not contained in
- a dict / Series will be left as-is. Extra labels listed don't throw an
- error.
+ def transpose(self, *args, copy: bool = False) -> DataFrame:
+ """
+ Transpose index and columns.

- See the :ref:`user guide <basics.rename>` for more.

+ Reflect the DataFrame over its main diagonal by writing rows as columns
+ and vice-versa. The property :attr:`.T` is an accessor to the method
+ :meth:`transpose`.

 Parameters
 ----------
- mapper : dict-like or function
- Dict-like or function transformations to apply to
- that axis' values. Use either ``mapper`` and ``axis`` to
- specify the axis to target with ``mapper``, or ``index`` and
- ``columns``.
- index : dict-like or function
- Alternative to specifying axis (``mapper, axis=0``
- is equivalent to ``index=mapper``).
- columns : dict-like or function
- Alternative to specifying axis (``mapper, axis=1``
- is equivalent to ``columns=mapper``).
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Axis to target with ``mapper``. Can be either the axis name
- ('index', 'columns') or number (0, 1). The default is 'index'.
- copy : bool, default False
- Also copy underlying data.
+ *args : tuple, optional
+ Accepted for compatibility with NumPy.
+ copy : bool, default False
+ Whether to copy the data after transposing, even for DataFrames
+ with a single dtype.
+
+ Note that a copy is always required for mixed dtype DataFrames,
+ or for DataFrames with any extension types.

 .. note:: The `copy` keyword will change behavior in pandas 3.0.
@@ -5613,5988 +13980,5329 @@ def rename(
 You can already get the future behavior and improvements through
 enabling copy on write ``pd.options.mode.copy_on_write = True``

- .. deprecated:: 3.0.0
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- If True then value of copy is ignored.
- level : int or level name, default None
- In case of a MultiIndex, only rename labels in the specified
- level.
- errors : {'ignore', 'raise'}, default 'ignore'
- If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
- or `columns` contains labels that are not present in the Index
- being transformed.
- If 'ignore', existing keys will be renamed and extra keys will be
- ignored.

 Returns
 -------
- DataFrame or None
- DataFrame with the renamed axis labels or None if ``inplace=True``.
-
- Raises
- ------
- KeyError
- If any of the labels is not found in the selected axis and
- "errors='raise'".
+ DataFrame
+ The transposed DataFrame.

 See Also
 --------
- DataFrame.rename_axis : Set the name of the axis.
+ numpy.transpose : Permute the dimensions of a given array.
+
+ Notes
+ -----
+ Transposing a DataFrame with mixed dtypes will result in a homogeneous
+ DataFrame with the `object` dtype. In such a case, a copy of the data
+ is always made.

 Examples
 --------
- ``DataFrame.rename`` supports two calling conventions

- * ``(index=index_mapper, columns=columns_mapper, ...)``
- * ``(mapper, axis={'index', 'columns'}, ...)``
+ **Square DataFrame with homogeneous dtype**

+ >>> d1 = {"col1": [1, 2], "col2": [3, 4]}
+ >>> df1 = pd.DataFrame(data=d1)
+ >>> df1
+ col1 col2
+ 0 1 3
+ 1 2 4

- We *highly* recommend using keyword arguments to clarify your
- intent.
+ >>> df1_transposed = df1.T  # or df1.transpose()
+ >>> df1_transposed
+ 0 1
+ col1 1 2
+ col2 3 4

- Rename columns using a mapping:

+ When the dtype is homogeneous in the original DataFrame, we get a
+ transposed DataFrame with the same dtype:

- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
- >>> df.rename(columns={"A": "a", "B": "c"})
- a c
- 0 1 4
- 1 2 5
- 2 3 6

+ >>> df1.dtypes
+ col1 int64
+ col2 int64
+ dtype: object
+ >>> df1_transposed.dtypes
+ 0 int64
+ 1 int64
+ dtype: object

- Rename index using a mapping:

+ **Non-square DataFrame with mixed dtypes**

- >>> df.rename(index={0: "x", 1: "y", 2: "z"})
- A B
- x 1 4
- y 2 5
- z 3 6

+ >>> d2 = {
+ ... "name": ["Alice", "Bob"],
+ ... "score": [9.5, 8],
+ ... "employed": [False, True],
+ ... "kids": [0, 0],
+ ... }
+ >>> df2 = pd.DataFrame(data=d2)
+ >>> df2
+ name score employed kids
+ 0 Alice 9.5 False 0
+ 1 Bob 8.0 True 0

- Cast index labels to a different type:

+ >>> df2_transposed = df2.T  # or df2.transpose()
+ >>> df2_transposed
+ 0 1
+ name Alice Bob
+ score 9.5 8.0
+ employed False True
+ kids 0 0

- >>> df.index
- RangeIndex(start=0, stop=3, step=1)
- >>> df.rename(index=str).index
- Index(['0', '1', '2'], dtype='object')

+ When the DataFrame has mixed dtypes, we get a transposed DataFrame with
+ the `object` dtype:

- >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
- Traceback (most recent call last):
- KeyError: ['C'] not found in axis

+ >>> df2.dtypes
+ name object
+ score float64
+ employed bool
+ kids int64
+ dtype: object
+ >>> df2_transposed.dtypes
+ 0 object
+ 1 object
+ dtype: object
+ """
+ nv.validate_transpose(args, {})
+ # construct the args

- Using axis-style parameters:

+ dtypes = list(self.dtypes)

- >>> df.rename(str.lower, axis="columns")
- a b
- 0 1 4
- 1 2 5
- 2 3 6

+ if self._can_fast_transpose:
+ # Note: tests pass without this, but this improves perf quite a bit.
+ new_vals = self._values.T

- >>> df.rename({1: 2, 2: 4}, axis="index")
- A B
- 0 1 4
- 2 2 5
- 4 3 6
- """
- self._check_copy_deprecation(copy)
- return super()._rename(
- mapper=mapper,
- index=index,
- columns=columns,
- axis=axis,
- inplace=inplace,
- level=level,
- errors=errors,
- )

- def pop(self, item: Hashable) -> Series:
- """
- Return item and drop it from DataFrame. Raise KeyError if not found.

- Parameters
- ----------
- item : label
- Label of column to be popped.

+ return result.__finalize__(self, method="transpose")
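# A small check of the dtype behaviour described in the Notes above
# (illustrative frames, not from this patch): homogeneous frames keep
# their dtype across ``T``, mixed frames are upcast to ``object``.
import pandas as pd

homog = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
mixed = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
assert (homog.T.dtypes == "int64").all()
assert (mixed.T.dtypes == "object").all()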
("falcon", "bird", 389.0), - ... ("parrot", "bird", 24.0), - ... ("lion", "mammal", 80.5), - ... ("monkey", "mammal", np.nan), - ... ], - ... columns=("name", "class", "max_speed"), - ... ) - >>> df - name class max_speed - 0 falcon bird 389.0 - 1 parrot bird 24.0 - 2 lion mammal 80.5 - 3 monkey mammal NaN + DataFrame.transpose : Transpose index and columns. - >>> df.pop("class") - 0 bird - 1 bird - 2 mammal - 3 mammal - Name: class, dtype: object + See Also + -------- + DataFrame.drop: Drop specified labels from rows or columns. + DataFrame.drop_duplicates: Return DataFrame with duplicate rows removed. + Examples + -------- + >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) >>> df - name max_speed - 0 falcon 389.0 - 1 parrot 24.0 - 2 lion 80.5 - 3 monkey NaN - """ - return super().pop(item=item) + col1 col2 + 0 1 3 + 1 2 4 - @overload - def _replace_columnwise( - self, mapping: dict[Hashable, tuple[Any, Any]], inplace: Literal[True], regex - ) -> None: ... + >>> df.T + 0 1 + col1 1 2 + col2 3 4 + """ + return self.transpose() - @overload - def _replace_columnwise( - self, mapping: dict[Hashable, tuple[Any, Any]], inplace: Literal[False], regex - ) -> Self: ... + # ---------------------------------------------------------------------- + # Indexing Methods - def _replace_columnwise( - self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex - ) -> Self | None: + def _ixs(self, i: int, axis: AxisInt = 0) -> Series: """ - Dispatch to Series.replace column-wise. - Parameters ---------- - mapping : dict - of the form {col: (target, value)} - inplace : bool - regex : bool or same types as `to_replace` in DataFrame.replace + i : int + axis : int Returns ------- - DataFrame or None + Series """ - # Operate column-wise - res = self if inplace else self.copy(deep=False) - ax = self.columns - - for i, ax_value in enumerate(ax): - if ax_value in mapping: - ser = self.iloc[:, i] + # irow + if axis == 0: + new_mgr = self._mgr.fast_xs(i) - target, value = mapping[ax_value] - newobj = ser.replace(target, value, regex=regex) + result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes) + result._name = self.index[i] + return result.__finalize__(self) - res._iset_item(i, newobj, inplace=inplace) + # icol + else: + col_mgr = self._mgr.iget(i) + return self._box_col_values(col_mgr, i) - if inplace: - return None - return res.__finalize__(self) + def _get_column_array(self, i: int) -> ArrayLike: + """ + Get the values of the i'th column (ndarray or ExtensionArray, as stored + in the Block) - @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) - def shift( - self, - periods: int | Sequence[int] = 1, - freq: Frequency | None = None, - axis: Axis = 0, - fill_value: Hashable = lib.no_default, - suffix: str | None = None, - ) -> DataFrame: - if freq is not None and fill_value is not lib.no_default: - # GH#53832 - raise ValueError( - "Passing a 'freq' together with a 'fill_value' is not allowed." - ) + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). + """ + return self._mgr.iget_values(i) - if self.empty and freq is None: - return self.copy() + def _iter_column_arrays(self) -> Iterator[ArrayLike]: + """ + Iterate over the arrays of all columns in order. + This returns the values as stored in the Block (ndarray or ExtensionArray). 
+ """
+ for i in range(len(self.columns)):
+ yield self._get_column_array(i)

- axis = self._get_axis_number(axis)

- if is_list_like(periods):
- periods = cast(Sequence, periods)
- if axis == 1:
- raise ValueError(
- "If `periods` contains multiple shifts, `axis` cannot be 1."
- )
- if len(periods) == 0:
- raise ValueError("If `periods` is an iterable, it cannot be empty.")
- from pandas.core.reshape.concat import concat

- shifted_dataframes = []
- for period in periods:
- if not is_integer(period):
- raise TypeError(
- f"Periods must be integer, but {period} is {type(period)}."
- )
- period = cast(int, period)
- shifted_dataframes.append(
- super()
- .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)
- .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")
- )
- return concat(shifted_dataframes, axis=1)
- elif suffix:
- raise ValueError("Cannot specify `suffix` if `periods` is an int.")
- periods = cast(int, periods)

- ncols = len(self.columns)
- if axis == 1 and periods != 0 and ncols > 0 and freq is None:
- if fill_value is lib.no_default:
- # We will infer fill_value to match the closest column

- # Use a column that we know is valid for our column's dtype GH#38434
- label = self.columns[0]

- if periods > 0:
- result = self.iloc[:, :-periods]
- for col in range(min(ncols, abs(periods))):
- # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
- # Define filler inside loop so we get a copy
- filler = self.iloc[:, 0].shift(len(self))
- result.insert(0, label, filler, allow_duplicates=True)
- else:
- result = self.iloc[:, -periods:]
- for col in range(min(ncols, abs(periods))):
- # Define filler inside loop so we get a copy
- filler = self.iloc[:, -1].shift(len(self))
- result.insert(
- len(result.columns), label, filler, allow_duplicates=True
- )

+ def __getitem__(self, key):
+ check_dict_or_set_indexers(key)
+ key = lib.item_from_zerodim(key)
+ key = com.apply_if_callable(key, self)

- result.columns = self.columns.copy()
- return result
- elif len(self._mgr.blocks) > 1 or (
- # If we only have one block and we know that we can't
- # keep the same dtype (i.e. the _can_hold_element check)
- # then we can go through the reindex_indexer path
- # (and avoid casting logic in the Block method).
- not can_hold_element(self._mgr.blocks[0].values, fill_value)

+ if is_hashable(key) and not is_iterator(key):
+ # is_iterator to exclude generator e.g. test_getitem_listlike
+ # shortcut if the key is in columns
+ is_mi = isinstance(self.columns, MultiIndex)
+ # GH#45316 Return view if key is not duplicated
+ # Only use drop_duplicates with duplicates for performance
+ if not is_mi and (
+ self.columns.is_unique
+ and key in self.columns
+ or key in self.columns.drop_duplicates(keep=False)
+ ):
- # GH#35488 we need to watch out for multi-block cases
- # We only get here with fill_value not-lib.no_default
- nper = abs(periods)
- nper = min(nper, ncols)
- if periods > 0:
- indexer = np.array(
- [-1] * nper + list(range(ncols - periods)), dtype=np.intp
- )
- else:
- indexer = np.array(
- list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
- )
- mgr = self._mgr.reindex_indexer(
- self.columns,
- indexer,
- axis=0,
- fill_value=fill_value,
- allow_dups=True,
- )
- res_df = self._constructor_from_mgr(mgr, axes=mgr.axes)
- return res_df.__finalize__(self, method="shift")
- else:
- return self.T.shift(periods=periods, fill_value=fill_value).T
-
- return super().shift(
- periods=periods, freq=freq, axis=axis, fill_value=fill_value
- )
+ return self._get_item(key)

- @overload
- def set_index(
- self,
- keys,
- *,
- drop: bool = ...,
- append: bool = ...,
- inplace: Literal[False] = ...,
- verify_integrity: bool = ...,
- ) -> DataFrame: ...
+ elif is_mi and self.columns.is_unique and key in self.columns:
+ return self._getitem_multilevel(key)

- @overload
- def set_index(
- self,
- keys,
- *,
- drop: bool = ...,
- append: bool = ...,
- inplace: Literal[True],
- verify_integrity: bool = ...,
- ) -> None: ...

- def set_index(
- self,
- keys,
- *,
- drop: bool = True,
- append: bool = False,
- inplace: bool = False,
- verify_integrity: bool = False,
- ) -> DataFrame | None:
- """
- Set the DataFrame index using existing columns.

- Set the DataFrame index (row labels) using one or more existing
- columns or arrays (of the correct length). The index can replace the
- existing index or expand on it.

- Parameters
- ----------
- keys : label or array-like or list of labels/arrays
- This parameter can be either a single column key, a single array of
- the same length as the calling DataFrame, or a list containing an
- arbitrary combination of column keys and arrays. Here, "array"
- encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
- instances of :class:`~collections.abc.Iterator`.
- drop : bool, default True
- Delete columns to be used as the new index.
- append : bool, default False
- Whether to append columns to existing index.
- Setting to True will add the new columns to existing index.
- When set to False, the current index will be dropped from the DataFrame.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- verify_integrity : bool, default False
- Check the new index for duplicates. Otherwise defer the check until
- necessary. Setting to False will improve the performance of this
- method.

- Returns
- -------
- DataFrame or None
- Changed row labels or None if ``inplace=True``.

- See Also
- --------
- DataFrame.reset_index : Opposite of set_index.
- DataFrame.reindex : Change to new indices or expand indices.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "month": [1, 4, 7, 10],
- ... "year": [2012, 2014, 2013, 2014],
- ... "sale": [55, 40, 84, 31],
- ... }
- ... )
- >>> df
- month year sale
- 0 1 2012 55
- 1 4 2014 40
- 2 7 2013 84
- 3 10 2014 31

- Set the index to become the 'month' column:

+ # Do we have a (boolean) DataFrame?
+ if isinstance(key, DataFrame):
+ return self.where(key)

- >>> df.set_index("month")
- year sale
- month
- 1 2012 55
- 4 2014 40
- 7 2013 84
- 10 2014 31

+ # Do we have a (boolean) 1d indexer?
+ if com.is_bool_indexer(key):
+ return self._getitem_bool_array(key)

- Create a MultiIndex using columns 'year' and 'month':

- >>> df.set_index(["year", "month"])
- sale
- year month
- 2012 1 55
- 2014 4 40
- 2013 7 84
- 2014 10 31

- Create a MultiIndex using an Index and a column:

- >>> df.set_index([pd.Index([1, 2, 3, 4]), "year"])
- month sale
- year
- 1 2012 1 55
- 2 2014 4 40
- 3 2013 7 84
- 4 2014 10 31

- Create a MultiIndex using two Series:

- >>> s = pd.Series([1, 2, 3, 4])
- >>> df.set_index([s, s**2])
- month year sale
- 1 1 1 2012 55
- 2 4 4 2014 40
- 3 9 7 2013 84
- 4 16 10 2014 31

+ # We are left with two options: a single key, and a collection of keys
+ is_single_key = isinstance(key, tuple) or not is_list_like(key)

+ if is_single_key:
+ if self.columns.nlevels > 1:
+ return self._getitem_multilevel(key)
+ indexer = self.columns.get_loc(key)
+ if is_integer(indexer):
+ indexer = [indexer]
+ else:
+ if is_iterator(key):
+ key = list(key)
+ indexer = self.columns._get_indexer_strict(key, "columns")[1]

- Append a column to the existing index:

- >>> df = df.set_index("month")
- >>> df.set_index("year", append=True)
- sale
- month year
- 1 2012 55
- 4 2014 40
- 7 2013 84
- 10 2014 31

- >>> df.set_index("year", append=False)
- sale
- year
- 2012 55
- 2014 40
- 2013 84
- 2014 31
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- self._check_inplace_and_allows_duplicate_labels(inplace)
- if not isinstance(keys, list):
- keys = [keys]

+ # take() does not accept boolean indexers
+ if getattr(indexer, "dtype", None) == bool:
+ indexer = np.where(indexer)[0]

- err_msg = (
- 'The parameter "keys" may be a column key, one-dimensional '
- "array, or a list containing only valid column keys and "
- "one-dimensional arrays."
- )

+ if isinstance(indexer, slice):
+ return self._slice(indexer, axis=1)

- missing: list[Hashable] = []
- for col in keys:
- if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
- # arrays are fine as long as they are one-dimensional
- # iterators get converted to list below
- if getattr(col, "ndim", 1) != 1:
- raise ValueError(err_msg)
- else:
- # everything else gets tried as a key; see GH 24969
- try:
- found = col in self.columns
- except TypeError as err:
- raise TypeError(
- f"{err_msg}. Received column of type {type(col)}"
- ) from err
- else:
- if not found:
- missing.append(col)

- if missing:
- raise KeyError(f"None of {missing} are in the columns")

+ data = self.take(indexer, axis=1)

+ if is_single_key:
+ # What does looking for a single key in a non-unique index return?
+ # The behavior is inconsistent. It returns a Series, except when
+ # - the key itself is repeated (test on data.shape, #9519), or
+ # - we have a MultiIndex on columns (test on self.columns, #21309)
+ if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
+ # GH#26490 using data[key] can cause RecursionError
+ return data._get_item(key)

+ return data
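# For orientation, the dispatch above is what gives ``[]`` its familiar
# behaviour (toy frame): a single hashable label returns a Series, while a
# list of labels goes through take() and returns a DataFrame.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
assert isinstance(df["a"], pd.Series)        # single-key fast path
assert isinstance(df[["a"]], pd.DataFrame)   # list key -> take(indexer, axis=1)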
- if inplace:
- frame = self
- else:
- frame = self.copy(deep=False)

- arrays: list[Index] = []
- names: list[Hashable] = []
- if append:
- names = list(self.index.names)
- if isinstance(self.index, MultiIndex):
- arrays.extend(
- self.index._get_level_values(i) for i in range(self.index.nlevels)
- )
- else:
- arrays.append(self.index)

- to_remove: set[Hashable] = set()
- for col in keys:
- if isinstance(col, MultiIndex):
- arrays.extend(col._get_level_values(n) for n in range(col.nlevels))
- names.extend(col.names)
- elif isinstance(col, (Index, Series)):
- # if Index then not MultiIndex (treated above)

- # error: Argument 1 to "append" of "list" has incompatible type
- # "Union[Index, Series]"; expected "Index"
- arrays.append(col) # type: ignore[arg-type]
- names.append(col.name)
- elif isinstance(col, (list, np.ndarray)):
- # error: Argument 1 to "append" of "list" has incompatible type
- # "Union[List[Any], ndarray]"; expected "Index"
- arrays.append(col) # type: ignore[arg-type]
- names.append(None)
- elif isinstance(col, abc.Iterator):
- # error: Argument 1 to "append" of "list" has incompatible type
- # "List[Any]"; expected "Index"
- arrays.append(list(col)) # type: ignore[arg-type]
- names.append(None)
- # from here, col can only be a column label
- else:
- arrays.append(frame[col])
- names.append(col)
- if drop:
- to_remove.add(col)

- if len(arrays[-1]) != len(self):
- # check newest element against length of calling frame, since
- # ensure_index_from_sequences would not raise for append=False.
- raise ValueError(
- f"Length mismatch: Expected {len(self)} rows, "
- f"received array of length {len(arrays[-1])}"
- )

- index = ensure_index_from_sequences(arrays, names)

+ def _getitem_bool_array(self, key):
+ # also raises Exception if object array with NA values
+ # warning here just in case -- previously __setitem__ was
+ # reindexing but __getitem__ was not; it seems more reasonable to
+ # go with the __setitem__ behavior since that is more consistent
+ # with all other indexing behavior
+ if isinstance(key, Series) and not key.index.equals(self.index):
+ warnings.warn(
+ "Boolean Series key will be reindexed to match DataFrame index.",
+ UserWarning,
+ stacklevel=find_stack_level(),
+ )
+ elif len(key) != len(self.index):
+ raise ValueError(
+ f"Item wrong length {len(key)} instead of {len(self.index)}."
+ )

- if verify_integrity and not index.is_unique:
- duplicates = index[index.duplicated()].unique()
- raise ValueError(f"Index has duplicate keys: {duplicates}")

+ # check_bool_indexer will throw exception if Series key cannot
+ # be reindexed to match DataFrame rows
+ key = check_bool_indexer(self.index, key)
+
+ if key.all():
+ return self.copy(deep=False)
+
+ indexer = key.nonzero()[0]
+ return self.take(indexer, axis=0)
+
+ def _getitem_multilevel(self, key):
+ # self.columns is a MultiIndex
+ loc = self.columns.get_loc(key)
+ if isinstance(loc, (slice, np.ndarray)):
+ new_columns = self.columns[loc]
+ result_columns = maybe_droplevels(new_columns, key)
+ result = self.iloc[:, loc]
+ result.columns = result_columns
+
+ # If there is only one column being returned, and its name is
+ # either an empty string, or a tuple with an empty string as its
+ # first element, then treat the empty string as a placeholder
+ # and return the column as if the user had provided that empty
+ # string in the key. If the result is a Series, exclude the
+ # implied empty string from its name.
+ if len(result.columns) == 1:
+ # e.g. test_frame_getitem_multicolumn_empty_level,
+ # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
+ top = result.columns[0]
+ if isinstance(top, tuple):
+ top = top[0]
+ if top == "":
+ result = result[""]
+ if isinstance(result, Series):
+ result = self._constructor_sliced(
+ result, index=self.index, name=key
+ )
+
+ return result
+ else:
+ # loc is neither a slice nor ndarray, so must be an int
+ return self._ixs(loc, axis=1)
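# The boolean-array path above backs ordinary mask indexing (toy frame);
# a misaligned boolean Series key is reindexed first, with a UserWarning.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
print(df[df["a"] > 1])             # keeps rows where the mask is True
assert df[df["a"] > 0].equals(df)  # an all-True mask returns a copy of self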
- # use set to handle duplicate column names gracefully in case of drop
- for c in to_remove:
- del frame[c]

+ def _get_value(self, index, col, takeable: bool = False) -> Scalar:
+ """
+ Quickly retrieve single value at passed column and index.

- # clear up memory usage
- index._cleanup()

+ Parameters
+ ----------
+ index : row label
+ col : column label
+ takeable : interpret the index/col as indexers, default False

- frame.index = index

+ Returns
+ -------
+ scalar

- if not inplace:
- return frame
- return None

- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: Literal[False] = ...,
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] | None = None,
- ) -> DataFrame: ...

- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: Literal[True],
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] | None = None,
- ) -> None: ...

- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: bool = ...,
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] | None = None,
- ) -> DataFrame | None: ...

- def reset_index(
- self,
- level: IndexLabel | None = None,
- *,
- drop: bool = False,
- inplace: bool = False,
- col_level: Hashable = 0,
- col_fill: Hashable = "",
- allow_duplicates: bool | lib.NoDefault = lib.no_default,
- names: Hashable | Sequence[Hashable] | None = None,
- ) -> DataFrame | None:
 """
+ if takeable:
+ series = self._ixs(col, axis=1)
+ return series._values[index]

- Reset the index, or a level of it.

- Reset the index of the DataFrame, and use the default one instead.
- If the DataFrame has a MultiIndex, this method can remove one or more
- levels.

+ series = self._get_item(col)
+ engine = self.index._engine

- Parameters
- ----------
- level : int, str, tuple, or list, default None
- Only remove the given levels from the index. Removes all levels by
- default.
- drop : bool, default False
- Do not try to insert index into dataframe columns. This resets
- the index to the default integer index.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- col_level : int or str, default 0
- If the columns have multiple levels, determines which level the
- labels are inserted into. By default it is inserted into the first
- level.
- col_fill : object, default ''
- If the columns have multiple levels, determines how the other
- levels are named. If None then the index name is repeated.
- allow_duplicates : bool, optional, default lib.no_default
- Allow duplicate column labels to be created.
+ if not isinstance(self.index, MultiIndex):
+ # CategoricalIndex: Trying to use the engine fastpath may give incorrect
+ # results if our categories are integers that don't match our codes
+ # IntervalIndex: IntervalTree has no get_loc
+ row = self.index.get_loc(index)
+ return series._values[row]

- .. versionadded:: 1.5.0

+ # For MultiIndex going through engine effectively restricts us to
+ # same-length tuples; see test_get_set_value_no_partial_indexing
+ loc = engine.get_loc(index)
+ return series._values[loc]

- names : int, str or 1-dimensional list, default None
- Using the given string, rename the DataFrame column which contains the
- index data. If the DataFrame has a MultiIndex, this has to be a list
- with length equal to the number of levels.

+ def isetitem(self, loc, value) -> None:
+ """
+ Set the given value in the column with position `loc`.

- .. versionadded:: 1.5.0

+ This is a positional analogue to ``__setitem__``.

- Returns
- -------
- DataFrame or None
- DataFrame with the new index or None if ``inplace=True``.

+ Parameters
+ ----------
+ loc : int or sequence of ints
+ Index position for the column.
+ value : scalar or arraylike
+ Value(s) for the column.

 See Also
 --------
- DataFrame.set_index : Opposite of reset_index.
- DataFrame.reindex : Change to new indices or expand indices.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
+ DataFrame.iloc : Purely integer-location based indexing for selection by
+ position.

- Examples
- --------
- >>> df = pd.DataFrame(
- ... [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)],
- ... index=["falcon", "parrot", "lion", "monkey"],
- ... columns=("class", "max_speed"),
- ... )
- >>> df
- class max_speed
- falcon bird 389.0
- parrot bird 24.0
- lion mammal 80.5
- monkey mammal NaN

- When we reset the index, the old index is added as a column, and a
- new sequential index is used:

- >>> df.reset_index()
- index class max_speed
- 0 falcon bird 389.0
- 1 parrot bird 24.0
- 2 lion mammal 80.5
- 3 monkey mammal NaN

- We can use the `drop` parameter to avoid the old index being added as
- a column:

- >>> df.reset_index(drop=True)
- class max_speed
- 0 bird 389.0
- 1 bird 24.0
- 2 mammal 80.5
- 3 mammal NaN

- You can also use `reset_index` with `MultiIndex`.

- >>> index = pd.MultiIndex.from_tuples(
- ... [
- ... ("bird", "falcon"),
- ... ("bird", "parrot"),
- ... ("mammal", "lion"),
- ... ("mammal", "monkey"),
- ... ],
- ... names=["class", "name"],
- ... )
- >>> columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")])
- >>> df = pd.DataFrame(
- ... [(389.0, "fly"), (24.0, "fly"), (80.5, "run"), (np.nan, "jump")],
- ... index=index,
- ... columns=columns,
- ... )

+ Examples
+ --------
+ >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+ >>> df.isetitem(1, [5, 6])
 >>> df
- speed species
- max type
- class name
- bird falcon 389.0 fly
- parrot 24.0 fly
- mammal lion 80.5 run
- monkey NaN jump
+ A B
+ 0 1 5
+ 1 2 6
+ """
+ if isinstance(value, DataFrame):
+ if is_integer(loc):
+ loc = [loc]

- Using the `names` parameter, choose a name for the index column:

+ if len(loc) != len(value.columns):
+ raise ValueError(
+ f"Got {len(loc)} positions but value has {len(value.columns)} "
+ f"columns."
+ )

- >>> df.reset_index(names=["classes", "names"])
- classes names speed species
- max type
- 0 bird falcon 389.0 fly
- 1 bird parrot 24.0 fly
- 2 mammal lion 80.5 run
- 3 mammal monkey NaN jump

+ for i, idx in enumerate(loc):
+ arraylike, refs = self._sanitize_column(value.iloc[:, i])
+ self._iset_item_mgr(idx, arraylike, inplace=False, refs=refs)
+ return

- If the index has multiple levels, we can reset a subset of them:

- >>> df.reset_index(level="class")
- class speed species
- max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump

+ arraylike, refs = self._sanitize_column(value)
+ self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs)
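# A sketch of the positional set implemented above (hypothetical frame);
# unlike ``iloc`` assignment, ``isetitem`` inserts a fresh array instead of
# mutating the existing column's values in place.
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df.isetitem(0, np.array([10, 20]))  # set the column at position 0
assert df["a"].tolist() == [10, 20]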
- If we are not dropping the index, by default, it is placed in the top
- level. We can place it in another level:

- >>> df.reset_index(level="class", col_level=1)
- speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump

- When the index is inserted under another level, we can specify under
- which one with the parameter `col_fill`:

- >>> df.reset_index(level="class", col_level=1, col_fill="species")
- species speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump

+ def __setitem__(self, key, value) -> None:
 key = com.apply_if_callable(key, self)

- If we specify a nonexistent level for `col_fill`, it is created:

+ # see if we can slice the rows
+ if isinstance(key, slice):
+ slc = self.index._convert_slice_indexer(key, kind="getitem")
+ return self._setitem_slice(slc, value)

- >>> df.reset_index(level="class", col_level=1, col_fill="genus")
- genus speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- self._check_inplace_and_allows_duplicate_labels(inplace)
- if inplace:
- new_obj = self

+ if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
+ self._setitem_frame(key, value)
+ elif isinstance(key, (Series, np.ndarray, list, Index)):
+ self._setitem_array(key, value)
+ elif isinstance(value, DataFrame):
+ self._set_item_frame_value(key, value)
+ elif (
+ is_list_like(value)
+ and not self.columns.is_unique
+ and 1 < len(self.columns.get_indexer_for([key])) == len(value)
+ ):
+ # Column to set is duplicated
+ self._setitem_array([key], value)
 else:
- new_obj = self.copy(deep=False)
- if allow_duplicates is not lib.no_default:
- allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
+ # set column
+ self._set_item(key, value)

- new_index = default_index(len(new_obj))
- if level is not None:
- if not isinstance(level, (tuple, list)):
- level = [level]
- level = [self.index._get_level_number(lev) for lev in level]
- if len(level) < self.index.nlevels:
- new_index = self.index.droplevel(level)

+ def _setitem_slice(self, key: slice, value) -> None:
+ # NB: we can't just use self.loc[key] = value because that
+ # operates on labels and we need to operate positional for
+ # backwards-compat, xref GH#31469
+ self.iloc[key] = value

- if not drop:
- to_insert: Iterable[tuple[Any, Any | None]]

+ def _setitem_array(self, key, value) -> None:
+ # also raises Exception if object array with NA values
+ if com.is_bool_indexer(key):
+ # bool indexer is indexing along rows
+ if len(key) != len(self.index):
+ raise ValueError(
+ f"Item wrong length {len(key)} instead of {len(self.index)}!"
+ )
+ key = check_bool_indexer(self.index, key)
+ indexer = key.nonzero()[0]
+ if isinstance(value, DataFrame):
+ # GH#39931 reindex since iloc does not align
+ value = value.reindex(self.index.take(indexer))
+ self.iloc[indexer] = value

- default = "index" if "index" not in self else "level_0"
- names = self.index._get_default_index_names(names, default)

+ else:
+ # Note: unlike self.iloc[:, indexer] = value, this will
+ # never try to overwrite values inplace

- if isinstance(self.index, MultiIndex):
- to_insert = zip(reversed(self.index.levels), reversed(self.index.codes))
- else:
- to_insert = ((self.index, None),)

+ if isinstance(value, DataFrame):
+ check_key_length(self.columns, key, value)
+ for k1, k2 in zip(key, value.columns):
+ self[k1] = value[k2]

- multi_col = isinstance(self.columns, MultiIndex)
- for j, (lev, lab) in enumerate(to_insert, start=1):
- i = self.index.nlevels - j
- if level is not None and i not in level:
- continue
- name = names[i]
- if multi_col:
- col_name = list(name) if isinstance(name, tuple) else [name]
- if col_fill is None:
- if len(col_name) not in (1, self.columns.nlevels):
- raise ValueError(
- "col_fill=None is incompatible "
- f"with incomplete column name {name}"
- )
- col_fill = col_name[0]

+ elif not is_list_like(value):
+ for col in key:
+ self[col] = value

- lev_num = self.columns._get_level_number(col_level)
- name_lst = [col_fill] * lev_num + col_name
- missing = self.columns.nlevels - len(name_lst)
- name_lst += [col_fill] * missing
- name = tuple(name_lst)

+ elif isinstance(value, np.ndarray) and value.ndim == 2:
+ self._iset_not_inplace(key, value)

- # to ndarray and maybe infer different dtype
- level_values = lev._values
- if level_values.dtype == np.object_:
- level_values = lib.maybe_convert_objects(level_values)

+ elif np.ndim(value) > 1:
+ # list of lists
+ value = DataFrame(value).values
+ self._setitem_array(key, value)
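# For reference, the list-like-key branches above serve assignments such as
# this one (toy frame; the 2-D ndarray case routes through _iset_not_inplace).
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
df[["a", "b"]] = np.array([[10, 30], [20, 40]])
assert df["b"].tolist() == [30, 40]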
- if lab is not None:
- # if we have the codes, extract the values with a mask
- level_values = algorithms.take(
- level_values, lab, allow_fill=True, fill_value=lev._na_value
- )

- new_obj.insert(
- 0,
- name,
- level_values,
- allow_duplicates=allow_duplicates,
- )

- new_obj.index = new_index
- if not inplace:
- return new_obj

- return None

 # ----------------------------------------------------------------------
- # Reindex-based selection methods

- @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
- def isna(self) -> DataFrame:
- res_mgr = self._mgr.isna(func=isna)
- result = self._constructor_from_mgr(res_mgr, axes=res_mgr.axes)
- return result.__finalize__(self, method="isna")

+ def _iset_not_inplace(self, key, value) -> None:
+ # GH#39510 when setting with df[key] = obj with a list-like key and
+ # list-like value, we iterate over those listlikes and set columns
+ # one at a time. This is different from dispatching to
+ # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
+ # data inplace, whereas this will insert new arrays.

- @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
- def isnull(self) -> DataFrame:
- """
- DataFrame.isnull is an alias for DataFrame.isna.
- """
- return self.isna()

+ def igetitem(obj, i: int):
+ # Note: we catch DataFrame obj before getting here, but
+ # hypothetically would return obj.iloc[:, i]
+ if isinstance(obj, np.ndarray):
+ return obj[..., i]
+ else:
+ return obj[i]

- @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
- def notna(self) -> DataFrame:
- return ~self.isna()

+ if self.columns.is_unique:
+ if np.shape(value)[-1] != len(key):
+ raise ValueError("Columns must be same length as key")

- @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
- def notnull(self) -> DataFrame:
- """
- DataFrame.notnull is an alias for DataFrame.notna.
- """
- return ~self.isna()

+ for i, col in enumerate(key):
+ self[col] = igetitem(value, i)

- @overload
- def dropna(
- self,
- *,
- axis: Axis = ...,
- how: AnyAll | lib.NoDefault = ...,
- thresh: int | lib.NoDefault = ...,
- subset: IndexLabel = ...,
- inplace: Literal[False] = ...,
- ignore_index: bool = ...,
- ) -> DataFrame: ...

+ else:
+ ilocs = self.columns.get_indexer_non_unique(key)[0]
+ if (ilocs < 0).any():
+ # key entries not in self.columns
+ raise NotImplementedError

- @overload
- def dropna(
- self,
- *,
- axis: Axis = ...,
- how: AnyAll | lib.NoDefault = ...,
- thresh: int | lib.NoDefault = ...,
- subset: IndexLabel = ...,
- inplace: Literal[True],
- ignore_index: bool = ...,
- ) -> None: ...

+ if np.shape(value)[-1] != len(ilocs):
+ raise ValueError("Columns must be same length as key")

- def dropna(
- self,
- *,
- axis: Axis = 0,
- how: AnyAll | lib.NoDefault = lib.no_default,
- thresh: int | lib.NoDefault = lib.no_default,
- subset: IndexLabel | AnyArrayLike | None = None,
- inplace: bool = False,
- ignore_index: bool = False,
- ) -> DataFrame | None:
- """
- Remove missing values.

+ assert np.ndim(value) <= 2

- See the :ref:`User Guide <missing_data>` for more on which values are
- considered missing, and how to work with missing data.

+ orig_columns = self.columns

- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Determine if rows or columns which contain missing values are
- removed.

+ # Using self.iloc[:, i] = ...
may set values inplace, which + # by convention we do not do in __setitem__ + try: + self.columns = Index(range(len(self.columns))) + for i, iloc in enumerate(ilocs): + self[iloc] = igetitem(value, i) + finally: + self.columns = orig_columns + + def _setitem_frame(self, key, value) -> None: + # support boolean setting with DataFrame input, e.g. + # df[df > df2] = 0 + if isinstance(key, np.ndarray): + if key.shape != self.shape: + raise ValueError("Array conditional must be same shape as self") + key = self._constructor(key, **self._construct_axes_dict(), copy=False) + + if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): + raise TypeError( + "Must pass DataFrame or 2-d ndarray with boolean values only" + ) + + self._where(-key, value, inplace=True) + + def _set_item_frame_value(self, key, value: DataFrame) -> None: + self._ensure_valid_index(value) + + # align columns + if key in self.columns: + loc = self.columns.get_loc(key) + cols = self.columns[loc] + len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols) + if len_cols != len(value.columns): + raise ValueError("Columns must be same length as key") + + # align right-hand-side columns if self.columns + # is multi-index and self[key] is a sub-frame + if isinstance(self.columns, MultiIndex) and isinstance( + loc, (slice, Series, np.ndarray, Index) + ): + cols_droplevel = maybe_droplevels(cols, key) + if len(cols_droplevel) and not cols_droplevel.equals(value.columns): + value = value.reindex(cols_droplevel, axis=1) - * 0, or 'index' : Drop rows which contain missing values. - * 1, or 'columns' : Drop columns which contain missing value. + for col, col_droplevel in zip(cols, cols_droplevel): + self[col] = value[col_droplevel] + return - Only a single axis is allowed. + if is_scalar(cols): + self[cols] = value[value.columns[0]] + return - how : {'any', 'all'}, default 'any' - Determine if row or column is removed from DataFrame, when we have - at least one NA or all NA. + locs: np.ndarray | list + if isinstance(loc, slice): + locs = np.arange(loc.start, loc.stop, loc.step) + elif is_scalar(loc): + locs = [loc] + else: + locs = loc.nonzero()[0] - * 'any' : If any NA values are present, drop that row or column. - * 'all' : If all values are NA, drop that row or column. + return self.isetitem(locs, value) - thresh : int, optional - Require that many non-NA values. Cannot be combined with how. - subset : column label or iterable of labels, optional - Labels along other axis to consider, e.g. if you are dropping rows - these would be a list of columns to include. - inplace : bool, default False - Whether to modify the DataFrame rather than creating a new one. - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + if len(value.columns) > 1: + raise ValueError( + "Cannot set a DataFrame with multiple columns to the single " + f"column {key}" + ) + elif len(value.columns) == 0: + raise ValueError( + f"Cannot set a DataFrame without columns to the column {key}" + ) - .. versionadded:: 2.0.0 + self[key] = value[value.columns[0]] - Returns - ------- - DataFrame or None - DataFrame with NA entries dropped from it or None if ``inplace=True``. 
+ def _iset_item_mgr(
+ self,
+ loc: int | slice | np.ndarray,
+ value,
+ inplace: bool = False,
+ refs: BlockValuesRefs | None = None,
+ ) -> None:
+ # when called from _set_item_mgr loc can be anything returned from get_loc
+ self._mgr.iset(loc, value, inplace=inplace, refs=refs)

- See Also
- --------
- DataFrame.isna: Indicate missing values.
- DataFrame.notna : Indicate existing (non-missing) values.
- DataFrame.fillna : Replace missing values.
- Series.dropna : Drop missing values.
- Index.dropna : Drop missing indices.

+ def _set_item_mgr(
+ self, key, value: ArrayLike, refs: BlockValuesRefs | None = None
+ ) -> None:
+ try:
+ loc = self._info_axis.get_loc(key)
+ except KeyError:
+ # This item wasn't present, just insert at end
+ self._mgr.insert(len(self._info_axis), key, value, refs)
+ else:
+ self._iset_item_mgr(loc, value, refs=refs)

- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "name": ["Alfred", "Batman", "Catwoman"],
- ... "toy": [np.nan, "Batmobile", "Bullwhip"],
- ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
- ... }
- ... )
- >>> df
- name toy born
- 0 Alfred NaN NaT
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT

- Drop the rows where at least one element is missing.

- >>> df.dropna()
- name toy born
- 1 Batman Batmobile 1940-04-25

- Drop the columns where at least one element is missing.

- >>> df.dropna(axis="columns")
- name
- 0 Alfred
- 1 Batman
- 2 Catwoman

- Drop the rows where all elements are missing.

+ def _set_item(self, key, value) -> None:
+ """
+ Add series to DataFrame in specified column.
+
+ If series is a numpy-array (not a Series/TimeSeries), it must be the
+ same length as the DataFrame's index or an error will be thrown.

- >>> df.dropna(how="all")
- name toy born
- 0 Alfred NaN NaT
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT

+ Series/TimeSeries will be conformed to the DataFrame's index to
+ ensure homogeneity.
+ """
+ value, refs = self._sanitize_column(value)

- Keep only the rows with at least 2 non-NA values.

+ if (
+ key in self.columns
+ and value.ndim == 1
+ and not isinstance(value.dtype, ExtensionDtype)
+ ):
+ # broadcast across multiple columns if necessary
+ if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
+ existing_piece = self[key]
+ if isinstance(existing_piece, DataFrame):
+ value = np.tile(value, (len(existing_piece.columns), 1)).T
+ refs = None

- >>> df.dropna(thresh=2)
- name toy born
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT

+ self._set_item_mgr(key, value, refs)
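# The index conformance described in the docstring above, seen through the
# public ``df[col] = series`` form (illustrative frame): the Series is
# aligned to the frame's index and missing labels become NaN.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df["b"] = pd.Series({2: 30.0, 0: 10.0})  # unordered, partial index
assert df["b"].isna().tolist() == [False, True, False]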
- Define in which columns to look for missing values.

+ def _set_value(
+ self, index: IndexLabel, col, value: Scalar, takeable: bool = False
+ ) -> None:
+ """
+ Put single value at passed column and index.

- >>> df.dropna(subset=["name", "toy"])
- name toy born
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT

+ Parameters
+ ----------
+ index : Label
+ row label
+ col : Label
+ column label
+ value : scalar
+ takeable : bool, default False
+ Sets whether or not index/col are interpreted as indexers
 """
- if (how is not lib.no_default) and (thresh is not lib.no_default):
- raise TypeError(
- "You cannot set both the how and thresh arguments at the same time."
- )
+ try:
+ if takeable:
+ icol = col
+ iindex = cast(int, index)
+ else:
+ icol = self.columns.get_loc(col)
+ iindex = self.index.get_loc(index)
+ self._mgr.column_setitem(icol, iindex, value, inplace_only=True)

- if how is lib.no_default:
- how = "any"

+ except (KeyError, TypeError, ValueError, LossySetitemError):
+ # get_loc might raise a KeyError for missing labels (falling back
+ # to (i)loc will do expansion of the index)
+ # column_setitem will do validation that may raise TypeError,
+ # ValueError, or LossySetitemError
+ # set using a non-recursive method & reset the cache
+ if takeable:
+ self.iloc[index, col] = value
+ else:
+ self.loc[index, col] = value

- inplace = validate_bool_kwarg(inplace, "inplace")
- if isinstance(axis, (tuple, list)):
- # GH20987
- raise TypeError("supplying multiple axes to axis is no longer supported.")

+ except InvalidIndexError as ii_err:
+ # GH48729: Seems like you are trying to assign a value to a
+ # row when only scalar options are permitted
+ raise InvalidIndexError(
+ f"You can only assign a scalar value not a {type(value)}"
+ ) from ii_err

- axis = self._get_axis_number(axis)
- agg_axis = 1 - axis

+ def _ensure_valid_index(self, value) -> None:
+ """
+ Ensure that if we don't have an index, that we can create one from the
+ passed value.
+ """
+ # GH5632, make sure that we are a Series convertible
+ if not len(self.index) and is_list_like(value) and len(value):
+ if not isinstance(value, DataFrame):
+ try:
+ value = Series(value)
+ except (ValueError, NotImplementedError, TypeError) as err:
+ raise ValueError(
+ "Cannot set a frame with no defined index "
+ "and a value that cannot be converted to a Series"
+ ) from err

- agg_obj = self
- if subset is not None:
- # subset needs to be list
- if not is_list_like(subset):
- subset = [cast(Hashable, subset)]
- ax = self._get_axis(agg_axis)
- indices = ax.get_indexer_for(subset)
- check = indices == -1
- if check.any():
- raise KeyError(np.array(subset)[check].tolist())
- agg_obj = self.take(indices, axis=agg_axis)

+ # GH31368 preserve name of index
+ index_copy = value.index.copy()
+ if self.index.name is not None:
+ index_copy.name = self.index.name

- if thresh is not lib.no_default:
- count = agg_obj.count(axis=agg_axis)
- mask = count >= thresh
- elif how == "any":
- # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
- mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
- elif how == "all":
- # faster equivalent to 'agg_obj.count(agg_axis) > 0'
- mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
- else:
- raise ValueError(f"invalid how option: {how}")

+ self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)

- if np.all(mask):
- result = self.copy(deep=False)
- else:
- result = self.loc(axis=axis)[mask]

+ def _box_col_values(self, values: SingleBlockManager, loc: int) -> Series:
+ """
+ Provide boxed values for a column.
+ """
+ # Lookup in columns so that if e.g. a str datetime was passed
+ # we attach the Timestamp object as the name.
+ name = self.columns[loc] + # We get index=self.index bc values is a SingleBlockManager + obj = self._constructor_sliced_from_mgr(values, axes=values.axes) + obj._name = name + return obj.__finalize__(self) - if ignore_index: - result.index = default_index(len(result)) + def _get_item(self, item: Hashable) -> Series: + loc = self.columns.get_loc(item) + return self._ixs(loc, axis=1) - if not inplace: - return result - self._update_inplace(result) - return None + # ---------------------------------------------------------------------- + # Unsorted @overload - def drop_duplicates( - self, - subset: Hashable | Iterable[Hashable] | None = ..., - *, - keep: DropKeep = ..., - inplace: Literal[True], - ignore_index: bool = ..., - ) -> None: ... + def query( + self, expr: str, *, inplace: Literal[False] = ..., **kwargs + ) -> DataFrame: ... @overload - def drop_duplicates( - self, - subset: Hashable | Iterable[Hashable] | None = ..., - *, - keep: DropKeep = ..., - inplace: Literal[False] = ..., - ignore_index: bool = ..., - ) -> DataFrame: ... + def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: ... @overload - def drop_duplicates( - self, - subset: Hashable | Iterable[Hashable] | None = ..., - *, - keep: DropKeep = ..., - inplace: bool = ..., - ignore_index: bool = ..., + def query( + self, expr: str, *, inplace: bool = ..., **kwargs ) -> DataFrame | None: ... - def drop_duplicates( - self, - subset: Hashable | Iterable[Hashable] | None = None, - *, - keep: DropKeep = "first", - inplace: bool = False, - ignore_index: bool = False, - ) -> DataFrame | None: + def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None: """ - Return DataFrame with duplicate rows removed. - - Considering certain columns is optional. Indexes, including time indexes - are ignored. + Query the columns of a DataFrame with a boolean expression. Parameters ---------- - subset : column label or iterable of labels, optional - Only consider certain columns for identifying duplicates, by - default use all of the columns. - keep : {'first', 'last', ``False``}, default 'first' - Determines which duplicates (if any) to keep. - - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - - inplace : bool, default ``False`` - Whether to modify the DataFrame rather than creating a new one. - ignore_index : bool, default ``False`` - If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. - - Returns - ------- - DataFrame or None - DataFrame with duplicates removed or None if ``inplace=True``. - - See Also - -------- - DataFrame.value_counts: Count unique combinations of columns. - - Notes - ----- - This method requires columns specified by ``subset`` to be of hashable type. - Passing unhashable columns will raise a ``TypeError``. - - Examples - -------- - Consider dataset containing ramen rating. - - >>> df = pd.DataFrame( - ... { - ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], - ... "style": ["cup", "cup", "cup", "pack", "pack"], - ... "rating": [4, 4, 3.5, 15, 5], - ... } - ... ) - >>> df - brand style rating - 0 Yum Yum cup 4.0 - 1 Yum Yum cup 4.0 - 2 Indomie cup 3.5 - 3 Indomie pack 15.0 - 4 Indomie pack 5.0 - - By default, it removes duplicate rows based on all columns. 
-
-        >>> df.drop_duplicates()
-             brand style  rating
-        0  Yum Yum   cup     4.0
-        2  Indomie   cup     3.5
-        3  Indomie  pack    15.0
-        4  Indomie  pack     5.0
-
-        To remove duplicates on specific column(s), use ``subset``.
-
-        >>> df.drop_duplicates(subset=["brand"])
-             brand style  rating
-        0  Yum Yum   cup     4.0
-        2  Indomie   cup     3.5
-
-        To remove duplicates and keep last occurrences, use ``keep``.
+        expr : str
+            The query string to evaluate.

-        >>> df.drop_duplicates(subset=["brand", "style"], keep="last")
-             brand style  rating
-        1  Yum Yum   cup     4.0
-        2  Indomie   cup     3.5
-        4  Indomie  pack     5.0
-        """
-        if self.empty:
-            return self.copy(deep=False)
+        You can refer to variables
+        in the environment by prefixing them with an '@' character like
+        ``@a + b``.

-        inplace = validate_bool_kwarg(inplace, "inplace")
-        ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
+        You can refer to column names that are not valid Python variable names
+        by surrounding them in backticks. Thus, column names containing spaces
+        or punctuations (besides underscores) or starting with digits must be
+        surrounded by backticks. (For example, a column named "Area (cm^2)" would
+        be referenced as ```Area (cm^2)```). Column names which are Python keywords
+        (like "list", "for", "import", etc) cannot be used.

-        result = self[-self.duplicated(subset, keep=keep)]
-        if ignore_index:
-            result.index = default_index(len(result))
+        For example, if one of your columns is called ``a a`` and you want
+        to sum it with ``b``, your query should be ```a a` + b``.

-        if inplace:
-            self._update_inplace(result)
-            return None
-        else:
-            return result

-    def duplicated(
-        self,
-        subset: Hashable | Iterable[Hashable] | None = None,
-        keep: DropKeep = "first",
-    ) -> Series:
-        """
-        Return boolean Series denoting duplicate rows.

-        Considering certain columns is optional.

-        Parameters
-        ----------
-        subset : column label or iterable of labels, optional
-            Only consider certain columns for identifying duplicates, by
-            default use all of the columns.
-        keep : {'first', 'last', False}, default 'first'
-            Determines which duplicates (if any) to mark.

-        - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
-        - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
-        - False : Mark all duplicates as ``True``.
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        **kwargs
+            See the documentation for :func:`eval` for complete details
+            on the keyword arguments accepted by :meth:`DataFrame.query`.

         Returns
         -------
-        Series
-            Boolean series for each duplicated rows.
+        DataFrame or None
+            DataFrame resulting from the provided query expression or
+            None if ``inplace=True``.

         See Also
         --------
-        Index.duplicated : Equivalent method on index.
-        Series.duplicated : Equivalent method on Series.
-        Series.drop_duplicates : Remove duplicate values from Series.
-        DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
+        eval : Evaluate a string describing operations on
+            DataFrame columns.
+        DataFrame.eval : Evaluate a string describing operations on
+            DataFrame columns.

-        Examples
-        --------
-        Consider dataset containing ramen rating.
+        Notes
+        -----
+        The result of the evaluation of this expression is first passed to
+        :attr:`DataFrame.loc` and if that fails because of a
+        multidimensional key (e.g., a DataFrame) then the result will be passed
+        to :meth:`DataFrame.__getitem__`.

-        >>> df = pd.DataFrame(
-        ...     {
-        ...         "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"],
-        ...         "style": ["cup", "cup", "cup", "pack", "pack"],
-        ...         "rating": [4, 4, 3.5, 15, 5],
-        ...     }
-        ... )
-        >>> df
-             brand style  rating
-        0  Yum Yum   cup     4.0
-        1  Yum Yum   cup     4.0
-        2  Indomie   cup     3.5
-        3  Indomie  pack    15.0
-        4  Indomie  pack     5.0
+        This method uses the top-level :func:`eval` function to
+        evaluate the passed query.

-        By default, for each set of duplicated values, the first occurrence
-        is set on False and all others on True.
+        The :meth:`~pandas.DataFrame.query` method uses a slightly
+        modified Python syntax by default. For example, the ``&`` and ``|``
+        (bitwise) operators have the precedence of their boolean cousins,
+        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
+        however the semantics are different.

-        >>> df.duplicated()
-        0    False
-        1     True
-        2    False
-        3    False
-        4    False
-        dtype: bool
+        You can change the semantics of the expression by passing the keyword
+        argument ``parser='python'``. This enforces the same semantics as
+        evaluation in Python space. Likewise, you can pass ``engine='python'``
+        to evaluate an expression using Python itself as a backend. This is not
+        recommended as it is inefficient compared to using ``numexpr`` as the
+        engine.

-        By using 'last', the last occurrence of each set of duplicated values
-        is set on False and all others on True.
+        The :attr:`DataFrame.index` and
+        :attr:`DataFrame.columns` attributes of the
+        :class:`~pandas.DataFrame` instance are placed in the query namespace
+        by default, which allows you to treat both the index and columns of the
+        frame as a column in the frame.
+        The identifier ``index`` is used for the frame index; you can also
+        use the name of the index to identify it in a query. Please note that
+        Python keywords may not be used as identifiers.

-        >>> df.duplicated(keep="last")
-        0     True
-        1    False
-        2    False
-        3    False
-        4    False
-        dtype: bool
+        For further details and examples see the ``query`` documentation in
+        :ref:`indexing <indexing.query>`.

-        By setting ``keep`` on False, all duplicates are True.
+        *Backtick quoted variables*

-        >>> df.duplicated(keep=False)
-        0     True
-        1     True
-        2    False
-        3    False
-        4    False
-        dtype: bool
+        Backtick quoted variables are parsed as literal Python code and
+        are converted internally to a Python valid identifier.
+        This can lead to the following problems.

-        To find duplicates on specific column(s), use ``subset``.
+        During parsing a number of disallowed characters inside the backtick
+        quoted string are replaced by strings that are allowed as a Python identifier.
+        These characters include all operators in Python, the space character, the
+        question mark, the exclamation mark, the dollar sign, and the euro sign.
+        For other characters that fall outside the ASCII range (U+0001..U+007F)
+        and those that are not further specified in PEP 3131,
+        the query parser will raise an error.
+ This excludes whitespace different than the space character, + but also the hashtag (as it is used for comments) and the backtick + itself (backtick can also not be escaped). + + In a special case, quotes that make a pair around a backtick can + confuse the parser. + For example, ```it's` > `that's``` will raise an error, + as it forms a quoted string (``'s > `that'``) with a backtick inside. + + See also the Python documentation about lexical analysis + (https://docs.python.org/3/reference/lexical_analysis.html) + in combination with the source code in :mod:`pandas.core.computation.parsing`. - >>> df.duplicated(subset=["brand"]) - 0 False - 1 True - 2 False - 3 True - 4 True - dtype: bool - """ + Examples + -------- + >>> df = pd.DataFrame( + ... {"A": range(1, 6), "B": range(10, 0, -2), "C C": range(10, 5, -1)} + ... ) + >>> df + A B C C + 0 1 10 10 + 1 2 8 9 + 2 3 6 8 + 3 4 4 7 + 4 5 2 6 + >>> df.query("A > B") + A B C C + 4 5 2 6 - if self.empty: - return self._constructor_sliced(dtype=bool) + The previous expression is equivalent to - def f(vals) -> tuple[np.ndarray, int]: - labels, shape = algorithms.factorize(vals, size_hint=len(self)) - return labels.astype("i8"), len(shape) + >>> df[df.A > df.B] + A B C C + 4 5 2 6 - if subset is None: - subset = self.columns - elif ( - not np.iterable(subset) - or isinstance(subset, str) - or (isinstance(subset, tuple) and subset in self.columns) - ): - subset = (subset,) + For columns with spaces in their name, you can use backtick quoting. - # needed for mypy since can't narrow types using np.iterable - subset = cast(Sequence, subset) + >>> df.query("B == `C C`") + A B C C + 0 1 10 10 - # Verify all columns in subset exist in the queried dataframe - # Otherwise, raise a KeyError, same as if you try to __getitem__ with a - # key that doesn't exist. - diff = set(subset) - set(self.columns) - if diff: - raise KeyError(Index(diff)) + The previous expression is equivalent to - if len(subset) == 1 and self.columns.is_unique: - # GH#45236 This is faster than get_group_index below - result = self[next(iter(subset))].duplicated(keep) - result.name = None - else: - vals = (col.values for name, col in self.items() if name in subset) - labels, shape = map(list, zip(*map(f, vals))) + >>> df[df.B == df["C C"]] + A B C C + 0 1 10 10 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + if not isinstance(expr, str): + msg = f"expr must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + kwargs["level"] = kwargs.pop("level", 0) + 1 + kwargs["target"] = None + res = self.eval(expr, **kwargs) - ids = get_group_index(labels, tuple(shape), sort=False, xnull=False) - result = self._constructor_sliced(duplicated(ids, keep), index=self.index) - return result.__finalize__(self, method="duplicated") + try: + result = self.loc[res] + except ValueError: + # when res is multi-dimensional loc raises, but this is sometimes a + # valid query + result = self[res] - # ---------------------------------------------------------------------- - # Sorting - # error: Signature of "sort_values" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def sort_values( - self, - by: IndexLabel, - *, - axis: Axis = ..., - ascending=..., - inplace: Literal[False] = ..., - kind: SortKind = ..., - na_position: NaPosition = ..., - ignore_index: bool = ..., - key: ValueKeyFunc = ..., - ) -> DataFrame: ... 
+        if inplace:
+            self._update_inplace(result)
+            return None
+        else:
+            return result

     @overload
-    def sort_values(
-        self,
-        by: IndexLabel,
-        *,
-        axis: Axis = ...,
-        ascending=...,
-        inplace: Literal[True],
-        kind: SortKind = ...,
-        na_position: str = ...,
-        ignore_index: bool = ...,
-        key: ValueKeyFunc = ...,
-    ) -> None: ...

-    def sort_values(
-        self,
-        by: IndexLabel,
-        *,
-        axis: Axis = 0,
-        ascending: bool | list[bool] | tuple[bool, ...] = True,
-        inplace: bool = False,
-        kind: SortKind = "quicksort",
-        na_position: str = "last",
-        ignore_index: bool = False,
-        key: ValueKeyFunc | None = None,
-    ) -> DataFrame | None:
+    def eval(self, expr: str, *, inplace: bool = False, **kwargs):
         """
-        Sort by the values along either axis.
+        Evaluate a string describing operations on DataFrame columns.
+
+        Operates on columns only, not specific rows or elements. This allows
+        `eval` to run arbitrary code, which can make you vulnerable to code
+        injection if you pass user input to this function.

         Parameters
         ----------
-        by : str or list of str
-            Name or list of names to sort by.

-            - if `axis` is 0 or `'index'` then `by` may contain index
-              levels and/or column labels.
-            - if `axis` is 1 or `'columns'` then `by` may contain column
-              levels and/or index labels.
-        axis : "{0 or 'index', 1 or 'columns'}", default 0
-            Axis to be sorted.
-        ascending : bool or list of bool, default True
-            Sort ascending vs. descending. Specify list for multiple sort
-            orders. If this is a list of bools, must match the length of
-            the by.
-        inplace : bool, default False
-            If True, perform operation in-place.
-        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
-            Choice of sorting algorithm. See also :func:`numpy.sort` for more
-            information. `mergesort` and `stable` are the only stable algorithms. For
-            DataFrames, this option is only applied when sorting on a single
-            column or label.
-        na_position : {'first', 'last'}, default 'last'
-            Puts NaNs at the beginning if `first`; `last` puts NaNs at the
-            end.
-        ignore_index : bool, default False
-            If True, the resulting axis will be labeled 0, 1, …, n - 1.
-        key : callable, optional
-            Apply the key function to the values
-            before sorting. This is similar to the `key` argument in the
-            builtin :meth:`sorted` function, with the notable difference that
-            this `key` function should be *vectorized*. It should expect a
-            ``Series`` and return a Series with the same shape as the input.
-            It will be applied to each column in `by` independently. The values in the
-            returned Series will be used as the keys for sorting.
+        expr : str
+            The expression string to evaluate.
+        inplace : bool, default False
+            If the expression contains an assignment, whether to perform the
+            operation inplace and mutate the existing DataFrame. Otherwise,
+            a new DataFrame is returned.
+        **kwargs
+            See the documentation for :func:`eval` for complete details
+            on the keyword arguments accepted by
+            :meth:`~pandas.DataFrame.query`.

         Returns
         -------
-        DataFrame or None
-            DataFrame with sorted values or None if ``inplace=True``.
+        ndarray, scalar, pandas object, or None
+            The result of the evaluation or None if ``inplace=True``.

         See Also
         --------
-        DataFrame.sort_index : Sort a DataFrame by the index.
-        Series.sort_values : Similar method for a Series.
+        DataFrame.query : Evaluates a boolean expression to query the columns
+            of a frame.
+        DataFrame.assign : Can evaluate an expression or function to create new
+            values for a column.
+        eval : Evaluate a Python expression as a string using various
+            backends.
+
+        Notes
+        -----
+        For more details see the API documentation for :func:`~eval`.
+        For detailed examples see :ref:`enhancing performance with eval
+        <enhancingperf.eval>`.

         Examples
         --------
-        >>> df = pd.DataFrame(
-        ...     {
-        ...         "col1": ["A", "A", "B", np.nan, "D", "C"],
-        ...         "col2": [2, 1, 9, 8, 7, 4],
-        ...         "col3": [0, 1, 9, 4, 2, 3],
-        ...         "col4": ["a", "B", "c", "D", "e", "F"],
-        ...     }
+        >>> df = pd.DataFrame({"A": range(1, 6), "B": range(10, 0, -2)})
+        >>> df
+           A   B
+        0  1  10
+        1  2   8
+        2  3   6
+        3  4   4
+        4  5   2
+        >>> df.eval("A + B")
+        0    11
+        1    10
+        2     9
+        3     8
+        4     7
+        dtype: int64
+
+        Assignment is allowed though by default the original DataFrame is not
+        modified.
+
+        >>> df.eval("C = A + B")
+           A   B   C
+        0  1  10  11
+        1  2   8  10
+        2  3   6   9
+        3  4   4   8
+        4  5   2   7
+        >>> df
+           A   B
+        0  1  10
+        1  2   8
+        2  3   6
+        3  4   4
+        4  5   2
+
+        Multiple columns can be assigned to using multi-line expressions:
+
+        >>> df.eval(
+        ...     '''
+        ...     C = A + B
+        ...     D = A - B
+        ...     '''
         ... )
-        >>> df
-          col1  col2  col3 col4
-        0    A     2     0    a
-        1    A     1     1    B
-        2    B     9     9    c
-        3  NaN     8     4    D
-        4    D     7     2    e
-        5    C     4     3    F
+           A   B   C   D
+        0  1  10  11  -9
+        1  2   8  10  -6
+        2  3   6   9  -3
+        3  4   4   8   0
+        4  5   2   7   3
+        """
+        from pandas.core.computation.eval import eval as _eval

-        **Sort by a single column**
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        kwargs["level"] = kwargs.pop("level", 0) + 1
+        index_resolvers = self._get_index_resolvers()
+        column_resolvers = self._get_cleaned_column_resolvers()
+        resolvers = column_resolvers, index_resolvers
+        if "target" not in kwargs:
+            kwargs["target"] = self
+        kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers

-        In this case, we are sorting the rows according to values in ``col1``:
+        return _eval(expr, inplace=inplace, **kwargs)

-        >>> df.sort_values(by=["col1"])
-          col1  col2  col3 col4
-        0    A     2     0    a
-        1    A     1     1    B
-        2    B     9     9    c
-        5    C     4     3    F
-        4    D     7     2    e
-        3  NaN     8     4    D

-        **Sort by multiple columns**

-        You can also provide multiple columns to ``by`` argument, as shown below.
-        In this example, the rows are first sorted according to ``col1``, and then
-        the rows that have an identical value in ``col1`` are sorted according
-        to ``col2``.
-        >>> df.sort_values(by=["col1", "col2"])
-          col1  col2  col3 col4
-        1    A     1     1    B
-        0    A     2     0    a
-        2    B     9     9    c
-        5    C     4     3    F
-        4    D     7     2    e
-        3  NaN     8     4    D

-        **Sort in a descending order**
+    def select_dtypes(self, include=None, exclude=None) -> DataFrame:
+        """
+        Return a subset of the DataFrame's columns based on the column dtypes.

-        The sort order can be reversed using ``ascending`` argument, as shown below:
+        Parameters
+        ----------
+        include, exclude : scalar or list-like
+            A selection of dtypes or strings to be included/excluded. At least
+            one of these parameters must be supplied.

-        >>> df.sort_values(by="col1", ascending=False)
-          col1  col2  col3 col4
-        4    D     7     2    e
-        5    C     4     3    F
-        2    B     9     9    c
-        0    A     2     0    a
-        1    A     1     1    B
-        3  NaN     8     4    D

-        **Placing any** ``NA`` **first**

-        Note that in the above example, the rows that contain an ``NA`` value in their
-        ``col1`` are placed at the end of the dataframe. This behavior can be modified
-        via ``na_position`` argument, as shown below:

-        >>> df.sort_values(by="col1", ascending=False, na_position="first")
-          col1  col2  col3 col4
-        3  NaN     8     4    D
-        4    D     7     2    e
-        5    C     4     3    F
-        2    B     9     9    c
-        0    A     2     0    a
-        1    A     1     1    B

-        **Customized sort order**
+        Returns
+        -------
+        DataFrame
+            The subset of the frame including the dtypes in ``include`` and
+            excluding the dtypes in ``exclude``.

-        The ``key`` argument allows for a further customization of sorting behaviour.
-        For example, you may want
-        to ignore the `letter's case <https://en.wikipedia.org/wiki/Letter_case>`__
-        when sorting strings:
+        Raises
+        ------
+        ValueError
+            * If both of ``include`` and ``exclude`` are empty
+            * If ``include`` and ``exclude`` have overlapping elements
+            * If any kind of string dtype is passed in.

-        >>> df.sort_values(by="col4", key=lambda col: col.str.lower())
-          col1  col2  col3 col4
-        0    A     2     0    a
-        1    A     1     1    B
-        2    B     9     9    c
-        3  NaN     8     4    D
-        4    D     7     2    e
-        5    C     4     3    F
+        See Also
+        --------
+        DataFrame.dtypes : Return Series with the data type of each column.

-        Another typical example is
-        `natural sorting <https://en.wikipedia.org/wiki/Natural_sort_order>`__.
-        This can be done using
-        ``natsort`` `package <https://github.com/SethMMorton/natsort>`__,
-        which provides sorted indices according
-        to their natural order, as shown below:
+        Notes
+        -----
+        * To select all *numeric* types, use ``np.number`` or ``'number'``
+        * To select strings you must use the ``object`` dtype, but note that
+          this will return *all* object dtype columns
+        * See the `numpy dtype hierarchy
+          <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
+        * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
+          ``'datetime64'``
+        * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
+          ``'timedelta64'``
+        * To select Pandas categorical dtypes, use ``'category'``
+        * To select Pandas datetimetz dtypes, use ``'datetimetz'``
+          or ``'datetime64[ns, tz]'``

+        Examples
+        --------
         >>> df = pd.DataFrame(
-        ...     {
-        ...         "time": ["0hr", "128hr", "72hr", "48hr", "96hr"],
-        ...         "value": [10, 20, 30, 40, 50],
-        ...     }
+        ...     {"a": [1, 2] * 3, "b": [True, False] * 3, "c": [1.0, 2.0] * 3}
         ... )
         >>> df
-            time  value
-        0    0hr     10
-        1  128hr     20
-        2   72hr     30
-        3   48hr     40
-        4   96hr     50
-        >>> from natsort import index_natsorted
-        >>> index_natsorted(df["time"])
-        [0, 3, 2, 4, 1]
-        >>> df.sort_values(
-        ...     by="time",
-        ...     key=lambda x: np.argsort(index_natsorted(x)),
-        ... )
-            time  value
-        0    0hr     10
-        3   48hr     40
-        2   72hr     30
-        4   96hr     50
-        1  128hr     20
+           a      b    c
+        0  1   True  1.0
+        1  2  False  2.0
+        2  1   True  1.0
+        3  2  False  2.0
+        4  1   True  1.0
+        5  2  False  2.0
+
+        >>> df.select_dtypes(include="bool")
+               b
+        0   True
+        1  False
+        2   True
+        3  False
+        4   True
+        5  False
+
+        >>> df.select_dtypes(include=["float64"])
+             c
+        0  1.0
+        1  2.0
+        2  1.0
+        3  2.0
+        4  1.0
+        5  2.0
+
+        >>> df.select_dtypes(exclude=["int64"])
+               b    c
+        0   True  1.0
+        1  False  2.0
+        2   True  1.0
+        3  False  2.0
+        4   True  1.0
+        5  False  2.0
         """
-        inplace = validate_bool_kwarg(inplace, "inplace")
-        axis = self._get_axis_number(axis)
-        ascending = validate_ascending(ascending)
-        if not isinstance(by, list):
-            by = [by]
-        # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
-        # expected "Sized"
-        if is_sequence(ascending) and (
-            len(by) != len(ascending)  # type: ignore[arg-type]
-        ):
-            # error: Argument 1 to "len" has incompatible type "Union[bool,
-            # List[bool]]"; expected "Sized"
-            raise ValueError(
-                f"Length of ascending ({len(ascending)})"  # type: ignore[arg-type]
-                f" != length of by ({len(by)})"
-            )
-        if len(by) > 1:
-            keys = (self._get_label_or_level_values(x, axis=axis) for x in by)
+        if not is_list_like(include):
+            include = (include,) if include is not None else ()
+        if not is_list_like(exclude):
+            exclude = (exclude,) if exclude is not None else ()

-            # need to rewrap columns in Series to apply key function
-            if key is not None:
-                keys_data = [Series(k, name=name) for (k, name) in zip(keys, by)]
-            else:
-                # error: Argument 1 to "list" has incompatible type
-                # "Generator[ExtensionArray | ndarray[Any, Any], None, None]";
-                # expected "Iterable[Series]"
-                keys_data = list(keys)  # type: ignore[arg-type]
+        selection = (frozenset(include), frozenset(exclude))

-            indexer = lexsort_indexer(
-                keys_data, orders=ascending, na_position=na_position, key=key
-            )
-        elif len(by):
-            # len(by) == 1
+        if not any(selection):
+            raise ValueError("at least one of include or exclude must be nonempty")
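+        # NOTE: `check_int_infer_dtype` (from the hunk elided above) normalizes
+        # the raw include/exclude values into frozensets of concrete dtype
+        # classes; a bare "int" expands to both np.int32 and np.int64 because
+        # numpy resolves the platform int differently on Windows and Linux.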
-        k = self._get_label_or_level_values(by[0], axis=axis)
+        include = check_int_infer_dtype(include)
+        exclude = check_int_infer_dtype(exclude)

-        # need to rewrap column in Series to apply key function
-        if key is not None:
-            # error: Incompatible types in assignment (expression has type
-            # "Series", variable has type "ndarray")
-            k = Series(k, name=by[0])  # type: ignore[assignment]
+        for dtypes in (include, exclude):
+            invalidate_string_dtypes(dtypes)

-        if isinstance(ascending, (tuple, list)):
-            ascending = ascending[0]

-        indexer = nargsort(
-            k, kind=kind, ascending=ascending, na_position=na_position, key=key
-        )
-        else:
-            if inplace:
-                return self._update_inplace(self)
-            else:
-                return self.copy(deep=False)

-        if is_range_indexer(indexer, len(indexer)):
-            result = self.copy(deep=False)
-            if ignore_index:
-                result.index = default_index(len(result))

-            if inplace:
-                return self._update_inplace(result)
-            else:
-                return result

-        new_data = self._mgr.take(
-            indexer, axis=self._get_block_manager_axis(axis), verify=False
-        )

-        if ignore_index:
-            new_data.set_axis(
-                self._get_block_manager_axis(axis), default_index(len(indexer))
+        def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
+            # GH 46870: BooleanDtype._is_numeric == True but should be excluded
+            dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
+            return issubclass(dtype.type, tuple(dtypes_set)) or (
+                np.number in dtypes_set
+                and getattr(dtype, "_is_numeric", False)
+                and not is_bool_dtype(dtype)
             )
-        result = self._constructor_from_mgr(new_data, axes=new_data.axes)

-        if inplace:
-            return self._update_inplace(result)
-        else:
-            return result.__finalize__(self, method="sort_values")
+        def predicate(arr: ArrayLike) -> bool:
+            dtype = arr.dtype
+            if include:
+                if not dtype_predicate(dtype, include):
+                    return False

-    @overload
-    def sort_index(
-        self,
-        *,
-        axis: Axis = ...,
-        level: IndexLabel = ...,
-        ascending: bool | Sequence[bool] = ...,
-        inplace: Literal[True],
-        kind: SortKind = ...,
-        na_position: NaPosition = ...,
-        sort_remaining: bool = ...,
-        ignore_index: bool = ...,
-        key: IndexKeyFunc = ...,
-    ) -> None: ...
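+            # exclusion is checked independently just below: a column is kept
+            # only if it matches `include` (when given) and does not match
+            # `exclude`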
+ if exclude: + if dtype_predicate(dtype, exclude): + return False - @overload - def sort_index( - self, - *, - axis: Axis = ..., - level: IndexLabel = ..., - ascending: bool | Sequence[bool] = ..., - inplace: Literal[False] = ..., - kind: SortKind = ..., - na_position: NaPosition = ..., - sort_remaining: bool = ..., - ignore_index: bool = ..., - key: IndexKeyFunc = ..., - ) -> DataFrame: ... + return True - @overload - def sort_index( - self, - *, - axis: Axis = ..., - level: IndexLabel = ..., - ascending: bool | Sequence[bool] = ..., - inplace: bool = ..., - kind: SortKind = ..., - na_position: NaPosition = ..., - sort_remaining: bool = ..., - ignore_index: bool = ..., - key: IndexKeyFunc = ..., - ) -> DataFrame | None: ... + mgr = self._mgr._get_data_subset(predicate).copy(deep=False) + return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) - def sort_index( + def insert( self, - *, - axis: Axis = 0, - level: IndexLabel | None = None, - ascending: bool | Sequence[bool] = True, - inplace: bool = False, - kind: SortKind = "quicksort", - na_position: NaPosition = "last", - sort_remaining: bool = True, - ignore_index: bool = False, - key: IndexKeyFunc | None = None, - ) -> DataFrame | None: + loc: int, + column: Hashable, + value: object, + allow_duplicates: bool | lib.NoDefault = lib.no_default, + ) -> None: """ - Sort object by labels (along an axis). + Insert column into DataFrame at specified location. - Returns a new DataFrame sorted by label if `inplace` argument is - ``False``, otherwise updates the original DataFrame and returns None. + Raises a ValueError if `column` is already contained in the DataFrame, + unless `allow_duplicates` is set to True. Parameters ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - ascending : bool or list-like of bools, default True - Sort ascending vs. descending. When the index is a MultiIndex the - sort direction can be controlled for each level individually. - inplace : bool, default False - Whether to modify the DataFrame rather than creating a new one. - kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' - Choice of sorting algorithm. See also :func:`numpy.sort` for more - information. `mergesort` and `stable` are the only stable algorithms. For - DataFrames, this option is only applied when sorting on a single - column or label. - na_position : {'first', 'last'}, default 'last' - Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. - Not implemented for MultiIndex. - sort_remaining : bool, default True - If True and sorting by level and index is multilevel, sort by other - levels too (in order) after sorting by specified level. - ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. - key : callable, optional - If not None, apply the key function to the index values - before sorting. This is similar to the `key` argument in the - builtin :meth:`sorted` function, with the notable difference that - this `key` function should be *vectorized*. It should expect an - ``Index`` and return an ``Index`` of the same shape. For MultiIndex - inputs, the key is applied *per level*. 
+        loc : int
+            Insertion index. Must verify 0 <= loc <= len(columns).
+        column : str, number, or hashable object
+            Label of the inserted column.
+        value : Scalar, Series, or array-like
+            Content of the inserted column.
+        allow_duplicates : bool, optional, default lib.no_default
+            Allow duplicate column labels to be created.

-        Returns
-        -------
-        DataFrame or None
-            The original DataFrame sorted by the labels or None if ``inplace=True``.

         See Also
         --------
-        Series.sort_index : Sort Series by the index.
-        DataFrame.sort_values : Sort DataFrame by the value.
-        Series.sort_values : Sort Series by the value.
+        Index.insert : Insert new item by index.

         Examples
         --------
-        >>> df = pd.DataFrame(
-        ...     [1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], columns=["A"]
-        ... )
-        >>> df.sort_index()
-             A
-        1    4
-        29   2
-        100  1
-        150  5
-        234  3
-
-        By default, it sorts in ascending order, to sort in descending order,
-        use ``ascending=False``
-
-        >>> df.sort_index(ascending=False)
-             A
-        234  3
-        150  5
-        100  1
-        29   2
-        1    4
-
-        A key function can be specified which is applied to the index before
-        sorting. For a ``MultiIndex`` this is applied to each level separately.
-
-        >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=["A", "b", "C", "d"])
-        >>> df.sort_index(key=lambda x: x.str.lower())
-           a
-        A  1
-        b  2
-        C  3
-        d  4
-        """
-        return super().sort_index(
-            axis=axis,
-            level=level,
-            ascending=ascending,
-            inplace=inplace,
-            kind=kind,
-            na_position=na_position,
-            sort_remaining=sort_remaining,
-            ignore_index=ignore_index,
-            key=key,
-        )
+        >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+        >>> df
+           col1  col2
+        0     1     3
+        1     2     4
+        >>> df.insert(1, "newcol", [99, 99])
+        >>> df
+           col1  newcol  col2
+        0     1      99     3
+        1     2      99     4
+        >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
+        >>> df
+           col1  col1  newcol  col2
+        0   100     1      99     3
+        1   100     2      99     4

-    def value_counts(
-        self,
-        subset: IndexLabel | None = None,
-        normalize: bool = False,
-        sort: bool = True,
-        ascending: bool = False,
-        dropna: bool = True,
-    ) -> Series:
+        Notice that pandas uses index alignment in case of `value` from type `Series`:

+        >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
+        >>> df
+           col0  col1  col1  newcol  col2
+        0   NaN   100     1      99     3
+        1   5.0   100     2      99     4
         """
+        if allow_duplicates is lib.no_default:
+            allow_duplicates = False
+        if allow_duplicates and not self.flags.allows_duplicate_labels:
+            raise ValueError(
+                "Cannot specify 'allow_duplicates=True' when "
+                "'self.flags.allows_duplicate_labels' is False."
+            )
+        if not allow_duplicates and column in self.columns:
+            # Should this be a different kind of error??
+            raise ValueError(f"cannot insert {column}, already exists")
+        if not is_integer(loc):
+            raise TypeError("loc must be int")
+        # convert non stdlib ints to satisfy typing checks
+        loc = int(loc)
+        if isinstance(value, DataFrame) and len(value.columns) > 1:
+            raise ValueError(
+                f"Expected a one-dimensional object, got a DataFrame with "
+                f"{len(value.columns)} columns instead."
+ ) + elif isinstance(value, DataFrame): + value = value.iloc[:, 0] - Parameters - ---------- - subset : Hashable or a sequence of the previous, optional - Columns to use when counting unique combinations. - normalize : bool, default False - Return proportions rather than frequencies. - sort : bool, default True - Sort by frequencies when True. Preserve the order of the data when False. + value, refs = self._sanitize_column(value) + self._mgr.insert(loc, column, value, refs=refs) - .. versionchanged:: 3.0.0 + def assign(self, **kwargs) -> DataFrame: + r""" + Assign new columns to a DataFrame. - Prior to 3.0.0, ``sort=False`` would sort by the columns values. - ascending : bool, default False - Sort in ascending order. - dropna : bool, default True - Do not include counts of rows that contain NA values. + Returns a new object with all original columns in addition to new ones. + Existing columns that are re-assigned will be overwritten. - .. versionadded:: 1.3.0 + Parameters + ---------- + **kwargs : dict of {str: callable or Series} + The column names are keywords. If the values are + callable, they are computed on the DataFrame and + assigned to the new columns. The callable must not + change input DataFrame (though pandas doesn't check it). + If the values are not callable, (e.g. a Series, scalar, or array), + they are simply assigned. Returns ------- - Series - Series containing the frequency of each distinct row in the DataFrame. - - See Also - -------- - Series.value_counts: Equivalent method on Series. + DataFrame + A new DataFrame with the new columns in addition to + all the existing columns. Notes ----- - The returned Series will have a MultiIndex with one level per input - column but an Index (non-multi) for a single label. By default, rows - that contain any NA values are omitted from the result. By default, - the resulting Series will be sorted by frequencies in descending order so that - the first element is the most frequently-occurring row. + Assigning multiple columns within the same ``assign`` is possible. + Later items in '\*\*kwargs' may refer to newly created or modified + columns in 'df'; items are computed and assigned into 'df' in order. Examples -------- - >>> df = pd.DataFrame( - ... {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]}, - ... index=["falcon", "dog", "cat", "ant"], - ... ) + >>> df = pd.DataFrame({"temp_c": [17.0, 25.0]}, index=["Portland", "Berkeley"]) >>> df - num_legs num_wings - falcon 2 2 - dog 4 0 - cat 4 0 - ant 6 0 + temp_c + Portland 17.0 + Berkeley 25.0 - >>> df.value_counts() - num_legs num_wings - 4 0 2 - 2 2 1 - 6 0 1 - Name: count, dtype: int64 + Where the value is a callable, evaluated on `df`: - >>> df.value_counts(sort=False) - num_legs num_wings - 2 2 1 - 4 0 2 - 6 0 1 - Name: count, dtype: int64 + >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 - >>> df.value_counts(ascending=True) - num_legs num_wings - 2 2 1 - 6 0 1 - 4 0 2 - Name: count, dtype: int64 + Alternatively, the same behavior can be achieved by directly + referencing an existing Series or sequence: - >>> df.value_counts(normalize=True) - num_legs num_wings - 4 0 0.50 - 2 2 0.25 - 6 0 0.25 - Name: proportion, dtype: float64 + >>> df.assign(temp_f=df["temp_c"] * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 - With `dropna` set to `False` we can also count rows with NA values. 
+        You can create multiple columns within the same assign where one
+        of the columns depends on another one defined within the same assign:

-        >>> df = pd.DataFrame(
-        ...     {
-        ...         "first_name": ["John", "Anne", "John", "Beth"],
-        ...         "middle_name": ["Smith", pd.NA, pd.NA, "Louise"],
-        ...     }
+        >>> df.assign(
+        ...     temp_f=lambda x: x["temp_c"] * 9 / 5 + 32,
+        ...     temp_k=lambda x: (x["temp_f"] + 459.67) * 5 / 9,
         ... )
-        >>> df
-          first_name middle_name
-        0       John       Smith
-        1       Anne
-        2       John
-        3       Beth      Louise
-
-        >>> df.value_counts()
-        first_name  middle_name
-        Beth        Louise         1
-        John        Smith          1
-        Name: count, dtype: int64
-
-        >>> df.value_counts(dropna=False)
-        first_name  middle_name
-        Anne        NaN            1
-        Beth        Louise         1
-        John        Smith          1
-                    NaN            1
-        Name: count, dtype: int64
-
-        >>> df.value_counts("first_name")
-        first_name
-        John    2
-        Anne    1
-        Beth    1
-        Name: count, dtype: int64
+                  temp_c  temp_f  temp_k
+        Portland    17.0    62.6  290.15
+        Berkeley    25.0    77.0  298.15
         """
-        if subset is None:
-            subset = self.columns.tolist()
-
-        name = "proportion" if normalize else "count"
-        counts = self.groupby(
-            subset, sort=False, dropna=dropna, observed=False
-        )._grouper.size()
-        counts.name = name
+        data = self.copy(deep=False)

-        if sort:
-            counts = counts.sort_values(ascending=ascending)
-        if normalize:
-            counts /= counts.sum()
+        for k, v in kwargs.items():
+            data[k] = com.apply_if_callable(v, data)
+        return data

-        # Force MultiIndex for a list_like subset with a single column
-        if is_list_like(subset) and len(subset) == 1:  # type: ignore[arg-type]
-            counts.index = MultiIndex.from_arrays(
-                [counts.index], names=[counts.index.name]
-            )
+    def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
+        """
+        Ensures new columns (which go into the BlockManager as new blocks) are
+        always copied (or a reference is being tracked to them under CoW)
+        and converted into an array.

-        return counts
+        Parameters
+        ----------
+        value : scalar, Series, or array-like

-    def nlargest(
-        self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
-    ) -> DataFrame:
+        Returns
+        -------
+        tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs
         """
-        Return the first `n` rows ordered by `columns` in descending order.

-        Return the first `n` rows with the largest values in `columns`, in
-        descending order. The columns that are not specified are returned as
-        well, but not used for ordering.

-        This method is equivalent to
-        ``df.sort_values(columns, ascending=False).head(n)``, but more
-        performant.

-        Parameters
-        ----------
-        n : int
-            Number of rows to return.
-        columns : Hashable or a sequence of the previous
-            Column label(s) to order by.
-        keep : {'first', 'last', 'all'}, default 'first'
-            Where there are duplicate values:

-        - ``first`` : prioritize the first occurrence(s)
-        - ``last`` : prioritize the last occurrence(s)
-        - ``all`` : keep all the ties of the smallest item even if it means
-          selecting more than ``n`` items.
+        )
+        return arr, None

-        Returns
-        -------
-        DataFrame
-            The first `n` rows ordered by the given columns in descending
-            order.
+    @property
+    def _series(self):
+        return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)}

-        See Also
-        --------
-        DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
-            ascending order.
-        DataFrame.sort_values : Sort DataFrame by the values.
-        DataFrame.head : Return the first `n` rows without re-ordering.
+    # ----------------------------------------------------------------------
+    # Reindexing and alignment
+
+    def _reindex_multi(self, axes: dict[str, Index], fill_value) -> DataFrame:
+        """
+        We are guaranteed non-Nones in the axes.
+        """

-        Notes
-        -----
-        This function cannot be used with all column types. For example, when
-        specifying columns with `object` or `category` dtypes, ``TypeError`` is
-        raised.
+        new_index, row_indexer = self.index.reindex(axes["index"])
+        new_columns, col_indexer = self.columns.reindex(axes["columns"])
+
+        if row_indexer is not None and col_indexer is not None:
+            # Fastpath. By doing two 'take's at once we avoid making an
+            # unnecessary copy.
+            # We only get here with `self._can_fast_transpose`, which (almost)
+            # ensures that self.values is cheap. It may be worth making this
+            # condition more specific.
+            indexer = row_indexer, col_indexer
+            new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
+            return self._constructor(
+                new_values, index=new_index, columns=new_columns, copy=False
+            )
+        else:
+            return self._reindex_with_indexers(
+                {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
+                fill_value=fill_value,
+            )

+    @Appender(
+        """
         Examples
         --------
-        >>> df = pd.DataFrame(
-        ...     {
-        ...         "population": [
-        ...             59000000,
-        ...             65000000,
-        ...             434000,
-        ...             434000,
-        ...             434000,
-        ...             337000,
-        ...             11300,
-        ...             11300,
-        ...             11300,
-        ...         ],
-        ...         "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
-        ...         "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
-        ...     },
-        ...     index=[
-        ...         "Italy",
-        ...         "France",
-        ...         "Malta",
-        ...         "Maldives",
-        ...         "Brunei",
-        ...         "Iceland",
-        ...         "Nauru",
-        ...         "Tuvalu",
-        ...         "Anguilla",
-        ...     ],
-        ... )
-        >>> df
-                  population      GDP alpha-2
-        Italy       59000000  1937894      IT
-        France      65000000  2583560      FR
-        Malta         434000    12011      MT
-        Maldives      434000     4520      MV
-        Brunei        434000    12128      BN
-        Iceland       337000    17036      IS
-        Nauru          11300      182      NR
-        Tuvalu         11300       38      TV
-        Anguilla       11300      311      AI
-
-        In the following example, we will use ``nlargest`` to select the three
-        rows having the largest values in column "population".
+        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

-        >>> df.nlargest(3, "population")
-                population      GDP alpha-2
-        France    65000000  2583560      FR
-        Italy     59000000  1937894      IT
-        Malta       434000    12011      MT
+        Change the row labels.
- When using ``keep='last'``, ties are resolved in reverse order: + >>> df.set_axis(['a', 'b', 'c'], axis='index') + A B + a 1 4 + b 2 5 + c 3 6 - >>> df.nlargest(3, "population", keep="last") - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Brunei 434000 12128 BN + Change the column labels. - When using ``keep='all'``, the number of element kept can go beyond ``n`` - if there are duplicate values for the smallest element, all the - ties are kept: + >>> df.set_axis(['I', 'II'], axis='columns') + I II + 0 1 4 + 1 2 5 + 2 3 6 + """ + ) + @Substitution( + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + extended_summary_sub=" column or", + axis_description_sub=", and 1 identifies the columns", + see_also_sub=" or columns", + ) + @Appender(NDFrame.set_axis.__doc__) + def set_axis( + self, + labels, + *, + axis: Axis = 0, + copy: bool | None = None, + ) -> DataFrame: + return super().set_axis(labels, axis=axis) - >>> df.nlargest(3, "population", keep="all") - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Malta 434000 12011 MT - Maldives 434000 4520 MV - Brunei 434000 12128 BN + @doc( + NDFrame.reindex, + klass=_shared_doc_kwargs["klass"], + optional_reindex=_shared_doc_kwargs["optional_reindex"], + ) + def reindex( + self, + labels=None, + *, + index=None, + columns=None, + axis: Axis | None = None, + method: ReindexMethod | None = None, + copy: bool | lib.NoDefault = lib.no_default, + level: Level | None = None, + fill_value: Scalar | None = np.nan, + limit: int | None = None, + tolerance=None, + ) -> DataFrame: + return super().reindex( + labels=labels, + index=index, + columns=columns, + axis=axis, + method=method, + level=level, + fill_value=fill_value, + limit=limit, + tolerance=tolerance, + copy=copy, + ) - However, ``nlargest`` does not keep ``n`` distinct largest elements: + @overload + def drop( + self, + labels: IndexLabel | ListLike = ..., + *, + axis: Axis = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., + level: Level = ..., + inplace: Literal[True], + errors: IgnoreRaise = ..., + ) -> None: ... - >>> df.nlargest(5, "population", keep="all") - population GDP alpha-2 - France 65000000 2583560 FR - Italy 59000000 1937894 IT - Malta 434000 12011 MT - Maldives 434000 4520 MV - Brunei 434000 12128 BN + @overload + def drop( + self, + labels: IndexLabel | ListLike = ..., + *, + axis: Axis = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., + level: Level = ..., + inplace: Literal[False] = ..., + errors: IgnoreRaise = ..., + ) -> DataFrame: ... - To order by the largest values in column "population" and then "GDP", - we can specify multiple columns like in the next example. + @overload + def drop( + self, + labels: IndexLabel | ListLike = ..., + *, + axis: Axis = ..., + index: IndexLabel | ListLike = ..., + columns: IndexLabel | ListLike = ..., + level: Level = ..., + inplace: bool = ..., + errors: IgnoreRaise = ..., + ) -> DataFrame | None: ... 
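The three ``drop`` stubs above use the same ``@overload`` idiom applied throughout this file (``query``, ``drop_duplicates``, ``sort_values``, ``rename``): splitting ``inplace`` into ``Literal[True]`` / ``Literal[False]`` cases lets a type checker infer ``None`` versus ``DataFrame`` from the call site, while a single runtime body serves all signatures. A minimal, self-contained sketch of the pattern — the ``Frame`` class and its ``mutate`` method are hypothetical, purely for illustration:

from __future__ import annotations

from typing import Literal, overload


class Frame:
    """Hypothetical stand-in for a DataFrame-like container."""

    def __init__(self, data: list[int]) -> None:
        self.data = data

    @overload
    def mutate(self, *, inplace: Literal[True]) -> None: ...
    @overload
    def mutate(self, *, inplace: Literal[False] = ...) -> Frame: ...
    @overload
    def mutate(self, *, inplace: bool = ...) -> Frame | None: ...

    def mutate(self, *, inplace: bool = False) -> Frame | None:
        # One runtime implementation; the @overload stubs above are
        # erased at runtime and exist only for the type checker.
        doubled = [x * 2 for x in self.data]
        if inplace:
            self.data = doubled
            return None
        return Frame(doubled)


f = Frame([1, 2, 3])
copied = f.mutate()              # type checkers infer: Frame
nothing = f.mutate(inplace=True)  # type checkers infer: None

The catch-all ``bool`` overload at the end keeps calls with a dynamically computed ``inplace`` flag type-checkable as ``Frame | None`` instead of being rejected.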
-        >>> df.nlargest(3, ["population", "GDP"])
-                population      GDP alpha-2
-        France    65000000  2583560      FR
-        Italy     59000000  1937894      IT
-        Brunei      434000    12128      BN
+    def drop(
+        self,
+        labels: IndexLabel | ListLike = None,
+        *,
+        axis: Axis = 0,
+        index: IndexLabel | ListLike = None,
+        columns: IndexLabel | ListLike = None,
+        level: Level | None = None,
+        inplace: bool = False,
+        errors: IgnoreRaise = "raise",
+    ) -> DataFrame | None:
         """
-        return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
+        Drop specified labels from rows or columns.

-    def nsmallest(
-        self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
-    ) -> DataFrame:
-        """
-        Return the first `n` rows ordered by `columns` in ascending order.
+        Remove rows or columns by specifying label names and corresponding
+        axis, or by directly specifying index or column names. When using a
+        multi-index, labels on different levels can be removed by specifying
+        the level. See the :ref:`user guide <advanced.shown_levels>`
+        for more information about the now unused levels.
+
+        Parameters
+        ----------
+        labels : single label or list-like
+            Index or column labels to drop. A tuple will be used as a single
+            label and not treated as a list-like.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Whether to drop labels from the index (0 or 'index') or
+            columns (1 or 'columns').
+        index : single label or list-like
+            Alternative to specifying axis (``labels, axis=0``
+            is equivalent to ``index=labels``).
+        columns : single label or list-like
+            Alternative to specifying axis (``labels, axis=1``
+            is equivalent to ``columns=labels``).
+        level : int or level name, optional
+            For MultiIndex, level from which the labels will be removed.
+        inplace : bool, default False
+            If False, return a copy. Otherwise, do operation
+            in place and return None.
+        errors : {'ignore', 'raise'}, default 'raise'
+            If 'ignore', suppress error and only existing labels are
+            dropped.
+
+        Returns
+        -------
+        DataFrame or None
+            DataFrame without the removed index or column labels or
+            None if ``inplace=True``.
+
+        Raises
+        ------
+        KeyError
+            If any of the labels is not found in the selected axis.

-        Return the first `n` rows with the smallest values in `columns`, in
-        ascending order. The columns that are not specified are returned as
-        well, but not used for ordering.

-        This method is equivalent to
-        ``df.sort_values(columns, ascending=True).head(n)``, but more
-        performant.
+        Examples
+        --------
+        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"])
+        >>> df
+           A  B   C   D
+        0  0  1   2   3
+        1  4  5   6   7
+        2  8  9  10  11

-        Parameters
-        ----------
-        n : int
-            Number of items to retrieve.
-        columns : list or str
-            Column name or names to order by.
-        keep : {'first', 'last', 'all'}, default 'first'
-            Where there are duplicate values:
+        Drop columns

-        - ``first`` : take the first occurrence.
-        - ``last`` : take the last occurrence.
-        - ``all`` : keep all the ties of the largest item even if it means
-          selecting more than ``n`` items.
+        >>> df.drop(["B", "C"], axis=1)
+           A   D
+        0  0   3
+        1  4   7
+        2  8  11

-        Returns
-        -------
-        DataFrame
-            DataFrame with the first `n` rows ordered by `columns` in ascending order.
+        >>> df.drop(columns=["B", "C"])
+           A   D
+        0  0   3
+        1  4   7
+        2  8  11

-        See Also
-        --------
-        DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
-            descending order.
-        DataFrame.sort_values : Sort DataFrame by the values.
-        DataFrame.head : Return the first `n` rows without re-ordering.
+        Drop a row by index

-        Examples
-        --------
+        >>> df.drop([0, 1])
+           A  B   C   D
+        2  8  9  10  11
+
+        Drop columns and/or rows of MultiIndex DataFrame
+
+        >>> midx = pd.MultiIndex(
+        ...     levels=[["llama", "cow", "falcon"], ["speed", "weight", "length"]],
+        ...     codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
+        ... )
         >>> df = pd.DataFrame(
-        ...     {
-        ...         "population": [
-        ...             59000000,
-        ...             65000000,
-        ...             434000,
-        ...             434000,
-        ...             434000,
-        ...             337000,
-        ...             337000,
-        ...             11300,
-        ...             11300,
-        ...         ],
-        ...         "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311],
-        ...         "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"],
-        ...     },
-        ...     index=[
-        ...         "Italy",
-        ...         "France",
-        ...         "Malta",
-        ...         "Maldives",
-        ...         "Brunei",
-        ...         "Iceland",
-        ...         "Nauru",
-        ...         "Tuvalu",
-        ...         "Anguilla",
+        ...     index=midx,
+        ...     columns=["big", "small"],
+        ...     data=[
+        ...         [45, 30],
+        ...         [200, 100],
+        ...         [1.5, 1],
+        ...         [30, 20],
+        ...         [250, 150],
+        ...         [1.5, 0.8],
+        ...         [320, 250],
+        ...         [1, 0.8],
+        ...         [0.3, 0.2],
         ...     ],
         ... )
         >>> df
-                  population      GDP alpha-2
-        Italy       59000000  1937894      IT
-        France      65000000  2583560      FR
-        Malta         434000    12011      MT
-        Maldives      434000     4520      MV
-        Brunei        434000    12128      BN
-        Iceland       337000    17036      IS
-        Nauru         337000      182      NR
-        Tuvalu         11300       38      TV
-        Anguilla       11300      311      AI
+                          big  small
+        llama   speed    45.0   30.0
+                weight  200.0  100.0
+                length    1.5    1.0
+        cow     speed    30.0   20.0
+                weight  250.0  150.0
+                length    1.5    0.8
+        falcon  speed   320.0  250.0
+                weight    1.0    0.8
+                length    0.3    0.2

-        In the following example, we will use ``nsmallest`` to select the
-        three rows having the smallest values in column "population".
+        Drop a specific index combination from the MultiIndex
+        DataFrame, i.e., drop the combination ``'falcon'`` and
+        ``'weight'``, which deletes only the corresponding row

-        >>> df.nsmallest(3, "population")
-                  population    GDP alpha-2
-        Tuvalu         11300     38      TV
-        Anguilla       11300    311      AI
-        Iceland       337000  17036      IS
+        >>> df.drop(index=("falcon", "weight"))
+                          big  small
+        llama   speed    45.0   30.0
+                weight  200.0  100.0
+                length    1.5    1.0
+        cow     speed    30.0   20.0
+                weight  250.0  150.0
+                length    1.5    0.8
+        falcon  speed   320.0  250.0
+                length    0.3    0.2

-        When using ``keep='last'``, ties are resolved in reverse order:
+        >>> df.drop(index="cow", columns="small")
+                          big
+        llama   speed    45.0
+                weight  200.0
+                length    1.5
+        falcon  speed   320.0
+                weight    1.0
+                length    0.3

-        >>> df.nsmallest(3, "population", keep="last")
-                  population   GDP alpha-2
-        Anguilla       11300   311      AI
-        Tuvalu         11300    38      TV
-        Nauru         337000   182      NR

-        When using ``keep='all'``, the number of element kept can go beyond ``n``
-        if there are duplicate values for the largest element, all the
-        ties are kept.
+        """
+        return super().drop(
+            labels=labels,
+            axis=axis,
+            index=index,
+            columns=columns,
+            level=level,
+            inplace=inplace,
+            errors=errors,
+        )

-        >>> df.nsmallest(3, "population", keep="all")
-                  population    GDP alpha-2
-        Tuvalu         11300     38      TV
-        Anguilla       11300    311      AI
-        Iceland       337000  17036      IS
-        Nauru         337000    182      NR

-        However, ``nsmallest`` does not keep ``n`` distinct
-        smallest elements:

-        >>> df.nsmallest(4, "population", keep="all")
-                  population    GDP alpha-2
-        Tuvalu         11300     38      TV
-        Anguilla       11300    311      AI
-        Iceland       337000  17036      IS
-        Nauru         337000    182      NR
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool | None = ...,
+        inplace: Literal[True],
+        level: Level = ...,
+        errors: IgnoreRaise = ...,
+    ) -> None: ...

-        To order by the smallest values in column "population" and then "GDP", we can
-        specify multiple columns like in the next example.
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool | None = ...,
+        inplace: Literal[False] = ...,
+        level: Level = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame: ...

-        >>> df.nsmallest(3, ["population", "GDP"])
-                  population  GDP alpha-2
-        Tuvalu         11300   38      TV
-        Anguilla       11300  311      AI
-        Nauru         337000  182      NR
-        """
-        return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
+    @overload
+    def rename(
+        self,
+        mapper: Renamer | None = ...,
+        *,
+        index: Renamer | None = ...,
+        columns: Renamer | None = ...,
+        axis: Axis | None = ...,
+        copy: bool | None = ...,
+        inplace: bool = ...,
+        level: Level = ...,
+        errors: IgnoreRaise = ...,
+    ) -> DataFrame | None: ...

-    def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
+    def rename(
+        self,
+        mapper: Renamer | None = None,
+        *,
+        index: Renamer | None = None,
+        columns: Renamer | None = None,
+        axis: Axis | None = None,
+        copy: bool | None = None,
+        inplace: bool = False,
+        level: Level | None = None,
+        errors: IgnoreRaise = "ignore",
+    ) -> DataFrame | None:
         """
-        Swap levels i and j in a :class:`MultiIndex`.
+        Rename columns or index labels.
+
+        Function / dict values must be unique (1-to-1). Labels not contained in
+        a dict / Series will be left as-is. Extra labels listed don't throw an
+        error.

-        Default is to swap the two innermost levels of the index.
+        See the :ref:`user guide <basics.rename>` for more.

         Parameters
         ----------
-        i, j : int or str
-            Levels of the indices to be swapped. Can pass level name as string.
-        axis : {0 or 'index', 1 or 'columns'}, default 0
-            The axis to swap levels on. 0 or 'index' for row-wise, 1 or
-            'columns' for column-wise.
+ idx_diff = result_index.difference(correl.index) + + if len(idx_diff) > 0: + correl = correl._append( + Series([np.nan] * len(idx_diff), index=idx_diff) + ) + + return correl + + # ---------------------------------------------------------------------- + # ndarray-like stats methods + + # ---------------------------------------------------------------------- + # Add index and columns + _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"] + _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { + **NDFrame._AXIS_TO_AXIS_NUMBER, + 1: 1, + "columns": 1, + } + _AXIS_LEN = len(_AXIS_ORDERS) + _info_axis_number: Literal[1] = 1 + _info_axis_name: Literal["columns"] = "columns" + + You can already get the future behavior and improvements through + enabling copy on write ``pd.options.mode.copy_on_write = True`` + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one. + If True then value of copy is ignored. + level : int or level name, default None + In case of a MultiIndex, only rename labels in the specified + level. + errors : {'ignore', 'raise'}, default 'ignore' + If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, + or `columns` contains labels that are not present in the Index + being transformed. + If 'ignore', existing keys will be renamed and extra keys will be + ignored. Returns ------- - DataFrame - DataFrame with levels swapped in MultiIndex. + DataFrame or None + DataFrame with the renamed axis labels or None if ``inplace=True``. + + Raises + ------ + KeyError + If any of the labels is not found in the selected axis and + "errors='raise'". See Also -------- - DataFrame.reorder_levels: Reorder levels of MultiIndex. - DataFrame.sort_index: Sort MultiIndex. + DataFrame.rename_axis : Set the name of the axis. Examples -------- - >>> df = pd.DataFrame( - ... {"Grade": ["A", "B", "A", "C"]}, - ... index=[ - ... ["Final exam", "Final exam", "Coursework", "Coursework"], - ... ["History", "Geography", "History", "Geography"], - ... ["January", "February", "March", "April"], - ... ], - ... ) - >>> df - Grade - Final exam History January A - Geography February B - Coursework History March A - Geography April C + ``DataFrame.rename`` supports two calling conventions - In the following example, we will swap the levels of the indices. - Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. - By not supplying any arguments for i and j, we swap the last and second to - last indices. + * ``(index=index_mapper, columns=columns_mapper, ...)`` + * ``(mapper, axis={'index', 'columns'}, ...)`` - >>> df.swaplevel() - Grade - Final exam January History A - February Geography B - Coursework March History A - April Geography C + We *highly* recommend using keyword arguments to clarify your + intent. - By supplying one argument, we can choose which index to swap the last - index with. We can for example swap the first index with the last one as - follows. + Rename columns using a mapping: - >>> df.swaplevel(0) - Grade - January History Final exam A - February Geography Final exam B - March History Coursework A - April Geography Coursework C + >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df.rename(columns={"A": "a", "B": "c"}) + a c + 0 1 4 + 1 2 5 + 2 3 6 - We can also define explicitly which indices we want to swap by supplying values - for both i and j. Here, we for example swap the first and second indices. 
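The two ``rename`` calling conventions documented above can be cross-checked with a short sketch (hypothetical frame; only a standard pandas install assumed):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": [1], "B": [2]})
    >>> df.rename(columns={"A": "a"}).equals(df.rename({"A": "a"}, axis="columns"))
    True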
+        Rename index using a mapping:

-        >>> df.swaplevel(0, 1)
-                                          Grade
-        History     Final exam  January      A
-        Geography   Final exam  February     B
-        History     Coursework  March        A
-        Geography   Coursework  April        C
-        """
-        result = self.copy(deep=False)
+        >>> df.rename(index={0: "x", 1: "y", 2: "z"})
+           A  B
+        x  1  4
+        y  2  5
+        z  3  6

-        axis = self._get_axis_number(axis)
+        Cast index labels to a different type:

-        if not isinstance(result._get_axis(axis), MultiIndex):  # pragma: no cover
-            raise TypeError("Can only swap levels on a hierarchical axis.")
+        >>> df.index
+        RangeIndex(start=0, stop=3, step=1)
+        >>> df.rename(index=str).index
+        Index(['0', '1', '2'], dtype='object')

-        if axis == 0:
-            assert isinstance(result.index, MultiIndex)
-            result.index = result.index.swaplevel(i, j)
-        else:
-            assert isinstance(result.columns, MultiIndex)
-            result.columns = result.columns.swaplevel(i, j)
-        return result
+        >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
+        Traceback (most recent call last):
+        KeyError: ['C'] not found in axis
+
+        Using axis-style parameters:

-    def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
+        >>> df.rename(str.lower, axis="columns")
+           a  b
+        0  1  4
+        1  2  5
+        2  3  6
+
+        >>> df.rename({1: 2, 2: 4}, axis="index")
+           A  B
+        0  1  4
+        2  2  5
+        4  3  6
+        """
-        """
-        Rearrange index or column levels using input ``order``.
+        return super()._rename(
+            mapper=mapper,
+            index=index,
+            columns=columns,
+            axis=axis,
+            inplace=inplace,
+            level=level,
+            errors=errors,
+        )

-        May not drop or duplicate levels.
+    def pop(self, item: Hashable) -> Series:
+        """
+        Return item and drop it from DataFrame. Raise KeyError if not found.

         Parameters
         ----------
-        order : list of int or list of str
-            List representing new level order. Reference level by number
-            (position) or by key (label).
-        axis : {0 or 'index', 1 or 'columns'}, default 0
-            Where to reorder levels.
+        item : label
+            Label of column to be popped.

         Returns
         -------
-        DataFrame
-            DataFrame with indices or columns with reordered levels.
-
-        See Also
-        --------
-        DataFrame.swaplevel : Swap levels i and j in a MultiIndex.
+        Series
+            Series representing the item that is dropped.

         Examples
         --------
-        >>> data = {
-        ...     "class": ["Mammals", "Mammals", "Reptiles"],
-        ...     "diet": ["Omnivore", "Carnivore", "Carnivore"],
-        ...     "species": ["Humans", "Dogs", "Snakes"],
-        ... }
-        >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
-        >>> df = df.set_index(["class", "diet"])
+        >>> df = pd.DataFrame(
+        ...     [
+        ...         ("falcon", "bird", 389.0),
+        ...         ("parrot", "bird", 24.0),
+        ...         ("lion", "mammal", 80.5),
+        ...         ("monkey", "mammal", np.nan),
+        ...     ],
+        ...     columns=("name", "class", "max_speed"),
+        ... )
         >>> df
-                           species
-        class    diet
-        Mammals  Omnivore   Humans
-                 Carnivore    Dogs
-        Reptiles Carnivore  Snakes
+             name   class  max_speed
+        0  falcon    bird      389.0
+        1  parrot    bird       24.0
+        2    lion  mammal       80.5
+        3  monkey  mammal        NaN

-        Let's reorder the levels of the index:
+        >>> df.pop("class")
+        0      bird
+        1      bird
+        2    mammal
+        3    mammal
+        Name: class, dtype: object

-        >>> df.reorder_levels(["diet", "class"])
-                            species
-        diet      class
-        Omnivore  Mammals    Humans
-        Carnivore Mammals      Dogs
-                  Reptiles   Snakes
+        >>> df
+             name  max_speed
+        0  falcon      389.0
+        1  parrot       24.0
+        2    lion       80.5
+        3  monkey        NaN
         """
-        axis = self._get_axis_number(axis)
-        if not isinstance(self._get_axis(axis), MultiIndex):  # pragma: no cover
-            raise TypeError("Can only reorder levels on a hierarchical axis.")
-
-        result = self.copy(deep=False)
-
-        if axis == 0:
-            assert isinstance(result.index, MultiIndex)
-            result.index = result.index.reorder_levels(order)
-        else:
-            assert isinstance(result.columns, MultiIndex)
-            result.columns = result.columns.reorder_levels(order)
-        return result
-
-    # ----------------------------------------------------------------------
-    # Arithmetic Methods
-
-    def _cmp_method(self, other, op):
-        axis: Literal[1] = 1  # only relevant for Series other case
-
-        self, other = self._align_for_op(other, axis, flex=False, level=None)
-
-        # See GH#4537 for discussion of scalar op behavior
-        new_data = self._dispatch_frame_op(other, op, axis=axis)
-        return self._construct_result(new_data, other=other)
-
-    def _arith_method(self, other, op):
-        if self._should_reindex_frame_op(other, op, 1, None, None):
-            return self._arith_method_with_reindex(other, op)
-
-        axis: Literal[1] = 1  # only relevant for Series other case
-        other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
-
-        self, other = self._align_for_op(other, axis, flex=True, level=None)
+        return super().pop(item=item)

-        with np.errstate(all="ignore"):
-            new_data = self._dispatch_frame_op(other, op, axis=axis)
-        return self._construct_result(new_data, other=other)
+    @overload
+    def _replace_columnwise(
+        self, mapping: dict[Hashable, tuple[Any, Any]], inplace: Literal[True], regex
+    ) -> None: ...

-    _logical_method = _arith_method
+    @overload
+    def _replace_columnwise(
+        self, mapping: dict[Hashable, tuple[Any, Any]], inplace: Literal[False], regex
+    ) -> Self: ...

-    def _dispatch_frame_op(
-        self, right, func: Callable, axis: AxisInt | None = None
-    ) -> DataFrame:
+    def _replace_columnwise(
+        self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
+    ) -> Self | None:
         """
-        Evaluate the frame operation func(left, right) by evaluating
-        column-by-column, dispatching to the Series implementation.
+        Dispatch to Series.replace column-wise.

         Parameters
         ----------
-        right : scalar, Series, or DataFrame
-        func : arithmetic or comparison operator
-        axis : {None, 0, 1}
+        mapping : dict
+            of the form {col: (target, value)}
+        inplace : bool
+        regex : bool or same types as `to_replace` in DataFrame.replace

         Returns
         -------
-        DataFrame
-
-        Notes
-        -----
-        Caller is responsible for setting np.errstate where relevant.
+        DataFrame or None
         """
-        # Get the appropriate array-op to apply to each column/block's values.
-        array_op = ops.get_array_op(func)
-
-        right = lib.item_from_zerodim(right)
-        if not is_list_like(right):
-            # i.e. scalar, faster than checking np.ndim(right) == 0
-            bm = self._mgr.apply(array_op, right=right)
-            return self._constructor_from_mgr(bm, axes=bm.axes)
-
-        elif isinstance(right, DataFrame):
-            assert self.index.equals(right.index)
-            assert self.columns.equals(right.columns)
-            # TODO: The previous assertion `assert right._indexed_same(self)`
-            #  fails in cases with empty columns reached via
-            #  _frame_arith_method_with_reindex
-
-            # TODO operate_blockwise expects a manager of the same type
-            bm = self._mgr.operate_blockwise(
-                right._mgr,
-                array_op,
-            )
-            return self._constructor_from_mgr(bm, axes=bm.axes)
-
-        elif isinstance(right, Series) and axis == 1:
-            # axis=1 means we want to operate row-by-row
-            assert right.index.equals(self.columns)
-
-            right = right._values
-            # maybe_align_as_frame ensures we do not have an ndarray here
-            assert not isinstance(right, np.ndarray)
-
-            arrays = [
-                array_op(_left, _right)
-                for _left, _right in zip(self._iter_column_arrays(), right)
-            ]
-
-        elif isinstance(right, Series):
-            assert right.index.equals(self.index)
-            right = right._values
+        # Operate column-wise
+        res = self if inplace else self.copy(deep=False)
+        ax = self.columns

-            arrays = [array_op(left, right) for left in self._iter_column_arrays()]
+        for i, ax_value in enumerate(ax):
+            if ax_value in mapping:
+                ser = self.iloc[:, i]

-        else:
-            raise NotImplementedError(right)
+                target, value = mapping[ax_value]
+                newobj = ser.replace(target, value, regex=regex)

-        return type(self)._from_arrays(
-            arrays, self.columns, self.index, verify_integrity=False
-        )
+                res._iset_item(i, newobj, inplace=inplace)

-    def _combine_frame(self, other: DataFrame, func, fill_value=None):
-        # at this point we have `self._indexed_same(other)`
+        if inplace:
+            return None
+        return res.__finalize__(self)

-        if fill_value is None:
-            # since _arith_op may be called in a loop, avoid function call
-            # overhead if possible by doing this check once
-            _arith_op = func
-
-        else:
-
-            def _arith_op(left, right):
-                # for the mixed_type case where we iterate over columns,
-                # _arith_op(left, right) is equivalent to
-                # left._binop(right, func, fill_value=fill_value)
-                left, right = ops.fill_binop(left, right, fill_value)
-                return func(left, right)
-
-        new_data = self._dispatch_frame_op(other, _arith_op)
-        return new_data

-    def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
-        """
-        For DataFrame-with-DataFrame operations that require reindexing,
-        operate only on shared columns, then reindex.
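``_replace_columnwise`` above is an internal helper behind the public ``DataFrame.replace`` when a per-column mapping is supplied; a sketch of the observable behavior (hypothetical data; standard pandas assumed):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"A": [0, 1], "B": [0, 2]})
    >>> df.replace({"A": 0}, -1)  # only column "A" is rewritten
       A  B
    0 -1  0
    1  1  2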
-        Parameters
-        ----------
-        right : DataFrame
-        op : binary operator
+        if self.empty:
+            return self.copy()

-        Returns
-        -------
-        DataFrame
-        """
-        left = self
+        axis = self._get_axis_number(axis)

-        # GH#31623, only operate on shared columns
-        cols, lcol_indexer, rcol_indexer = left.columns.join(
-            right.columns, how="inner", return_indexers=True
-        )
+        if is_list_like(periods):
+            periods = cast(Sequence, periods)
+            if axis == 1:
+                raise ValueError(
+                    "If `periods` contains multiple shifts, `axis` cannot be 1."
+                )
+            if len(periods) == 0:
+                raise ValueError("If `periods` is an iterable, it cannot be empty.")
+            from pandas.core.reshape.concat import concat

-        new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer]
-        new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer]
+            shifted_dataframes = []
+            for period in periods:
+                if not is_integer(period):
+                    raise TypeError(
+                        f"Periods must be integer, but {period} is {type(period)}."
+                    )
+                period = cast(int, period)
+                shifted_dataframes.append(
+                    super()
+                    .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)
+                    .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")
+                )
+            return concat(shifted_dataframes, axis=1)
+        elif suffix:
+            raise ValueError("Cannot specify `suffix` if `periods` is an int.")
+        periods = cast(int, periods)

-        # GH#60498 For MultiIndex column alignment
-        if isinstance(cols, MultiIndex):
-            # When overwriting column names, make a shallow copy so as to not modify
-            # the input DFs
-            new_left = new_left.copy(deep=False)
-            new_right = new_right.copy(deep=False)
-            new_left.columns = cols
-            new_right.columns = cols
+        ncols = len(self.columns)
+        arrays = self._mgr.arrays
+        if axis == 1 and periods != 0 and ncols > 0 and freq is None:
+            if fill_value is lib.no_default:
+                # We will infer fill_value to match the closest column

-        result = op(new_left, new_right)
+                # Use a column that we know is valid for our column's dtype GH#38434
+                label = self.columns[0]

-        # Do the join on the columns instead of using left._align_for_op
-        #  to avoid constructing two potentially large/sparse DataFrames
-        join_columns = left.columns.join(right.columns, how="outer")

-        if result.columns.has_duplicates:
-            # Avoid reindexing with a duplicate axis.
-            # https://github.com/pandas-dev/pandas/issues/35194
-            indexer, _ = result.columns.get_indexer_non_unique(join_columns)
-            indexer = algorithms.unique1d(indexer)
-            result = result._reindex_with_indexers(
-                {1: [join_columns, indexer]}, allow_dups=True
-            )
-        else:
-            result = result.reindex(join_columns, axis=1)

-        return result

-    def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool:
-        """
-        Check if this is an operation between DataFrames that will need to reindex.
-        """
-        if op is operator.pow or op is roperator.rpow:
-            # GH#32685 pow has special semantics for operating with null values
-            return False

-        if not isinstance(right, DataFrame):
-            return False

-        if (
-            (
-                isinstance(self.columns, MultiIndex)
-                or isinstance(right.columns, MultiIndex)
-            )
-            and not self.columns.equals(right.columns)
-            and fill_value is None
-        ):
-            # GH#60498 Reindex if MultiIndexe columns are not matching
-            # GH#60903 Don't reindex if fill_value is provided
-            return True

+            if periods > 0:
+                result = self.iloc[:, :-periods]
+                for col in range(min(ncols, abs(periods))):
+                    # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
+                    # Define filler inside loop so we get a copy
+                    filler = self.iloc[:, 0].shift(len(self))
+                    result.insert(0, label, filler, allow_duplicates=True)
+            else:
+                result = self.iloc[:, -periods:]
+                for col in range(min(ncols, abs(periods))):
+                    # Define filler inside loop so we get a copy
+                    filler = self.iloc[:, -1].shift(len(self))
+                    result.insert(
+                        len(result.columns), label, filler, allow_duplicates=True
+                    )

-        if fill_value is None and level is None and axis == 1:
-            # TODO: any other cases we should handle here?
+            result.columns = self.columns.copy()
+            return result
+        elif len(arrays) > 1 or (
+            # If we only have one block and we know that we can't
+            #  keep the same dtype (i.e. the _can_hold_element check)
+            #  then we can go through the reindex_indexer path
+            #  (and avoid casting logic in the Block method).
+            not can_hold_element(arrays[0], fill_value)
+        ):
+            # GH#35488 we need to watch out for multi-block cases
+            # We only get here with fill_value not-lib.no_default
+            nper = abs(periods)
+            nper = min(nper, ncols)
+            if periods > 0:
+                indexer = np.array(
+                    [-1] * nper + list(range(ncols - periods)), dtype=np.intp
+                )
+            else:
+                indexer = np.array(
+                    list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
+                )
+            mgr = self._mgr.reindex_indexer(
+                self.columns,
+                indexer,
+                axis=0,
+                fill_value=fill_value,
+                allow_dups=True,
+            )
+            res_df = self._constructor_from_mgr(mgr, axes=mgr.axes)
+            return res_df.__finalize__(self, method="shift")
+        else:
+            return self.T.shift(periods=periods, fill_value=fill_value).T

-            # Intersection is always unique so we have to check the unique columns
-            left_uniques = self.columns.unique()
-            right_uniques = right.columns.unique()
-            cols = left_uniques.intersection(right_uniques)
-            if len(cols) and not (
-                len(cols) == len(left_uniques) and len(cols) == len(right_uniques)
-            ):
-                # TODO: is there a shortcut available when len(cols) == 0?
-                return True
+        return super().shift(
+            periods=periods, freq=freq, axis=axis, fill_value=fill_value
+        )

-        return False
+    @overload
+    def set_index(
+        self,
+        keys,
+        *,
+        drop: bool = ...,
+        append: bool = ...,
+        inplace: Literal[False] = ...,
+        verify_integrity: bool = ...,
+    ) -> DataFrame: ...

+    @overload
+    def set_index(
+        self,
+        keys,
+        *,
+        drop: bool = ...,
+        append: bool = ...,
+        inplace: Literal[True],
+        verify_integrity: bool = ...,
+    ) -> None: ...
-    def _align_for_op(
+    def set_index(
         self,
-        other,
-        axis: AxisInt,
-        flex: bool | None = False,
-        level: Level | None = None,
-    ):
+        keys,
+        *,
+        drop: bool = True,
+        append: bool = False,
+        inplace: bool = False,
+        verify_integrity: bool = False,
+    ) -> DataFrame | None:
         """
-        Convert rhs to meet lhs dims if input is list, tuple or np.ndarray.
+        Set the DataFrame index using existing columns.
+
+        Set the DataFrame index (row labels) using one or more existing
+        columns or arrays (of the correct length). The index can replace the
+        existing index or expand on it.

         Parameters
         ----------
-        other : Any
-        axis : int
-        flex : bool or None, default False
-            Whether this is a flex op, in which case we reindex.
-            None indicates not to check for alignment.
-        level : int or level name, default None
+        keys : label or array-like or list of labels/arrays
+            This parameter can be either a single column key, a single array of
+            the same length as the calling DataFrame, or a list containing an
+            arbitrary combination of column keys and arrays. Here, "array"
+            encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
+            instances of :class:`~collections.abc.Iterator`.
+        drop : bool, default True
+            Delete columns to be used as the new index.
+        append : bool, default False
+            Whether to append columns to existing index.
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        verify_integrity : bool, default False
+            Check the new index for duplicates. Otherwise defer the check until
+            necessary. Setting to False will improve the performance of this
+            method.

         Returns
         -------
-        left : DataFrame
-        right : Any
-        """
-        left, right = self, other
+        DataFrame or None
+            Changed row labels or None if ``inplace=True``.

-        def to_series(right):
-            msg = (
-                "Unable to coerce to Series, "
-                "length must be {req_len}: given {given_len}"
-            )
+        See Also
+        --------
+        DataFrame.reset_index : Opposite of set_index.
+        DataFrame.reindex : Change to new indices or expand indices.
+        DataFrame.reindex_like : Change to same indices as other DataFrame.

-            # pass dtype to avoid doing inference, which would break consistency
-            #  with Index/Series ops
-            dtype = None
-            if getattr(right, "dtype", None) == object:
-                # can't pass right.dtype unconditionally as that would break on e.g.
-                #  datetime64[h] ndarray
-                dtype = object

-            if axis == 0:
-                if len(left.index) != len(right):
-                    raise ValueError(
-                        msg.format(req_len=len(left.index), given_len=len(right))
-                    )
-                right = left._constructor_sliced(right, index=left.index, dtype=dtype)
-            else:
-                if len(left.columns) != len(right):
-                    raise ValueError(
-                        msg.format(req_len=len(left.columns), given_len=len(right))
-                    )
-                right = left._constructor_sliced(right, index=left.columns, dtype=dtype)
-            return right
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "month": [1, 4, 7, 10],
+        ...         "year": [2012, 2014, 2013, 2014],
+        ...         "sale": [55, 40, 84, 31],
+        ...     }
+        ... )
+        >>> df
+           month  year  sale
+        0      1  2012    55
+        1      4  2014    40
+        2      7  2013    84
+        3     10  2014    31

-        if isinstance(right, np.ndarray):
-            if right.ndim == 1:
-                right = to_series(right)
+        Set the index to become the 'month' column:

-            elif right.ndim == 2:
-                # We need to pass dtype=right.dtype to retain object dtype
-                #  otherwise we lose consistency with Index and array ops
-                dtype = None
-                if right.dtype == object:
-                    # can't pass right.dtype unconditionally as that would break on e.g.
-                    #  datetime64[h] ndarray
-                    dtype = object
+        >>> df.set_index("month")
+               year  sale
+        month
+        1      2012    55
+        4      2014    40
+        7      2013    84
+        10     2014    31

-                if right.shape == left.shape:
-                    right = left._constructor(
-                        right, index=left.index, columns=left.columns, dtype=dtype
-                    )
+        Create a MultiIndex using columns 'year' and 'month':

-                elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
-                    # Broadcast across columns
-                    right = np.broadcast_to(right, left.shape)
-                    right = left._constructor(
-                        right, index=left.index, columns=left.columns, dtype=dtype
-                    )
+        >>> df.set_index(["year", "month"])
+                    sale
+        year  month
+        2012  1       55
+        2014  4       40
+        2013  7       84
+        2014  10      31

-                elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
-                    # Broadcast along rows
-                    right = to_series(right[0, :])
+        Create a MultiIndex using an Index and a column:

-                else:
-                    raise ValueError(
-                        "Unable to coerce to DataFrame, shape "
-                        f"must be {left.shape}: given {right.shape}"
-                    )
+        >>> df.set_index([pd.Index([1, 2, 3, 4]), "year"])
+                 month  sale
+           year
+        1  2012      1    55
+        2  2014      4    40
+        3  2013      7    84
+        4  2014     10    31

-            elif right.ndim > 2:
-                raise ValueError(
-                    "Unable to coerce to Series/DataFrame, "
-                    f"dimension must be <= 2: {right.shape}"
-                )

-        elif is_list_like(right) and not isinstance(right, (Series, DataFrame)):
-            # GH#36702. Raise when attempting arithmetic with list of array-like.
-            if any(is_array_like(el) for el in right):
-                raise ValueError(
-                    f"Unable to coerce list of {type(right[0])} to Series/DataFrame"
-                )
-            # GH#17901
-            right = to_series(right)

-        if flex is not None and isinstance(right, DataFrame):
-            if not left._indexed_same(right):
-                if flex:
-                    left, right = left.align(right, join="outer", level=level)
-                else:
-                    raise ValueError(
-                        "Can only compare identically-labeled (both index and columns) "
-                        "DataFrame objects"
-                    )
-        elif isinstance(right, Series):
-            # axis=1 is default for DataFrame-with-Series op
-            axis = axis if axis is not None else 1
-            if not flex:
-                if not left.axes[axis].equals(right.index):
-                    raise ValueError(
-                        "Operands are not aligned. Do "
-                        "`left, right = left.align(right, axis=1)` "
-                        "before operating."
-                    )

-            left, right = left.align(
-                right,
-                join="outer",
-                axis=axis,
-                level=level,
-            )
-            right = left._maybe_align_series_as_frame(right, axis)

-        return left, right

-    def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt):
-        """
-        If the Series operand is not EA-dtype, we can broadcast to 2D and operate
-        blockwise.
+        Create a MultiIndex using two Series:
+
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> df.set_index([s, s**2])
+              month  year  sale
+        1 1       1  2012    55
+        2 4       4  2014    40
+        3 9       7  2013    84
+        4 16     10  2014    31
         """
-        rvalues = series._values
-        if not isinstance(rvalues, np.ndarray):
-            # TODO(EA2D): no need to special-case with 2D EAs
-            if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"):
-                # We can losslessly+cheaply cast to ndarray
-                rvalues = np.asarray(rvalues)
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        self._check_inplace_and_allows_duplicate_labels(inplace)
+        if not isinstance(keys, list):
+            keys = [keys]
+
+        err_msg = (
+            'The parameter "keys" may be a column key, one-dimensional '
+            "array, or a list containing only valid column keys and "
+            "one-dimensional arrays."
+        )
+
+        missing: list[Hashable] = []
+        for col in keys:
+            if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
+                # arrays are fine as long as they are one-dimensional
+                # iterators get converted to list below
+                if getattr(col, "ndim", 1) != 1:
+                    raise ValueError(err_msg)
+            else:
+                # everything else gets tried as a key; see GH 24969
+                try:
+                    found = col in self.columns
+                except TypeError as err:
+                    raise TypeError(
+                        f"{err_msg}. Received column of type {type(col)}"
+                    ) from err
+                else:
+                    if not found:
+                        missing.append(col)
+
+        if missing:
+            raise KeyError(f"None of {missing} are in the columns")
+
+        if inplace:
+            frame = self
+        else:
+            frame = self.copy(deep=False)
+
+        arrays: list[Index] = []
+        names: list[Hashable] = []
+        if append:
+            names = list(self.index.names)
+            if isinstance(self.index, MultiIndex):
+                arrays.extend(
+                    self.index._get_level_values(i) for i in range(self.index.nlevels)
+                )
+            else:
+                arrays.append(self.index)
-            else:
-                return series

-        if axis == 0:
-            rvalues = rvalues.reshape(-1, 1)
-        else:
-            rvalues = rvalues.reshape(1, -1)
+        to_remove: list[Hashable] = []
+        for col in keys:
+            if isinstance(col, MultiIndex):
+                arrays.extend(col._get_level_values(n) for n in range(col.nlevels))
+                names.extend(col.names)
+            elif isinstance(col, (Index, Series)):
+                # if Index then not MultiIndex (treated above)

-        rvalues = np.broadcast_to(rvalues, self.shape)
-        # pass dtype to avoid doing inference
-        return self._constructor(
-            rvalues,
-            index=self.index,
-            columns=self.columns,
-            dtype=rvalues.dtype,
-        ).__finalize__(series)
+                # error: Argument 1 to "append" of "list" has incompatible type
+                #  "Union[Index, Series]"; expected "Index"
+                arrays.append(col)  # type: ignore[arg-type]
+                names.append(col.name)
+            elif isinstance(col, (list, np.ndarray)):
+                # error: Argument 1 to "append" of "list" has incompatible type
+                #  "Union[List[Any], ndarray]"; expected "Index"
+                arrays.append(col)  # type: ignore[arg-type]
+                names.append(None)
+            elif isinstance(col, abc.Iterator):
+                # error: Argument 1 to "append" of "list" has incompatible type
+                #  "List[Any]"; expected "Index"
+                arrays.append(list(col))  # type: ignore[arg-type]
+                names.append(None)
+            # from here, col can only be a column label
+            else:
+                arrays.append(frame[col])
+                names.append(col)
+                if drop:
+                    to_remove.append(col)

-    def _flex_arith_method(
-        self, other, op, *, axis: Axis = "columns", level=None, fill_value=None
-    ):
-        axis = self._get_axis_number(axis) if axis is not None else 1
+        if len(arrays[-1]) != len(self):
+            # check newest element against length of calling frame, since
+            # ensure_index_from_sequences would not raise for append=False.
+            raise ValueError(
+                f"Length mismatch: Expected {len(self)} rows, "
+                f"received array of length {len(arrays[-1])}"
+            )

-        if self._should_reindex_frame_op(other, op, axis, fill_value, level):
-            return self._arith_method_with_reindex(other, op)
+        index = ensure_index_from_sequences(arrays, names)

-        if isinstance(other, Series) and fill_value is not None:
-            # TODO: We could allow this in cases where we end up going
-            #  through the DataFrame path
-            raise NotImplementedError(f"fill_value {fill_value} not supported.")
+        if verify_integrity and not index.is_unique:
+            duplicates = index[index.duplicated()].unique()
+            raise ValueError(f"Index has duplicate keys: {duplicates}")

-        other = ops.maybe_prepare_scalar_for_op(other, self.shape)
-        self, other = self._align_for_op(other, axis, flex=True, level=level)
+        # use set to handle duplicate column names gracefully in case of drop
+        for c in set(to_remove):
+            del frame[c]

-        with np.errstate(all="ignore"):
-            if isinstance(other, DataFrame):
-                # Another DataFrame
-                new_data = self._combine_frame(other, op, fill_value)
+        # clear up memory usage
+        index._cleanup()

-            elif isinstance(other, Series):
-                new_data = self._dispatch_frame_op(other, op, axis=axis)
-            else:
-                # in this case we always have `np.ndim(other) == 0`
-                if fill_value is not None:
-                    self = self.fillna(fill_value)
+        frame.index = index

-                new_data = self._dispatch_frame_op(other, op)
+        if not inplace:
+            return frame
+        return None

-        return self._construct_result(new_data, other=other)
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: bool = ...,
+        inplace: Literal[False] = ...,
+        col_level: Hashable = ...,
+        col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
+        names: Hashable | Sequence[Hashable] | None = None,
+    ) -> DataFrame: ...

-    def _construct_result(self, result, other) -> DataFrame:
-        """
-        Wrap the result of an arithmetic, comparison, or logical operation.
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: bool = ...,
+        inplace: Literal[True],
+        col_level: Hashable = ...,
+        col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
+        names: Hashable | Sequence[Hashable] | None = None,
+    ) -> None: ...

-        Parameters
-        ----------
-        result : DataFrame

-        Returns
-        -------
-        DataFrame
-        """
-        out = self._constructor(result, copy=False).__finalize__(self)
-        # Pin columns instead of passing to constructor for compat with
-        #  non-unique columns case
-        out.columns = self.columns
-        out.index = self.index
-        out = out.__finalize__(other)
-        return out
-    def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
-        # Naive implementation, room for optimization
-        div = self // other
-        mod = self - div * other
-        return div, mod
+    @overload
+    def reset_index(
+        self,
+        level: IndexLabel = ...,
+        *,
+        drop: bool = ...,
+        inplace: bool = ...,
+        col_level: Hashable = ...,
+        col_fill: Hashable = ...,
+        allow_duplicates: bool | lib.NoDefault = ...,
+        names: Hashable | Sequence[Hashable] | None = None,
+    ) -> DataFrame | None: ...

-    def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
-        # Naive implementation, room for optimization
-        div = other // self
-        mod = other - div * self
-        return div, mod
+    def reset_index(
+        self,
+        level: IndexLabel | None = None,
+        *,
+        drop: bool = False,
+        inplace: bool = False,
+        col_level: Hashable = 0,
+        col_fill: Hashable = "",
+        allow_duplicates: bool | lib.NoDefault = lib.no_default,
+        names: Hashable | Sequence[Hashable] | None = None,
+    ) -> DataFrame | None:
+        """
+        Reset the index, or a level of it.

-    def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None):
-        axis = self._get_axis_number(axis) if axis is not None else 1
+        Reset the index of the DataFrame, and use the default one instead.
+        If the DataFrame has a MultiIndex, this method can remove one or more
+        levels.

-        self, other = self._align_for_op(other, axis, flex=True, level=level)
+        Parameters
+        ----------
+        level : int, str, tuple, or list, default None
+            Only remove the given levels from the index. Removes all levels by
+            default.
+        drop : bool, default False
+            Do not try to insert index into dataframe columns. This resets
+            the index to the default integer index.
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        col_level : int or str, default 0
+            If the columns have multiple levels, determines which level the
+            labels are inserted into. By default it is inserted into the first
+            level.
+        col_fill : object, default ''
+            If the columns have multiple levels, determines how the other
+            levels are named. If None then the index name is repeated.
+        allow_duplicates : bool, optional, default lib.no_default
+            Allow duplicate column labels to be created.

-        new_data = self._dispatch_frame_op(other, op, axis=axis)
-        return self._construct_result(new_data, other=other)

-    @Appender(ops.make_flex_doc("eq", "dataframe"))
-    def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame:
-        return self._flex_cmp_method(other, operator.eq, axis=axis, level=level)

-    @Appender(ops.make_flex_doc("ne", "dataframe"))
-    def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame:
-        return self._flex_cmp_method(other, operator.ne, axis=axis, level=level)

-    @Appender(ops.make_flex_doc("le", "dataframe"))
-    def le(self, other, axis: Axis = "columns", level=None) -> DataFrame:
-        return self._flex_cmp_method(other, operator.le, axis=axis, level=level)

-    @Appender(ops.make_flex_doc("lt", "dataframe"))
-    def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
-        return self._flex_cmp_method(other, operator.lt, axis=axis, level=level)

-    @Appender(ops.make_flex_doc("ge", "dataframe"))
-    def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame:
-        return self._flex_cmp_method(other, operator.ge, axis=axis, level=level)

+            .. versionadded:: 1.5.0

+        names : int, str or 1-dimensional list, default None
+            Using the given string, rename the DataFrame column which contains the
+            index data. If the DataFrame has a MultiIndex, this has to be a list or
+            tuple with length equal to the number of levels.

-    @Appender(ops.make_flex_doc("gt", "dataframe"))
-    def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
-        return self._flex_cmp_method(other, operator.gt, axis=axis, level=level)

+            .. versionadded:: 1.5.0

-    @Appender(ops.make_flex_doc("add", "dataframe"))
-    def add(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, operator.add, level=level, fill_value=fill_value, axis=axis
-        )
+        Returns
+        -------
+        DataFrame or None
+            DataFrame with the new index or None if ``inplace=True``.

-    @Appender(ops.make_flex_doc("radd", "dataframe"))
-    def radd(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, roperator.radd, level=level, fill_value=fill_value, axis=axis
-        )
+        See Also
+        --------
+        DataFrame.set_index : Opposite of reset_index.
+        DataFrame.reindex : Change to new indices or expand indices.
+        DataFrame.reindex_like : Change to same indices as other DataFrame.

-    @Appender(ops.make_flex_doc("sub", "dataframe"))
-    def sub(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, operator.sub, level=level, fill_value=fill_value, axis=axis
-        )
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     [("bird", 389.0), ("bird", 24.0), ("mammal", 80.5), ("mammal", np.nan)],
+        ...     index=["falcon", "parrot", "lion", "monkey"],
+        ...     columns=("class", "max_speed"),
+        ... )
+        >>> df
+                 class  max_speed
+        falcon    bird      389.0
+        parrot    bird       24.0
+        lion    mammal       80.5
+        monkey  mammal        NaN

-    subtract = sub

-    @Appender(ops.make_flex_doc("rsub", "dataframe"))
-    def rsub(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, roperator.rsub, level=level, fill_value=fill_value, axis=axis
-        )

-    @Appender(ops.make_flex_doc("mul", "dataframe"))
-    def mul(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, operator.mul, level=level, fill_value=fill_value, axis=axis
-        )

-    multiply = mul

+        When we reset the index, the old index is added as a column, and a
+        new sequential index is used:

-    @Appender(ops.make_flex_doc("rmul", "dataframe"))
-    def rmul(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, roperator.rmul, level=level, fill_value=fill_value, axis=axis
-        )
+        >>> df.reset_index()
+            index   class  max_speed
+        0  falcon    bird      389.0
+        1  parrot    bird       24.0
+        2    lion  mammal       80.5
+        3  monkey  mammal        NaN

-    @Appender(ops.make_flex_doc("truediv", "dataframe"))
-    def truediv(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, operator.truediv, level=level, fill_value=fill_value, axis=axis
-        )
+        We can use the `drop` parameter to avoid the old index being added as
+        a column:

-    div = truediv
-    divide = truediv

-    @Appender(ops.make_flex_doc("rtruediv", "dataframe"))
-    def rtruediv(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis
-        )
+        >>> df.reset_index(drop=True)
+            class  max_speed
+        0    bird      389.0
+        1    bird       24.0
+        2  mammal       80.5
+        3  mammal        NaN

-    rdiv = rtruediv

-    @Appender(ops.make_flex_doc("floordiv", "dataframe"))
-    def floordiv(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, operator.floordiv, level=level, fill_value=fill_value, axis=axis
-        )

-    @Appender(ops.make_flex_doc("rfloordiv", "dataframe"))
-    def rfloordiv(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis
-        )

-    @Appender(ops.make_flex_doc("mod", "dataframe"))
-    def mod(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, operator.mod, level=level, fill_value=fill_value, axis=axis
-        )

-    @Appender(ops.make_flex_doc("rmod", "dataframe"))
-    def rmod(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, roperator.rmod, level=level, fill_value=fill_value, axis=axis
-        )

+        You can also use `reset_index` with `MultiIndex`.

+        >>> index = pd.MultiIndex.from_tuples(
+        ...     [
+        ...         ("bird", "falcon"),
+        ...         ("bird", "parrot"),
+        ...         ("mammal", "lion"),
+        ...         ("mammal", "monkey"),
+        ...     ],
+        ...     names=["class", "name"],
+        ... )
+        >>> columns = pd.MultiIndex.from_tuples([("speed", "max"), ("species", "type")])
+        >>> df = pd.DataFrame(
+        ...     [(389.0, "fly"), (24.0, "fly"), (80.5, "run"), (np.nan, "jump")],
+        ...     index=index,
+        ...     columns=columns,
+        ... )
+        >>> df
+                       speed species
+                         max    type
+        class  name
+        bird   falcon  389.0     fly
+               parrot   24.0     fly
+        mammal lion     80.5     run
+               monkey    NaN    jump

-    @Appender(ops.make_flex_doc("pow", "dataframe"))
-    def pow(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, operator.pow, level=level, fill_value=fill_value, axis=axis
-        )

-    @Appender(ops.make_flex_doc("rpow", "dataframe"))
-    def rpow(
-        self, other, axis: Axis = "columns", level=None, fill_value=None
-    ) -> DataFrame:
-        return self._flex_arith_method(
-            other, roperator.rpow, level=level, fill_value=fill_value, axis=axis
-        )

+        Using the `names` parameter, choose a name for the index column:

-    # ----------------------------------------------------------------------
-    # Combination-Related

+        >>> df.reset_index(names=["classes", "names"])
+          classes   names  speed species
+                             max    type
+        0    bird  falcon  389.0     fly
+        1    bird  parrot   24.0     fly
+        2  mammal    lion   80.5     run
+        3  mammal  monkey    NaN    jump

-    @doc(
-        _shared_docs["compare"],
-        dedent(
-            """
-        Returns
-        -------
-        DataFrame
-            DataFrame that shows the differences stacked side by side.
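A compact check of the ``names`` behavior shown above (sketch; ``names`` requires pandas >= 1.5):

    >>> import pandas as pd
    >>> df = pd.DataFrame({"v": [1]}, index=pd.Index(["x"], name="k"))
    >>> df.reset_index(names=["key"]).columns.tolist()
    ['key', 'v']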
+        If the index has multiple levels, we can reset a subset of them:

-        The resulting index will be a MultiIndex with 'self' and 'other'
-        stacked alternately at the inner level.
+        >>> df.reset_index(level="class")
+                 class  speed species
+                          max    type
+        name
+        falcon    bird  389.0     fly
+        parrot    bird   24.0     fly
+        lion    mammal   80.5     run
+        monkey  mammal    NaN    jump

-        Raises
-        ------
-        ValueError
-            When the two DataFrames don't have identical labels or shape.
+        If we are not dropping the index, by default, it is placed in the top
+        level. We can place it in another level:

-        See Also
-        --------
-        Series.compare : Compare with another Series and show differences.
-        DataFrame.equals : Test whether two objects contain the same elements.
+        >>> df.reset_index(level="class", col_level=1)
+                       speed species
+                class    max    type
+        name
+        falcon   bird  389.0     fly
+        parrot   bird   24.0     fly
+        lion   mammal   80.5     run
+        monkey mammal    NaN    jump

-        Notes
-        -----
-        Matching NaNs will not appear as a difference.
+        When the index is inserted under another level, we can specify under
+        which one with the parameter `col_fill`:

-        Can only compare identically-labeled
-        (i.e. same shape, identical row and column labels) DataFrames
+        >>> df.reset_index(level="class", col_level=1, col_fill="species")
+                      species  speed species
+                        class    max    type
+        name
+        falcon           bird  389.0     fly
+        parrot           bird   24.0     fly
+        lion           mammal   80.5     run
+        monkey         mammal    NaN    jump

-        Examples
-        --------
-        >>> df = pd.DataFrame(
-        ...     {{
-        ...         "col1": ["a", "a", "b", "b", "a"],
-        ...         "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
-        ...         "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
-        ...     }},
-        ...     columns=["col1", "col2", "col3"],
-        ... )
-        >>> df
-          col1  col2  col3
-        0    a   1.0   1.0
-        1    a   2.0   2.0
-        2    b   3.0   3.0
-        3    b   NaN   4.0
-        4    a   5.0   5.0
-
-        >>> df2 = df.copy()
-        >>> df2.loc[0, 'col1'] = 'c'
-        >>> df2.loc[2, 'col3'] = 4.0
-        >>> df2
-          col1  col2  col3
-        0    c   1.0   1.0
-        1    a   2.0   2.0
-        2    b   3.0   4.0
-        3    b   NaN   4.0
-        4    a   5.0   5.0
-
-        Align the differences on columns
-
-        >>> df.compare(df2)
-          col1       col3
-          self other self other
-        0    a     c  NaN   NaN
-        2  NaN   NaN  3.0   4.0
-
-        Assign result_names
-
-        >>> df.compare(df2, result_names=("left", "right"))
-          col1        col3
-          left right  left right
-        0    a     c   NaN   NaN
-        2  NaN   NaN   3.0   4.0
-
-        Stack the differences on rows
-
-        >>> df.compare(df2, align_axis=0)
-                col1  col3
-        0 self     a   NaN
-          other    c   NaN
-        2 self   NaN   3.0
-          other  NaN   4.0
-
-        Keep the equal values
-
-        >>> df.compare(df2, keep_equal=True)
-          col1       col3
-          self other self other
-        0    a     c  1.0   1.0
-        2    b     b  3.0   4.0
-
-        Keep all original rows and columns
-
-        >>> df.compare(df2, keep_shape=True)
-          col1       col2       col3
-          self other self other self other
-        0    a     c  NaN   NaN  NaN   NaN
-        1  NaN   NaN  NaN   NaN  NaN   NaN
-        2  NaN   NaN  NaN   NaN  3.0   4.0
-        3  NaN   NaN  NaN   NaN  NaN   NaN
-        4  NaN   NaN  NaN   NaN  NaN   NaN
-
-        Keep all original rows and columns and also all original values
-
-        >>> df.compare(df2, keep_shape=True, keep_equal=True)
-          col1       col2       col3
-          self other self other self other
-        0    a     c  1.0   1.0  1.0   1.0
-        1    a     a  2.0   2.0  2.0   2.0
-        2    b     b  3.0   3.0  3.0   4.0
-        3    b     b  NaN   NaN  4.0   4.0
-        4    a     a  5.0   5.0  5.0   5.0
-        """
-        ),
-        klass=_shared_doc_kwargs["klass"],
-    )
-    def compare(
-        self,
-        other: DataFrame,
-        align_axis: Axis = 1,
-        keep_shape: bool = False,
-        keep_equal: bool = False,
-        result_names: Suffixes = ("self", "other"),
-    ) -> DataFrame:
-        return super().compare(
-            other=other,
-            align_axis=align_axis,
-            keep_shape=keep_shape,
-            keep_equal=keep_equal,
-            result_names=result_names,
-        )

-    def combine(
-        self,
-        other: DataFrame,
-        func: Callable[[Series, Series], Series | Hashable],
-        fill_value=None,
-        overwrite: bool = True,
-    ) -> DataFrame:
+        If we specify a nonexistent level for `col_fill`, it is created:
+
+        >>> df.reset_index(level="class", col_level=1, col_fill="genus")
+                        genus  speed species
+                        class    max    type
+        name
+        falcon           bird  389.0     fly
+        parrot           bird   24.0     fly
+        lion           mammal   80.5     run
+        monkey         mammal    NaN    jump
         """
-        Perform column-wise combine with another DataFrame.
-
-        Combines a DataFrame with `other` DataFrame using `func`
-        to element-wise combine columns. The row and column indexes of the
-        resulting DataFrame will be the union of the two.
+        inplace = validate_bool_kwarg(inplace, "inplace")
+        self._check_inplace_and_allows_duplicate_labels(inplace)
+        if inplace:
+            new_obj = self
+        else:
+            new_obj = self.copy(deep=False)
+        if allow_duplicates is not lib.no_default:
+            allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")

-        Parameters
-        ----------
-        other : DataFrame
-            The DataFrame to merge column-wise.
-        func : function
-            Function that takes two series as inputs and return a Series or a
-            scalar. Used to merge the two dataframes column by columns.
-        fill_value : scalar value, default None
-            The value to fill NaNs with prior to passing any column to the
-            merge func.
-        overwrite : bool, default True
-            If True, columns in `self` that do not exist in `other` will be
-            overwritten with NaNs.
+        new_index = default_index(len(new_obj))
+        if level is not None:
+            if not isinstance(level, (tuple, list)):
+                level = [level]
+            level = [self.index._get_level_number(lev) for lev in level]
+            if len(level) < self.index.nlevels:
+                new_index = self.index.droplevel(level)

-        Returns
-        -------
-        DataFrame
-            Combination of the provided DataFrames.
+        if not drop:
+            to_insert: Iterable[tuple[Any, Any | None]]
+
+            default = "index" if "index" not in self else "level_0"
+            names = self.index._get_default_index_names(names, default)

-        See Also
-        --------
-        DataFrame.combine_first : Combine two DataFrame objects and default to
-            non-null values in frame calling the method.

-        Examples
-        --------
-        Combine using a simple function that chooses the smaller column.

-        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]})
-        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
-        >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
-        >>> df1.combine(df2, take_smaller)
-           A  B
-        0  0  3
-        1  0  3

-        Example using a true element-wise combine function.

-        >>> df1 = pd.DataFrame({"A": [5, 0], "B": [2, 4]})
-        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
-        >>> df1.combine(df2, np.minimum)
-           A  B
-        0  1  2
-        1  0  3

+            if isinstance(self.index, MultiIndex):
+                to_insert = zip(self.index.levels, self.index.codes)
+            else:
+                to_insert = ((self.index, None),)

-        Using `fill_value` fills Nones prior to passing the column to the
-        merge function.
+            multi_col = isinstance(self.columns, MultiIndex)
+            for i, (lev, lab) in reversed(list(enumerate(to_insert))):
+                if level is not None and i not in level:
+                    continue
+                name = names[i]
+                if multi_col:
+                    col_name = list(name) if isinstance(name, tuple) else [name]
+                    if col_fill is None:
+                        if len(col_name) not in (1, self.columns.nlevels):
+                            raise ValueError(
+                                "col_fill=None is incompatible "
+                                f"with incomplete column name {name}"
+                            )
+                        col_fill = col_name[0]

-        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]})
-        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
-        >>> df1.combine(df2, take_smaller, fill_value=-5)
-           A    B
-        0  0 -5.0
-        1  0  4.0

-        However, if the same element in both dataframes is None, that None
-        is preserved

-        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [None, 4]})
-        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [None, 3]})
-        >>> df1.combine(df2, take_smaller, fill_value=-5)
-           A    B
-        0  0 -5.0
-        1  0  3.0

-        Example that demonstrates the use of `overwrite` and behavior when
-        the axis differ between the dataframes.

+                    lev_num = self.columns._get_level_number(col_level)
+                    name_lst = [col_fill] * lev_num + col_name
+                    missing = self.columns.nlevels - len(name_lst)
+                    name_lst += [col_fill] * missing
+                    name = tuple(name_lst)

-        >>> df1 = pd.DataFrame({"A": [0, 0], "B": [4, 4]})
-        >>> df2 = pd.DataFrame(
-        ...     {
-        ...         "B": [3, 3],
-        ...         "C": [-10, 1],
-        ...     },
-        ...     index=[1, 2],
-        ... )
-        >>> df1.combine(df2, take_smaller)
-             A    B     C
-        0  NaN  NaN   NaN
-        1  NaN  3.0 -10.0
-        2  NaN  3.0   1.0

+                # to ndarray and maybe infer different dtype
+                level_values = lev._values
+                if level_values.dtype == np.object_:
+                    level_values = lib.maybe_convert_objects(level_values)

-        >>> df1.combine(df2, take_smaller, overwrite=False)
-             A    B     C
-        0  0.0  NaN   NaN
-        1  0.0  3.0 -10.0
-        2  NaN  3.0   1.0

+                if lab is not None:
+                    # if we have the codes, extract the values with a mask
+                    level_values = algorithms.take(
+                        level_values, lab, allow_fill=True, fill_value=lev._na_value
+                    )

-        Demonstrating the preference of the passed in dataframe.

-        >>> df2 = pd.DataFrame(
-        ...     {
-        ...         "B": [3, 3],
-        ...         "C": [1, 1],
-        ...     },
-        ...     index=[1, 2],
-        ... )
-        >>> df2.combine(df1, take_smaller)
-             A    B   C
-        0  0.0  NaN NaN
-        1  0.0  3.0 NaN
-        2  NaN  3.0 NaN
+                new_obj.insert(
+                    0,
+                    name,
+                    level_values,
+                    allow_duplicates=allow_duplicates,
+                )

-        >>> df2.combine(df1, take_smaller, overwrite=False)
-             A    B   C
-        0  0.0  NaN NaN
-        1  0.0  3.0 1.0
-        2  NaN  3.0 1.0
-        """
-        other_idxlen = len(other.index)  # save for compare
-        other_columns = other.columns
+        new_obj.index = new_index
+        if not inplace:
+            return new_obj

-        this, other = self.align(other)
-        new_index = this.index
+        return None

-        if other.empty and len(new_index) == len(self.index):
-            return self.copy()
+    # ----------------------------------------------------------------------
+    # Reindex-based selection methods

-        if self.empty and len(other) == other_idxlen:
-            return other.copy()
-
-        # preserve column order
-        new_columns = self.columns.union(other_columns, sort=False)
-        do_fill = fill_value is not None
-        result = {}
-        for col in new_columns:
-            series = this[col]
-            other_series = other[col]
-
-            this_dtype = series.dtype
-            other_dtype = other_series.dtype
-
-            this_mask = isna(series)
-            other_mask = isna(other_series)
-
-            # don't overwrite columns unnecessarily
-            # DO propagate if this column is not in the intersection
-            if not overwrite and other_mask.all():
-                result[col] = this[col].copy()
-                continue
-
-            if do_fill:
-                series = series.copy()
-                other_series = other_series.copy()
-                series[this_mask] = fill_value
-                other_series[other_mask] = fill_value
-
-            if col not in self.columns:
-                # If self DataFrame does not have col in other DataFrame,
-                # try to promote series, which is all NaN, as other_dtype.
-                new_dtype = other_dtype
-                try:
-                    series = series.astype(new_dtype)
-                except ValueError:
-                    # e.g. new_dtype is integer types
-                    pass
-            else:
-                # if we have different dtypes, possibly promote
-                new_dtype = find_common_type([this_dtype, other_dtype])
-                series = series.astype(new_dtype)
-                other_series = other_series.astype(new_dtype)
+    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
+    def isna(self) -> DataFrame:
+        res_mgr = self._mgr.isna(func=isna)
+        result = self._constructor_from_mgr(res_mgr, axes=res_mgr.axes)
+        return result.__finalize__(self, method="isna")

-            arr = func(series, other_series)
-            if isinstance(new_dtype, np.dtype):
-                # if new_dtype is an EA Dtype, then `func` is expected to return
-                # the correct dtype without any additional casting
-                # error: No overload variant of "maybe_downcast_to_dtype" matches
-                # argument types "Union[Series, Hashable]", "dtype[Any]"
-                arr = maybe_downcast_to_dtype(  # type: ignore[call-overload]
-                    arr, new_dtype
-                )

-            result[col] = arr

-        # convert_objects just in case
-        frame_result = self._constructor(result, index=new_index, columns=new_columns)
-        return frame_result.__finalize__(self, method="combine")
+    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
+    def isnull(self) -> DataFrame:
+        """
-    def combine_first(self, other: DataFrame) -> DataFrame:
-        """
-        Update null elements with value in the same location in `other`.
-
-        Combine two DataFrame objects by filling null values in one DataFrame
-        with non-null values from other DataFrame. The row and column indexes
-        of the resulting DataFrame will be the union of the two. The resulting
-        dataframe contains the 'first' dataframe values and overrides the
-        second one values where both first.loc[index, col] and
-        second.loc[index, col] are not missing values, upon calling
-        first.combine_first(second).
+        DataFrame.isnull is an alias for DataFrame.isna.
+        """
+        return self.isna()

-        Parameters
-        ----------
-        other : DataFrame
-            Provided DataFrame to use to fill null values.

-        Returns
-        -------
-        DataFrame
-            The result of combining the provided DataFrame with the other object.

-        See Also
-        --------
-        DataFrame.combine : Perform series-wise operation on two DataFrames
-            using a given function.

-        Examples
-        --------
-        >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]})
-        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
-        >>> df1.combine_first(df2)
-             A    B
-        0  1.0  3.0
-        1  0.0  4.0

+    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
+    def notna(self) -> DataFrame:
+        return ~self.isna()

-        Null values still persist if the location of that null value
-        does not exist in `other`
+    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
+    def notnull(self) -> DataFrame:
+        """
+        DataFrame.notnull is an alias for DataFrame.notna.
+        """
+        return ~self.isna()

-        >>> df1 = pd.DataFrame({"A": [None, 0], "B": [4, None]})
-        >>> df2 = pd.DataFrame({"B": [3, 3], "C": [1, 1]}, index=[1, 2])
-        >>> df1.combine_first(df2)
-             A    B    C
-        0  NaN  4.0  NaN
-        1  0.0  3.0  1.0
-        2  NaN  3.0  1.0
-        """
-        from pandas.core.computation import expressions
-
-        def combiner(x: Series, y: Series):
-            mask = x.isna()._values
-
-            x_values = x._values
-            y_values = y._values
-
-            # If the column y in other DataFrame is not in first DataFrame,
-            # just return y_values.
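The alias relationships above (``isnull``/``isna`` and ``notnull``/``notna``) can be verified directly (sketch; standard pandas assumed):

    >>> import pandas as pd
    >>> import numpy as np
    >>> df = pd.DataFrame({"a": [1.0, np.nan]})
    >>> df.notnull().equals(df.notna()) and df.isnull().equals(df.isna())
    True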
-            if y.name not in self.columns:
-                return y_values
-
-            return expressions.where(mask, y_values, x_values)
-
-        if len(other) == 0:
-            combined = self.reindex(
-                self.columns.append(other.columns.difference(self.columns)), axis=1
-            )
-            combined = combined.astype(other.dtypes)
-        else:
-            combined = self.combine(other, combiner, overwrite=False)
-
-        dtypes = {
-            col: find_common_type([self.dtypes[col], other.dtypes[col]])
-            for col in self.columns.intersection(other.columns)
-            if combined.dtypes[col] != self.dtypes[col]
-        }
-
-        if dtypes:
-            combined = combined.astype(dtypes)
-
-        return combined.__finalize__(self, method="combine_first")
+    @overload
+    def dropna(
+        self,
+        *,
+        axis: Axis = ...,
+        how: AnyAll | lib.NoDefault = ...,
+        thresh: int | lib.NoDefault = ...,
+        subset: IndexLabel = ...,
+        inplace: Literal[True],
+        ignore_index: bool = ...,
+    ) -> None: ...

+    @overload
+    def dropna(
+        self,
+        *,
+        axis: Axis = ...,
+        how: AnyAll | lib.NoDefault = ...,
+        thresh: int | lib.NoDefault = ...,
+        subset: IndexLabel = ...,
+        inplace: Literal[False] = ...,
+        ignore_index: bool = ...,
+    ) -> DataFrame: ...

-    def update(
+    def dropna(
        self,
-        other,
-        join: UpdateJoin = "left",
-        overwrite: bool = True,
-        filter_func=None,
-        errors: IgnoreRaise = "ignore",
-    ) -> None:
+        *,
+        axis: Axis = 0,
+        how: AnyAll | lib.NoDefault = lib.no_default,
+        thresh: int | lib.NoDefault = lib.no_default,
+        subset: IndexLabel | AnyArrayLike | None = None,
+        inplace: bool = False,
+        ignore_index: bool = False,
+    ) -> DataFrame | None:
        """
-        Modify in place using non-NA values from another DataFrame.
+        Remove missing values.

-        Aligns on indices. There is no return value.
+        See the :ref:`User Guide ` for more on which values are
+        considered missing, and how to work with missing data.

        Parameters
        ----------
-        other : DataFrame, or object coercible into a DataFrame
-            Should have at least one matching index/column label
-            with the original DataFrame. If a Series is passed,
-            its name attribute must be set, and that will be
-            used as the column name to align with the original DataFrame.
-        join : {'left'}, default 'left'
-            Only left join is implemented, keeping the index and columns of the
-            original object.
-        overwrite : bool, default True
-            How to handle non-NA values for overlapping keys:
-
-            * True: overwrite original DataFrame's values
-              with values from `other`.
-            * False: only update values that are NA in
-              the original DataFrame.
-
-        filter_func : callable(1d-array) -> bool 1d-array, optional
-            Can choose to replace values other than NA. Return True for values
-            that should be updated.
-        errors : {'raise', 'ignore'}, default 'ignore'
-            If 'raise', will raise a ValueError if the DataFrame and `other`
-            both contain non-NA data in the same place.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Determine if rows or columns which contain missing values are
+            removed.
+
+            * 0, or 'index' : Drop rows which contain missing values.
+            * 1, or 'columns' : Drop columns which contain missing values.
+
+            Only a single axis is allowed.
+ + how : {'any', 'all'}, default 'any' + Determine if row or column is removed from DataFrame, when we have + at least one NA or all NA. + + * 'any' : If any NA values are present, drop that row or column. + * 'all' : If all values are NA, drop that row or column. + + thresh : int, optional + Require that many non-NA values. Cannot be combined with how. + subset : column label or sequence of labels, optional + Labels along other axis to consider, e.g. if you are dropping rows + these would be a list of columns to include. + inplace : bool, default False + Whether to modify the DataFrame rather than creating a new one. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + + .. versionadded:: 2.0.0 Returns ------- - None - This method directly changes calling object. - - Raises - ------ - ValueError - * When `errors='raise'` and there's overlapping non-NA data. - * When `errors` is not either `'ignore'` or `'raise'` - NotImplementedError - * If `join != 'left'` + DataFrame or None + DataFrame with NA entries dropped from it or None if ``inplace=True``. See Also -------- - dict.update : Similar method for dictionaries. - DataFrame.merge : For column(s)-on-column(s) operations. - - Notes - ----- - 1. Duplicate indices on `other` are not supported and raises `ValueError`. + DataFrame.isna: Indicate missing values. + DataFrame.notna : Indicate existing (non-missing) values. + DataFrame.fillna : Replace missing values. + Series.dropna : Drop missing values. + Index.dropna : Drop missing indices. Examples -------- - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]}) - >>> new_df = pd.DataFrame({"B": [4, 5, 6], "C": [7, 8, 9]}) - >>> df.update(new_df) + >>> df = pd.DataFrame( + ... { + ... "name": ["Alfred", "Batman", "Catwoman"], + ... "toy": [np.nan, "Batmobile", "Bullwhip"], + ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT], + ... } + ... ) >>> df - A B - 0 1 4 - 1 2 5 - 2 3 6 + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT - The DataFrame's length does not increase as a result of the update, - only values at matching index/column labels are updated. + Drop the rows where at least one element is missing. - >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) - >>> new_df = pd.DataFrame({"B": ["d", "e", "f", "g", "h", "i"]}) - >>> df.update(new_df) - >>> df - A B - 0 a d - 1 b e - 2 c f + >>> df.dropna() + name toy born + 1 Batman Batmobile 1940-04-25 - >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) - >>> new_df = pd.DataFrame({"B": ["d", "f"]}, index=[0, 2]) - >>> df.update(new_df) - >>> df - A B - 0 a d - 1 b y - 2 c f + Drop the columns where at least one element is missing. + + >>> df.dropna(axis="columns") + name + 0 Alfred + 1 Batman + 2 Catwoman - For Series, its name attribute must be set. + Drop the rows where all elements are missing. - >>> df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) - >>> new_column = pd.Series(["d", "e", "f"], name="B") - >>> df.update(new_column) - >>> df - A B - 0 a d - 1 b e - 2 c f + >>> df.dropna(how="all") + name toy born + 0 Alfred NaN NaT + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT + + Keep only the rows with at least 2 non-NA values. + + >>> df.dropna(thresh=2) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT - If `other` contains NaNs the corresponding values are not updated - in the original dataframe. 
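A short runnable sketch of the `how`/`thresh` semantics documented above, reusing the docstring's own frame; the mutual-exclusion error is the TypeError raised later in the implementation:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "name": ["Alfred", "Batman", "Catwoman"],
        "toy": [np.nan, "Batmobile", "Bullwhip"],
        "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    }
)

print(df.dropna())          # how='any' is the default: only the Batman row survives
print(df.dropna(thresh=2))  # keep rows holding at least two non-NA values

# how and thresh are mutually exclusive: passing both raises
# TypeError("You cannot set both the how and thresh arguments at the same time.")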
+ Define in which columns to look for missing values. - >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400.0, 500.0, 600.0]}) - >>> new_df = pd.DataFrame({"B": [4, np.nan, 6]}) - >>> df.update(new_df) - >>> df - A B - 0 1 4.0 - 1 2 500.0 - 2 3 6.0 + >>> df.dropna(subset=["name", "toy"]) + name toy born + 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT """ - if not PYPY: - if sys.getrefcount(self) <= REF_COUNT: - warnings.warn( - _chained_assignment_method_msg, - ChainedAssignmentError, - stacklevel=2, - ) - - # TODO: Support other joins - if join != "left": # pragma: no cover - raise NotImplementedError("Only left join is supported") - if errors not in ["ignore", "raise"]: - raise ValueError("The parameter errors must be either 'ignore' or 'raise'") + if (how is not lib.no_default) and (thresh is not lib.no_default): + raise TypeError( + "You cannot set both the how and thresh arguments at the same time." + ) - if not isinstance(other, DataFrame): - other = DataFrame(other) + if how is lib.no_default: + how = "any" - if other.index.has_duplicates: - raise ValueError("Update not allowed with duplicate indexes on other.") + inplace = validate_bool_kwarg(inplace, "inplace") + if isinstance(axis, (tuple, list)): + # GH20987 + raise TypeError("supplying multiple axes to axis is no longer supported.") - index_intersection = other.index.intersection(self.index) - if index_intersection.empty: - raise ValueError( - "Update not allowed when the index on `other` has no intersection " - "with this dataframe." - ) + axis = self._get_axis_number(axis) + agg_axis = 1 - axis - other = other.reindex(index_intersection) - this_data = self.loc[index_intersection] + agg_obj = self + if subset is not None: + # subset needs to be list + if not is_list_like(subset): + subset = [cast(Hashable, subset)] + ax = self._get_axis(agg_axis) + indices = ax.get_indexer_for(subset) + check = indices == -1 + if check.any(): + raise KeyError(np.array(subset)[check].tolist()) + agg_obj = self.take(indices, axis=agg_axis) - for col in self.columns.intersection(other.columns): - this = this_data[col] - that = other[col] + if thresh is not lib.no_default: + count = agg_obj.count(axis=agg_axis) + mask = count >= thresh + elif how == "any": + # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]' + mask = notna(agg_obj).all(axis=agg_axis, bool_only=False) + elif how == "all": + # faster equivalent to 'agg_obj.count(agg_axis) > 0' + mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) + else: + raise ValueError(f"invalid how option: {how}") - if filter_func is not None: - mask = ~filter_func(this) | isna(that) - else: - if errors == "raise": - mask_this = notna(that) - mask_that = notna(this) - if any(mask_this & mask_that): - raise ValueError("Data overlaps.") - - if overwrite: - mask = isna(that) - else: - mask = notna(this) + if np.all(mask): + result = self.copy(deep=False) + else: + result = self.loc(axis=axis)[mask] - # don't overwrite columns unnecessarily - if mask.all(): - continue + if ignore_index: + result.index = default_index(len(result)) - self.loc[index_intersection, col] = this.where(mask, that) + if not inplace: + return result + self._update_inplace(result) + return None - # ---------------------------------------------------------------------- - # Data reshaping - @Appender( - dedent( - """ - Examples - -------- - >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 
'Max Speed': [380., 370., 24., 26.]}) - >>> df - Animal Max Speed - 0 Falcon 380.0 - 1 Falcon 370.0 - 2 Parrot 24.0 - 3 Parrot 26.0 - >>> df.groupby(['Animal']).mean() - Max Speed - Animal - Falcon 375.0 - Parrot 25.0 - - **Hierarchical Indexes** - - We can groupby different levels of a hierarchical index - using the `level` parameter: - - >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] - >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) - >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - ... index=index) - >>> df - Max Speed - Animal Type - Falcon Captive 390.0 - Wild 350.0 - Parrot Captive 30.0 - Wild 20.0 - >>> df.groupby(level=0).mean() - Max Speed - Animal - Falcon 370.0 - Parrot 25.0 - >>> df.groupby(level="Type").mean() - Max Speed - Type - Captive 210.0 - Wild 185.0 - - We can also choose to include NA in group keys or not by setting - `dropna` parameter, the default setting is `True`. - - >>> arr = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] - >>> df = pd.DataFrame(arr, columns=["a", "b", "c"]) - - >>> df.groupby(by=["b"]).sum() - a c - b - 1.0 2 3 - 2.0 2 5 - - >>> df.groupby(by=["b"], dropna=False).sum() - a c - b - 1.0 2 3 - 2.0 2 5 - NaN 1 4 - - >>> arr = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] - >>> df = pd.DataFrame(arr, columns=["a", "b", "c"]) - - >>> df.groupby(by="a").sum() - b c - a - a 13.0 13.0 - b 12.3 123.0 - - >>> df.groupby(by="a", dropna=False).sum() - b c - a - a 13.0 13.0 - b 12.3 123.0 - NaN 12.3 33.0 - - When using ``.apply()``, use ``group_keys`` to include or exclude the - group keys. The ``group_keys`` argument defaults to ``True`` (include). - - >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) - >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x) - Max Speed - Animal - Falcon 0 380.0 - 1 370.0 - Parrot 2 24.0 - 3 26.0 - - >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) - Max Speed - 0 380.0 - 1 370.0 - 2 24.0 - 3 26.0 - """ - ) - ) - @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) - def groupby( + @overload + def drop_duplicates( self, - by=None, - level: IndexLabel | None = None, - as_index: bool = True, - sort: bool = True, - group_keys: bool = True, - observed: bool = True, - dropna: bool = True, - ) -> DataFrameGroupBy: - from pandas.core.groupby.generic import DataFrameGroupBy + subset: Hashable | Sequence[Hashable] | None = ..., + *, + keep: DropKeep = ..., + inplace: Literal[True], + ignore_index: bool = ..., + ) -> None: ... - if level is None and by is None: - raise TypeError("You have to supply one of 'by' and 'level'") + @overload + def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = ..., + *, + keep: DropKeep = ..., + inplace: Literal[False] = ..., + ignore_index: bool = ..., + ) -> DataFrame: ... - return DataFrameGroupBy( - obj=self, - keys=by, - level=level, - as_index=as_index, - sort=sort, - group_keys=group_keys, - observed=observed, - dropna=dropna, - ) + @overload + def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = ..., + *, + keep: DropKeep = ..., + inplace: bool = ..., + ignore_index: bool = ..., + ) -> DataFrame | None: ... - _shared_docs["pivot"] = """ - Return reshaped DataFrame organized by given index / column values. 
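A compact, self-contained sketch of the no-aggregation `pivot` reshape summarized in the removed docstring above (data borrowed from its examples):

import pandas as pd

df = pd.DataFrame(
    {
        "foo": ["one", "one", "one", "two", "two", "two"],
        "bar": ["A", "B", "C", "A", "B", "C"],
        "baz": [1, 2, 3, 4, 5, 6],
    }
)

# Unique (foo, bar) pairs become the cells; duplicate pairs would raise ValueError.
print(df.pivot(index="foo", columns="bar", values="baz"))
# bar  A  B  C
# foo
# one  1  2  3
# two  4  5  6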
+ def drop_duplicates( + self, + subset: Hashable | Sequence[Hashable] | None = None, + *, + keep: DropKeep = "first", + inplace: bool = False, + ignore_index: bool = False, + ) -> DataFrame | None: + """ + Return DataFrame with duplicate rows removed. - Reshape data (produce a "pivot" table) based on column values. Uses - unique values from specified `index` / `columns` to form axes of the - resulting DataFrame. This function does not support data - aggregation, multiple values will result in a MultiIndex in the - columns. See the :ref:`User Guide ` for more on reshaping. + Considering certain columns is optional. Indexes, including time indexes + are ignored. Parameters - ----------%s - columns : Hashable or a sequence of the previous - Column to use to make new frame's columns. - index : Hashable or a sequence of the previous, optional - Column to use to make new frame's index. If not given, uses existing index. - values : Hashable or a sequence of the previous, optional - Column(s) to use for populating new frame's values. If not - specified, all remaining columns will be used and the result will - have hierarchically indexed columns. + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', ``False``}, default 'first' + Determines which duplicates (if any) to keep. + + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + inplace : bool, default ``False`` + Whether to modify the DataFrame rather than creating a new one. + ignore_index : bool, default ``False`` + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. Returns ------- - DataFrame - Returns reshaped DataFrame. - - Raises - ------ - ValueError: - When there are any `index`, `columns` combinations with multiple - values. `DataFrame.pivot_table` when you need to aggregate. + DataFrame or None + DataFrame with duplicates removed or None if ``inplace=True``. See Also -------- - DataFrame.pivot_table : Generalization of pivot that can handle - duplicate values for one index/column pair. - DataFrame.unstack : Pivot based on the index values instead of a - column. - wide_to_long : Wide panel to long format. Less flexible but more - user-friendly than melt. + DataFrame.value_counts: Count unique combinations of columns. Notes ----- - For finer-tuned control, see hierarchical indexing documentation along - with the related stack/unstack methods. - - Reference :ref:`the user guide ` for more examples. + This method requires columns specified by ``subset`` to be of hashable type. + Passing unhashable columns will raise a ``TypeError``. Examples -------- - >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', - ... 'two'], - ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], - ... 'baz': [1, 2, 3, 4, 5, 6], - ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) - >>> df - foo bar baz zoo - 0 one A 1 x - 1 one B 2 y - 2 one C 3 z - 3 two A 4 q - 4 two B 5 w - 5 two C 6 t - - >>> df.pivot(index='foo', columns='bar', values='baz') - bar A B C - foo - one 1 2 3 - two 4 5 6 - - >>> df.pivot(index='foo', columns='bar')['baz'] - bar A B C - foo - one 1 2 3 - two 4 5 6 - - >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) - baz zoo - bar A B C A B C - foo - one 1 2 3 x y z - two 4 5 6 q w t - - You could also assign a list of column names or a list of index names. 
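A small runnable sketch of the `keep` options described above, using the same ramen-rating frame; the `keep=False` call is illustrative and drops every duplicated row:

import pandas as pd

df = pd.DataFrame(
    {
        "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"],
        "style": ["cup", "cup", "cup", "pack", "pack"],
        "rating": [4, 4, 3.5, 15, 5],
    }
)

# keep='first' (the default) retains row 0 of the duplicated pair;
# keep=False removes both Yum Yum rows entirely.
print(df.drop_duplicates())
print(df.drop_duplicates(keep=False))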
- - >>> df = pd.DataFrame({ - ... "lev1": [1, 1, 1, 2, 2, 2], - ... "lev2": [1, 1, 2, 1, 1, 2], - ... "lev3": [1, 2, 1, 2, 1, 2], - ... "lev4": [1, 2, 3, 4, 5, 6], - ... "values": [0, 1, 2, 3, 4, 5]}) - >>> df - lev1 lev2 lev3 lev4 values - 0 1 1 1 1 0 - 1 1 1 2 2 1 - 2 1 2 1 3 2 - 3 2 1 2 4 3 - 4 2 1 1 5 4 - 5 2 2 2 6 5 - - >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") - lev2 1 2 - lev3 1 2 1 2 - lev1 - 1 0.0 1.0 2.0 NaN - 2 4.0 3.0 NaN 5.0 - - >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") - lev3 1 2 - lev1 lev2 - 1 1 0.0 1.0 - 2 2.0 NaN - 2 1 4.0 3.0 - 2 NaN 5.0 - - A ValueError is raised if there are any duplicates. - - >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], - ... "bar": ['A', 'A', 'B', 'C'], - ... "baz": [1, 2, 3, 4]}) - >>> df - foo bar baz - 0 one A 1 - 1 one A 2 - 2 two B 3 - 3 two C 4 - - Notice that the first two rows are the same for our `index` - and `columns` arguments. - - >>> df.pivot(index='foo', columns='bar', values='baz') - Traceback (most recent call last): - ... - ValueError: Index contains duplicate entries, cannot reshape - """ - - @Substitution("") - @Appender(_shared_docs["pivot"]) - def pivot( - self, *, columns, index=lib.no_default, values=lib.no_default - ) -> DataFrame: - from pandas.core.reshape.pivot import pivot + Consider dataset containing ramen rating. - return pivot(self, index=index, columns=columns, values=values) + >>> df = pd.DataFrame( + ... { + ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], + ... "style": ["cup", "cup", "cup", "pack", "pack"], + ... "rating": [4, 4, 3.5, 15, 5], + ... } + ... ) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 - _shared_docs["pivot_table"] = """ - Create a spreadsheet-style pivot table as a DataFrame. + By default, it removes duplicate rows based on all columns. - The levels in the pivot table will be stored in MultiIndex objects - (hierarchical indexes) on the index and columns of the result DataFrame. + >>> df.drop_duplicates() + brand style rating + 0 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 - Parameters - ----------%s - values : list-like or scalar, optional - Column or columns to aggregate. - index : column, Grouper, array, or sequence of the previous - Keys to group by on the pivot table index. If a list is passed, - it can contain any of the other types (except list). If an array is - passed, it must be the same length as the data and will be used in - the same manner as column values. - columns : column, Grouper, array, or sequence of the previous - Keys to group by on the pivot table column. If a list is passed, - it can contain any of the other types (except list). If an array is - passed, it must be the same length as the data and will be used in - the same manner as column values. - aggfunc : function, list of functions, dict, default "mean" - If a list of functions is passed, the resulting pivot table will have - hierarchical columns whose top level are the function names - (inferred from the function objects themselves). - If a dict is passed, the key is column to aggregate and the value is - function or list of functions. If ``margin=True``, aggfunc will be - used to calculate the partial aggregates. - fill_value : scalar, default None - Value to replace missing values with (in the resulting pivot table, - after aggregation). 
-        margins : bool, default False
-            If ``margins=True``, special ``All`` columns and rows
-            will be added with partial group aggregates across the categories
-            on the rows and columns.
-        dropna : bool, default True
-            Do not include columns whose entries are all NaN. If True,
+        To remove duplicates on specific column(s), use ``subset``.

-            * rows with an NA value in any column will be omitted before computing
-              margins,
-            * index/column keys containing NA values will be dropped (see ``dropna``
-              parameter in :meth:`DataFrame.groupby`).
-
-        margins_name : str, default 'All'
-            Name of the row / column that will contain the totals
-            when margins is True.
-        observed : bool, default False
-            This only applies if any of the groupers are Categoricals.
-            If True: only show observed values for categorical groupers.
-            If False: show all values for categorical groupers.
-
-        .. versionchanged:: 3.0.0
-
-            The default value is now ``True``.
-
-        sort : bool, default True
-            Specifies if the result should be sorted.
-
-        .. versionadded:: 1.3.0
+        >>> df.drop_duplicates(subset=["brand"])
+             brand style  rating
+        0  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5

-        **kwargs : dict
-            Optional keyword arguments to pass to ``aggfunc``.
-
-        .. versionadded:: 3.0.0
+        To remove duplicates and keep last occurrences, use ``keep``.

+        >>> df.drop_duplicates(subset=["brand", "style"], keep="last")
+             brand style  rating
+        1  Yum Yum   cup     4.0
+        2  Indomie   cup     3.5
+        4  Indomie  pack     5.0
+        """
+        if self.empty:
+            return self.copy(deep=False)

-        Returns
-        -------
-        DataFrame
-            An Excel style pivot table.
-
-        See Also
-        --------
-        DataFrame.pivot : Pivot without aggregation that can handle
-            non-numeric data.
-        DataFrame.melt: Unpivot a DataFrame from wide to long format,
-            optionally leaving identifiers set.
-        wide_to_long : Wide panel to long format. Less flexible but more
-            user-friendly than melt.
-
-        Notes
-        -----
-        Reference :ref:`the user guide ` for more examples.
-
-        Examples
-        --------
-        >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
-        ...                          "bar", "bar", "bar", "bar"],
-        ...                    "B": ["one", "one", "one", "two", "two",
-        ...                          "one", "one", "two", "two"],
-        ...                    "C": ["small", "large", "large", "small",
-        ...                          "small", "large", "small", "small",
-        ...                          "large"],
-        ...                    "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
-        ...                    "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
-        >>> df
-             A    B      C  D  E
-        0  foo  one  small  1  2
-        1  foo  one  large  2  4
-        2  foo  one  large  2  5
-        3  foo  two  small  3  5
-        4  foo  two  small  3  6
-        5  bar  one  large  4  6
-        6  bar  one  small  5  8
-        7  bar  two  small  6  9
-        8  bar  two  large  7  9
-
-        This first example aggregates values by taking the sum.
-
-        >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
-        ...
columns=['C'], aggfunc="sum") - >>> table - C large small - A B - bar one 4.0 5.0 - two 7.0 6.0 - foo one 4.0 1.0 - two NaN 6.0 - - We can also fill missing values using the `fill_value` parameter. - - >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], - ... columns=['C'], aggfunc="sum", fill_value=0) - >>> table - C large small - A B - bar one 4 5 - two 7 6 - foo one 4 1 - two 0 6 - - The next example aggregates by taking the mean across multiple columns. - - >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], - ... aggfunc={'D': "mean", 'E': "mean"}) - >>> table - D E - A C - bar large 5.500000 7.500000 - small 5.500000 8.500000 - foo large 2.000000 4.500000 - small 2.333333 4.333333 - - We can also calculate multiple types of aggregations for any given - value column. - - >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], - ... aggfunc={'D': "mean", - ... 'E': ["min", "max", "mean"]}) - >>> table - D E - mean max mean min - A C - bar large 5.500000 9 7.500000 6 - small 5.500000 9 8.500000 8 - foo large 2.000000 5 4.500000 4 - small 2.333333 6 4.333333 2 - """ + # ---------------------------------------------------------------------- + # Add index and columns + _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"] + _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { + **NDFrame._AXIS_TO_AXIS_NUMBER, + 1: 1, + "columns": 1, + } + _AXIS_LEN = len(_AXIS_ORDERS) + _info_axis_number: Literal[1] = 1 + _info_axis_name: Literal["columns"] = "columns" - @Substitution("") - @Appender(_shared_docs["pivot_table"]) - def pivot_table( - self, - values=None, - index=None, - columns=None, - aggfunc: AggFuncType = "mean", - fill_value=None, - margins: bool = False, - dropna: bool = True, - margins_name: Level = "All", - observed: bool = True, - sort: bool = True, - **kwargs, - ) -> DataFrame: - from pandas.core.reshape.pivot import pivot_table + result = self[-self.duplicated(subset, keep=keep)] + if ignore_index: + result.index = default_index(len(result)) - return pivot_table( - self, - values=values, - index=index, - columns=columns, - aggfunc=aggfunc, - fill_value=fill_value, - margins=margins, - dropna=dropna, - margins_name=margins_name, - observed=observed, - sort=sort, - **kwargs, - ) + if inplace: + self._update_inplace(result) + return None + else: + return result - def stack( + def duplicated( self, - level: IndexLabel = -1, - dropna: bool | lib.NoDefault = lib.no_default, - sort: bool | lib.NoDefault = lib.no_default, - future_stack: bool = True, - ): + subset: Hashable | Sequence[Hashable] | None = None, + keep: DropKeep = "first", + ) -> Series: """ - Stack the prescribed level(s) from columns to index. - - Return a reshaped DataFrame or Series having a multi-level - index with one or more new inner-most levels compared to the current - DataFrame. The new inner-most levels are created by pivoting the - columns of the current dataframe: + Return boolean Series denoting duplicate rows. - - if the columns have a single level, the output is a Series; - - if the columns have multiple levels, the new index level(s) is (are) - taken from the prescribed level(s) and the output is a DataFrame. + Considering certain columns is optional. Parameters ---------- - level : int, str, list, default -1 - Level(s) to stack from the column axis onto the index - axis, defined as one index or label, or a list of indices - or labels. - dropna : bool, default True - Whether to drop rows in the resulting Frame/Series with - missing values. 
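A condensed, runnable sketch of the `pivot_table` aggregation shown in the removed examples above (a trimmed-down frame, for illustration only):

import pandas as pd

df = pd.DataFrame(
    {
        "A": ["foo", "foo", "bar", "bar"],
        "C": ["small", "large", "small", "large"],
        "D": [1, 2, 5, 7],
    }
)

# Rows keyed by A, columns by C, cells aggregated with "sum".
print(df.pivot_table(values="D", index="A", columns="C", aggfunc="sum"))
# C    large  small
# A
# bar      7      5
# foo      2      1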
Stacking a column level onto the index - axis can create combinations of index and column values - that are missing from the original dataframe. See Examples - section. - sort : bool, default True - Whether to sort the levels of the resulting MultiIndex. - future_stack : bool, default True - Whether to use the new implementation that will replace the current - implementation in pandas 3.0. When True, dropna and sort have no impact - on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release - notes ` for more details. + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to mark. + + - ``first`` : Mark duplicates as ``True`` except for the first occurrence. + - ``last`` : Mark duplicates as ``True`` except for the last occurrence. + - False : Mark all duplicates as ``True``. Returns ------- - DataFrame or Series - Stacked dataframe or series. + Series + Boolean series for each duplicated rows. See Also -------- - DataFrame.unstack : Unstack prescribed level(s) from index axis - onto column axis. - DataFrame.pivot : Reshape dataframe from long format to wide - format. - DataFrame.pivot_table : Create a spreadsheet-style pivot table - as a DataFrame. - - Notes - ----- - The function is named by analogy with a collection of books - being reorganized from being side by side on a horizontal - position (the columns of the dataframe) to being stacked - vertically on top of each other (in the index of the - dataframe). - - Reference :ref:`the user guide ` for more examples. + Index.duplicated : Equivalent method on index. + Series.duplicated : Equivalent method on Series. + Series.drop_duplicates : Remove duplicate values from Series. + DataFrame.drop_duplicates : Remove duplicate values from DataFrame. Examples -------- - **Single level columns** + Consider dataset containing ramen rating. - >>> df_single_level_cols = pd.DataFrame( - ... [[0, 1], [2, 3]], index=["cat", "dog"], columns=["weight", "height"] + >>> df = pd.DataFrame( + ... { + ... "brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"], + ... "style": ["cup", "cup", "cup", "pack", "pack"], + ... "rating": [4, 4, 3.5, 15, 5], + ... } ... ) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 - Stacking a dataframe with a single level column axis returns a Series: - - >>> df_single_level_cols - weight height - cat 0 1 - dog 2 3 - >>> df_single_level_cols.stack() - cat weight 0 - height 1 - dog weight 2 - height 3 - dtype: int64 + By default, for each set of duplicated values, the first occurrence + is set on False and all others on True. + + >>> df.duplicated() + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool - **Multi level columns: simple case** + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True. - >>> multicol1 = pd.MultiIndex.from_tuples( - ... [("weight", "kg"), ("weight", "pounds")] - ... ) - >>> df_multi_level_cols1 = pd.DataFrame( - ... [[1, 2], [2, 4]], index=["cat", "dog"], columns=multicol1 - ... 
) + >>> df.duplicated(keep="last") + 0 True + 1 False + 2 False + 3 False + 4 False + dtype: bool - Stacking a dataframe with a multi-level column axis: - - >>> df_multi_level_cols1 - weight - kg pounds - cat 1 2 - dog 2 4 - >>> df_multi_level_cols1.stack() - weight - cat kg 1 - pounds 2 - dog kg 2 - pounds 4 - - **Missing values** - - >>> multicol2 = pd.MultiIndex.from_tuples([("weight", "kg"), ("height", "m")]) - >>> df_multi_level_cols2 = pd.DataFrame( - ... [[1.0, 2.0], [3.0, 4.0]], index=["cat", "dog"], columns=multicol2 - ... ) + By setting ``keep`` on False, all duplicates are True. - It is common to have missing values when stacking a dataframe - with multi-level columns, as the stacked dataframe typically - has more values than the original dataframe. Missing values - are filled with NaNs: - - >>> df_multi_level_cols2 - weight height - kg m - cat 1.0 2.0 - dog 3.0 4.0 - >>> df_multi_level_cols2.stack() - weight height - cat kg 1.0 NaN - m NaN 2.0 - dog kg 3.0 NaN - m NaN 4.0 - - **Prescribing the level(s) to be stacked** - - The first parameter controls which level or levels are stacked: - - >>> df_multi_level_cols2.stack(0) - kg m - cat weight 1.0 NaN - height NaN 2.0 - dog weight 3.0 NaN - height NaN 4.0 - >>> df_multi_level_cols2.stack([0, 1]) - cat weight kg 1.0 - height m 2.0 - dog weight kg 3.0 - height m 4.0 - dtype: float64 + >>> df.duplicated(keep=False) + 0 True + 1 True + 2 False + 3 False + 4 False + dtype: bool + + To find duplicates on specific column(s), use ``subset``. + + >>> df.duplicated(subset=["brand"]) + 0 False + 1 True + 2 False + 3 True + 4 True + dtype: bool """ - if not future_stack: - from pandas.core.reshape.reshape import ( - stack, - stack_multiple, - ) - warnings.warn( - "The previous implementation of stack is deprecated and will be " - "removed in a future version of pandas. See the What's New notes " - "for pandas 2.1.0 for details. Do not specify the future_stack " - "argument to adopt the new implementation and silence this warning.", - FutureWarning, - stacklevel=find_stack_level(), - ) + if self.empty: + return self._constructor_sliced(dtype=bool) - if dropna is lib.no_default: - dropna = True - if sort is lib.no_default: - sort = True + def f(vals) -> tuple[np.ndarray, int]: + labels, shape = algorithms.factorize(vals, size_hint=len(self)) + return labels.astype("i8"), len(shape) - if isinstance(level, (tuple, list)): - result = stack_multiple(self, level, dropna=dropna, sort=sort) - else: - result = stack(self, level, dropna=dropna, sort=sort) - else: - from pandas.core.reshape.reshape import stack_v3 + if subset is None: + # https://github.com/pandas-dev/pandas/issues/28770 + # Incompatible types in assignment (expression has type "Index", variable + # has type "Sequence[Any]") + subset = self.columns # type: ignore[assignment] + elif ( + not np.iterable(subset) + or isinstance(subset, str) + or isinstance(subset, tuple) + and subset in self.columns + ): + subset = (subset,) - if dropna is not lib.no_default: - raise ValueError( - "dropna must be unspecified as the new " - "implementation does not introduce rows of NA values. This " - "argument will be removed in a future version of pandas." - ) + # needed for mypy since can't narrow types using np.iterable + subset = cast(Sequence, subset) - if sort is not lib.no_default: - raise ValueError( - "Cannot specify sort, this argument will be " - "removed in a future version of pandas. Sort the result using " - ".sort_index instead." 
- ) + # Verify all columns in subset exist in the queried dataframe + # Otherwise, raise a KeyError, same as if you try to __getitem__ with a + # key that doesn't exist. + diff = set(subset) - set(self.columns) + if diff: + raise KeyError(Index(diff)) - if ( - isinstance(level, (tuple, list)) - and not all(lev in self.columns.names for lev in level) - and not all(isinstance(lev, int) for lev in level) - ): - raise ValueError( - "level should contain all level names or all level " - "numbers, not a mixture of the two." - ) + if len(subset) == 1 and self.columns.is_unique: + # GH#45236 This is faster than get_group_index below + result = self[subset[0]].duplicated(keep) + result.name = None + else: + vals = (col.values for name, col in self.items() if name in subset) + labels, shape = map(list, zip(*map(f, vals))) - if not isinstance(level, (tuple, list)): - level = [level] - level = [self.columns._get_level_number(lev) for lev in level] - result = stack_v3(self, level) + ids = get_group_index(labels, tuple(shape), sort=False, xnull=False) + result = self._constructor_sliced(duplicated(ids, keep), index=self.index) + return result.__finalize__(self, method="duplicated") + + # ---------------------------------------------------------------------- + # Sorting + # error: Signature of "sort_values" incompatible with supertype "NDFrame" + @overload # type: ignore[override] + def sort_values( + self, + by: IndexLabel, + *, + axis: Axis = ..., + ascending=..., + inplace: Literal[False] = ..., + kind: SortKind = ..., + na_position: NaPosition = ..., + ignore_index: bool = ..., + key: ValueKeyFunc = ..., + ) -> DataFrame: ... - return result.__finalize__(self, method="stack") + @overload + def sort_values( + self, + by: IndexLabel, + *, + axis: Axis = ..., + ascending=..., + inplace: Literal[True], + kind: SortKind = ..., + na_position: str = ..., + ignore_index: bool = ..., + key: ValueKeyFunc = ..., + ) -> None: ... - def explode( + def sort_values( self, - column: IndexLabel, + by: IndexLabel, + *, + axis: Axis = 0, + ascending: bool | list[bool] | tuple[bool, ...] = True, + inplace: bool = False, + kind: SortKind = "quicksort", + na_position: str = "last", ignore_index: bool = False, - ) -> DataFrame: + key: ValueKeyFunc | None = None, + ) -> DataFrame | None: """ - Transform each element of a list-like to a row, replicating index values. + Sort by the values along either axis. Parameters ---------- - column : IndexLabel - Column(s) to explode. - For multiple columns, specify a non-empty list with each element - be str or tuple, and all specified columns their list-like data - on same row of the frame must have matching length. - - .. versionadded:: 1.3.0 - Multi-column explode + by : str or list of str + Name or list of names to sort by. + - if `axis` is 0 or `'index'` then `by` may contain index + levels and/or column labels. + - if `axis` is 1 or `'columns'` then `by` may contain column + levels and/or index labels. + axis : "{0 or 'index', 1 or 'columns'}", default 0 + Axis to be sorted. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of + the by. + inplace : bool, default False + If True, perform operation in-place. + kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See also :func:`numpy.sort` for more + information. `mergesort` and `stable` are the only stable algorithms. 
For + DataFrames, this option is only applied when sorting on a single + column or label. + na_position : {'first', 'last'}, default 'last' + Puts NaNs at the beginning if `first`; `last` puts NaNs at the + end. ignore_index : bool, default False - If True, the resulting index will be labeled 0, 1, …, n - 1. + If True, the resulting axis will be labeled 0, 1, …, n - 1. + key : callable, optional + Apply the key function to the values + before sorting. This is similar to the `key` argument in the + builtin :meth:`sorted` function, with the notable difference that + this `key` function should be *vectorized*. It should expect a + ``Series`` and return a Series with the same shape as the input. + It will be applied to each column in `by` independently. Returns ------- - DataFrame - Exploded lists to rows of the subset columns; - index will be duplicated for these rows. - - Raises - ------ - ValueError : - * If columns of the frame are not unique. - * If specified columns to explode is empty list. - * If specified columns to explode have not matching count of - elements rowwise in the frame. + DataFrame or None + DataFrame with sorted values or None if ``inplace=True``. See Also -------- - DataFrame.unstack : Pivot a level of the (necessarily hierarchical) - index labels. - DataFrame.melt : Unpivot a DataFrame from wide format to long format. - Series.explode : Explode a DataFrame from list-like columns to long format. - - Notes - ----- - This routine will explode list-likes including lists, tuples, sets, - Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged, and empty list-likes will - result in a np.nan for that row. In addition, the ordering of rows in the - output will be non-deterministic when exploding sets. - - Reference :ref:`the user guide ` for more examples. + DataFrame.sort_index : Sort a DataFrame by the index. + Series.sort_values : Similar method for a Series. Examples -------- >>> df = pd.DataFrame( ... { - ... "A": [[0, 1, 2], "foo", [], [3, 4]], - ... "B": 1, - ... "C": [["a", "b", "c"], np.nan, [], ["d", "e"]], + ... "col1": ["A", "A", "B", np.nan, "D", "C"], + ... "col2": [2, 1, 9, 8, 7, 4], + ... "col3": [0, 1, 9, 4, 2, 3], + ... "col4": ["a", "B", "c", "D", "e", "F"], ... } ... ) >>> df - A B C - 0 [0, 1, 2] 1 [a, b, c] - 1 foo 1 NaN - 2 [] 1 [] - 3 [3, 4] 1 [d, e] - - Single-column explode. - - >>> df.explode("A") - A B C - 0 0 1 [a, b, c] - 0 1 1 [a, b, c] - 0 2 1 [a, b, c] - 1 foo 1 NaN - 2 NaN 1 [] - 3 3 1 [d, e] - 3 4 1 [d, e] - - Multi-column explode. - - >>> df.explode(list("AC")) - A B C - 0 0 1 a - 0 1 1 b - 0 2 1 c - 1 foo 1 NaN - 2 NaN 1 NaN - 3 3 1 d - 3 4 1 e - """ - if not self.columns.is_unique: - duplicate_cols = self.columns[self.columns.duplicated()].tolist() - raise ValueError( - f"DataFrame columns must be unique. 
Duplicate columns: {duplicate_cols}" - ) + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 3 NaN 8 4 D + 4 D 7 2 e + 5 C 4 3 F - columns: list[Hashable] - if is_scalar(column) or isinstance(column, tuple): - columns = [column] - elif isinstance(column, list) and all( - is_scalar(c) or isinstance(c, tuple) for c in column - ): - if not column: - raise ValueError("column must be nonempty") - if len(column) > len(set(column)): - raise ValueError("column must be unique") - columns = column - else: - raise ValueError("column must be a scalar, tuple, or list thereof") + **Sort by a single column** - df = self.reset_index(drop=True) - if len(columns) == 1: - result = df[columns[0]].explode() - else: - mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1 - counts0 = self[columns[0]].apply(mylen) - for c in columns[1:]: - if not all(counts0 == self[c].apply(mylen)): - raise ValueError("columns must have matching element counts") - result = DataFrame({c: df[c].explode() for c in columns}) - result = df.drop(columns, axis=1).join(result) - if ignore_index: - result.index = default_index(len(result)) - else: - result.index = self.index.take(result.index) - result = result.reindex(columns=self.columns) + In this case, we are sorting the rows according to values in ``col1``: - return result.__finalize__(self, method="explode") + >>> df.sort_values(by=["col1"]) + col1 col2 col3 col4 + 0 A 2 0 a + 1 A 1 1 B + 2 B 9 9 c + 5 C 4 3 F + 4 D 7 2 e + 3 NaN 8 4 D - def unstack( - self, level: IndexLabel = -1, fill_value=None, sort: bool = True - ) -> DataFrame | Series: - """ - Pivot a level of the (necessarily hierarchical) index labels. + **Sort by multiple columns** - Returns a DataFrame having a new level of column labels whose inner-most level - consists of the pivoted index labels. + You can also provide multiple columns to ``by`` argument, as shown below. + In this example, the rows are first sorted according to ``col1``, and then + the rows that have an identical value in ``col1`` are sorted according + to ``col2``. - If the index is not a MultiIndex, the output will be a Series - (the analogue of stack when the columns are not a MultiIndex). + >>> df.sort_values(by=["col1", "col2"]) + col1 col2 col3 col4 + 1 A 1 1 B + 0 A 2 0 a + 2 B 9 9 c + 5 C 4 3 F + 4 D 7 2 e + 3 NaN 8 4 D - Parameters - ---------- - level : int, str, or list of these, default -1 (last level) - Level(s) of index to unstack, can pass level name. - fill_value : scalar - Replace NaN with this value if the unstack produces missing values. - sort : bool, default True - Sort the level(s) in the resulting MultiIndex columns. + **Sort in a descending order** - Returns - ------- - Series or DataFrame - If index is a MultiIndex: DataFrame with pivoted index labels as new - inner-most level column labels, else Series. + The sort order can be reversed using ``ascending`` argument, as shown below: - See Also - -------- - DataFrame.pivot : Pivot a table based on column values. - DataFrame.stack : Pivot a level of the column labels (inverse operation - from `unstack`). + >>> df.sort_values(by="col1", ascending=False) + col1 col2 col3 col4 + 4 D 7 2 e + 5 C 4 3 F + 2 B 9 9 c + 0 A 2 0 a + 1 A 1 1 B + 3 NaN 8 4 D - Notes - ----- - Reference :ref:`the user guide ` for more examples. + **Placing any** ``NA`` **first** - Examples - -------- - >>> index = pd.MultiIndex.from_tuples( - ... [("one", "a"), ("one", "b"), ("two", "a"), ("two", "b")] - ... 
)
-        >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
-        >>> s
-        one  a   1.0
-             b   2.0
-        two  a   3.0
-             b   4.0
-        dtype: float64
-
-        >>> s.unstack(level=-1)
-             a    b
-        one  1.0  2.0
-        two  3.0  4.0
-
-        >>> s.unstack(level=0)
-           one  two
-        a  1.0   3.0
-        b  2.0   4.0
-
-        >>> df = s.unstack(level=0)
-        >>> df.unstack()
-        one  a  1.0
-             b  2.0
-        two  a  3.0
-             b  4.0
-        dtype: float64
-        """
-        from pandas.core.reshape.reshape import unstack
-
-        result = unstack(self, level, fill_value, sort)
-
-        return result.__finalize__(self, method="unstack")
-
-    def melt(
-        self,
-        id_vars=None,
-        value_vars=None,
-        var_name=None,
-        value_name: Hashable = "value",
-        col_level: Level | None = None,
-        ignore_index: bool = True,
-    ) -> DataFrame:
-        """
-        Unpivot DataFrame from wide to long format, optionally leaving identifiers set.
-
-        This function is useful to massage a DataFrame into a format where one
-        or more columns are identifier variables (`id_vars`), while all other
-        columns, considered measured variables (`value_vars`), are "unpivoted" to
-        the row axis, leaving just two non-identifier columns, 'variable' and
-        'value'.
+        >>> df.sort_values(by="col1", ascending=False, na_position="first")
+          col1  col2  col3 col4
+        3  NaN     8     4    D
+        4    D     7     2    e
+        5    C     4     3    F
+        2    B     9     9    c
+        0    A     2     0    a
+        1    A     1     1    B

-        Parameters
-        ----------
-        id_vars : scalar, tuple, list, or ndarray, optional
-            Column(s) to use as identifier variables.
-        value_vars : scalar, tuple, list, or ndarray, optional
-            Column(s) to unpivot. If not specified, uses all columns that
-            are not set as `id_vars`.
-        var_name : scalar, default None
-            Name to use for the 'variable' column. If None it uses
-            ``frame.columns.name`` or 'variable'.
-        value_name : scalar, default 'value'
-            Name to use for the 'value' column, can't be an existing column label.
-        col_level : scalar, optional
-            If columns are a MultiIndex then use this level to melt.
-        ignore_index : bool, default True
-            If True, original index is ignored. If False, original index is retained.
-            Index labels will be repeated as necessary.
+        **Customized sort order**

-        Returns
-        -------
-        DataFrame
-            Unpivoted DataFrame.
+        The ``key`` argument allows for a further customization of sorting behaviour.
+        For example, you may want
+        to ignore the `letter's case `__
+        when sorting strings:

-        See Also
-        --------
-        melt : Identical method.
-        pivot_table : Create a spreadsheet-style pivot table as a DataFrame.
-        DataFrame.pivot : Return reshaped DataFrame organized
-            by given index / column values.
-        DataFrame.explode : Explode a DataFrame from list-like
-            columns to long format.
+        >>> df.sort_values(by="col4", key=lambda col: col.str.lower())
+          col1  col2  col3 col4
+        0    A     2     0    a
+        1    A     1     1    B
+        2    B     9     9    c
+        3  NaN     8     4    D
+        4    D     7     2    e
+        5    C     4     3    F

-        Notes
-        -----
-        Reference :ref:`the user guide ` for more examples.
+        Another typical example is
+        `natural sorting `__.
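A self-contained sketch of the `na_position` and vectorized `key` behaviors documented above (frames abbreviated from the docstring examples):

import numpy as np
import pandas as pd

df = pd.DataFrame({"col1": ["A", "A", "B", np.nan, "D", "C"]})

# NaNs go last by default; na_position='first' moves them to the front.
print(df.sort_values(by="col1", ascending=False, na_position="first"))

# The key callable receives each sort column as a Series and must stay vectorized.
df2 = pd.DataFrame({"col4": ["a", "B", "c", "D", "e", "F"]})
print(df2.sort_values(by="col4", key=lambda col: col.str.lower()))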
+ This can be done using + ``natsort`` `package `__, + which provides sorted indices according + to their natural order, as shown below: - Examples - -------- >>> df = pd.DataFrame( ... { - ... "A": {0: "a", 1: "b", 2: "c"}, - ... "B": {0: 1, 1: 3, 2: 5}, - ... "C": {0: 2, 1: 4, 2: 6}, + ... "time": ["0hr", "128hr", "72hr", "48hr", "96hr"], + ... "value": [10, 20, 30, 40, 50], ... } ... ) >>> df - A B C - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 + time value + 0 0hr 10 + 1 128hr 20 + 2 72hr 30 + 3 48hr 40 + 4 96hr 50 + >>> from natsort import index_natsorted + >>> index_natsorted(df["time"]) + [0, 3, 2, 4, 1] + >>> df.sort_values( + ... by="time", + ... key=lambda x: np.argsort(index_natsorted(x)), + ... ) + time value + 0 0hr 10 + 3 48hr 40 + 2 72hr 30 + 4 96hr 50 + 1 128hr 20 + """ + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + ascending = validate_ascending(ascending) + if not isinstance(by, list): + by = [by] + # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]"; + # expected "Sized" + if is_sequence(ascending) and ( + len(by) != len(ascending) # type: ignore[arg-type] + ): + # error: Argument 1 to "len" has incompatible type "Union[bool, + # List[bool]]"; expected "Sized" + raise ValueError( + f"Length of ascending ({len(ascending)})" # type: ignore[arg-type] + f" != length of by ({len(by)})" + ) + if len(by) > 1: + keys = [self._get_label_or_level_values(x, axis=axis) for x in by] - >>> df.melt(id_vars=["A"], value_vars=["B"]) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 + # need to rewrap columns in Series to apply key function + if key is not None: + # error: List comprehension has incompatible type List[Series]; + # expected List[ndarray] + keys = [ + Series(k, name=name) # type: ignore[misc] + for (k, name) in zip(keys, by) + ] - >>> df.melt(id_vars=["A"], value_vars=["B", "C"]) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 3 a C 2 - 4 b C 4 - 5 c C 6 + indexer = lexsort_indexer( + keys, orders=ascending, na_position=na_position, key=key + ) + elif len(by): + # len(by) == 1 - The names of 'variable' and 'value' columns can be customized: + k = self._get_label_or_level_values(by[0], axis=axis) - >>> df.melt( - ... id_vars=["A"], - ... value_vars=["B"], - ... var_name="myVarname", - ... value_name="myValname", - ... 
)
-          A myVarname  myValname
-        0 a         B          1
-        1 b         B          3
-        2 c         B          5
-
-        Original index values can be kept around:
-
-        >>> df.melt(id_vars=["A"], value_vars=["B", "C"], ignore_index=False)
-           A variable  value
-        0  a        B      1
-        1  b        B      3
-        2  c        B      5
-        0  a        C      2
-        1  b        C      4
-        2  c        C      6
-
-        If you have multi-index columns:
-
-        >>> df.columns = [list("ABC"), list("DEF")]
-        >>> df
-           A  B  C
-           D  E  F
-        0  a  1  2
-        1  b  3  4
-        2  c  5  6
+        if isinstance(ascending, (tuple, list)):
+            ascending = ascending[0]

-        >>> df.melt(col_level=0, id_vars=["A"], value_vars=["B"])
-           A variable  value
-        0  a        B      1
-        1  b        B      3
-        2  c        B      5
-
-        >>> df.melt(id_vars=[("A", "D")], value_vars=[("B", "E")])
-          (A, D) variable_0 variable_1  value
-        0      a          B          E      1
-        1      b          B          E      3
-        2      c          B          E      5
-        """
-        return melt(
-            self,
-            id_vars=id_vars,
-            value_vars=value_vars,
-            var_name=var_name,
-            value_name=value_name,
-            col_level=col_level,
-            ignore_index=ignore_index,
-        ).__finalize__(self, method="melt")
+            indexer = nargsort(
+                k, kind=kind, ascending=ascending, na_position=na_position, key=key
+            )
+        else:
+            if inplace:
+                return self._update_inplace(self)
+            else:
+                return self.copy(deep=False)

-    # ----------------------------------------------------------------------
-    # Time series-related
+        if is_range_indexer(indexer, len(indexer)):
+            result = self.copy(deep=False)
+            if ignore_index:
+                result.index = default_index(len(result))

-    @doc(
-        Series.diff,
-        klass="DataFrame",
-        extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n    "
-        "Take difference over rows (0) or columns (1).\n",
-        other_klass="Series",
-        examples=dedent(
-            """
-        Difference with previous row
+            if inplace:
+                return self._update_inplace(result)
+            else:
+                return result

-        >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
-        ...                    'b': [1, 1, 2, 3, 5, 8],
-        ...
'c': [1, 4, 9, 16, 25, 36]})
-        >>> df
-           a  b   c
-        0  1  1   1
-        1  2  1   4
-        2  3  2   9
-        3  4  3  16
-        4  5  5  25
-        5  6  8  36
-
-        >>> df.diff()
-             a    b     c
-        0  NaN  NaN   NaN
-        1  1.0  0.0   3.0
-        2  1.0  1.0   5.0
-        3  1.0  1.0   7.0
-        4  1.0  2.0   9.0
-        5  1.0  3.0  11.0
-
-        Difference with previous column
-
-        >>> df.diff(axis=1)
-            a  b   c
-        0 NaN  0   0
-        1 NaN -1   3
-        2 NaN -1   7
-        3 NaN -1  13
-        4 NaN  0  20
-        5 NaN  2  28
-
-        Difference with 3rd previous row
-
-        >>> df.diff(periods=3)
-             a    b     c
-        0  NaN  NaN   NaN
-        1  NaN  NaN   NaN
-        2  NaN  NaN   NaN
-        3  3.0  2.0  15.0
-        4  3.0  4.0  21.0
-        5  3.0  6.0  27.0
-
-        Difference with following row
-
-        >>> df.diff(periods=-1)
-             a    b     c
-        0 -1.0  0.0  -3.0
-        1 -1.0 -1.0  -5.0
-        2 -1.0 -1.0  -7.0
-        3 -1.0 -2.0  -9.0
-        4 -1.0 -3.0 -11.0
-        5  NaN  NaN   NaN
+        new_data = self._mgr.take(
+            indexer, axis=self._get_block_manager_axis(axis), verify=False
+        )

+        if ignore_index:
+            new_data.set_axis(
+                self._get_block_manager_axis(axis), default_index(len(indexer))
+            )

+        result = self._constructor_from_mgr(new_data, axes=new_data.axes)
+        if inplace:
+            return self._update_inplace(result)
+        else:
+            return result.__finalize__(self, method="sort_values")

+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[True],
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> None: ...

-        Overflow in input dtype
+    @overload
+    def sort_index(
+        self,
+        *,
+        axis: Axis = ...,
+        level: IndexLabel = ...,
+        ascending: bool | Sequence[bool] = ...,
+        inplace: Literal[False] = ...,
+        kind: SortKind = ...,
+        na_position: NaPosition = ...,
+        sort_remaining: bool = ...,
+        ignore_index: bool = ...,
+        key: IndexKeyFunc = ...,
+    ) -> DataFrame: ...

-        >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
-        >>> df.diff()
-             a
-        0    NaN
-        1  255.0"""
-        ),
-    )
-    def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
-        if not lib.is_integer(periods):
-            if not (is_float(periods) and periods.is_integer()):
-                raise ValueError("periods must be an integer")
-            periods = int(periods)
-
-        axis = self._get_axis_number(axis)
-        if axis == 1:
-            if periods != 0:
-                # in the periods == 0 case, this is equivalent diff of 0 periods
-                # along axis=0, and the Manager method may be somewhat more
-                # performant, so we dispatch in that case.
-                return self - self.shift(periods, axis=axis)
-            # With periods=0 this is equivalent to a diff with axis=0
-            axis = 0
-
-        new_data = self._mgr.diff(n=periods)
-        res_df = self._constructor_from_mgr(new_data, axes=new_data.axes)
-        return res_df.__finalize__(self, "diff")

    # ----------------------------------------------------------------------
-    # Function application

-    def _gotitem(
+    def sort_index(
        self,
-        key: IndexLabel,
-        ndim: int,
-        subset: DataFrame | Series | None = None,
-    ) -> DataFrame | Series:
+        *,
+        axis: Axis = 0,
+        level: IndexLabel | None = None,
+        ascending: bool | Sequence[bool] = True,
+        inplace: bool = False,
+        kind: SortKind = "quicksort",
+        na_position: NaPosition = "last",
+        sort_remaining: bool = True,
+        ignore_index: bool = False,
+        key: IndexKeyFunc | None = None,
+    ) -> DataFrame | None:
        """
-        Sub-classes to define. Return a sliced object.
+        Sort object by labels (along an axis).
+
+        Returns a new DataFrame sorted by label if `inplace` argument is
+        ``False``, otherwise updates the original DataFrame and returns None.

        Parameters
        ----------
-        key : string / list of selections
-        ndim : {1, 2}
-            requested ndim of result
-        subset : object, default None
-            subset to act on
-        """
-        if subset is None:
-            subset = self
-        elif subset.ndim == 1:  # is Series
-            return subset
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis along which to sort. The value 0 identifies the rows,
+            and 1 identifies the columns.
+        level : int or level name or list of ints or list of level names
+            If not None, sort on values in specified index level(s).
+        ascending : bool or list-like of bools, default True
+            Sort ascending vs. descending. When the index is a MultiIndex the
+            sort direction can be controlled for each level individually.
+        inplace : bool, default False
+            Whether to modify the DataFrame rather than creating a new one.
+        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
+            Choice of sorting algorithm. See also :func:`numpy.sort` for more
+            information. `mergesort` and `stable` are the only stable algorithms. For
+            DataFrames, this option is only applied when sorting on a single
+            column or label.
+        na_position : {'first', 'last'}, default 'last'
+            Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
+            Not implemented for MultiIndex.
+        sort_remaining : bool, default True
+            If True and sorting by level and index is multilevel, sort by other
+            levels too (in order) after sorting by specified level.
+        ignore_index : bool, default False
+            If True, the resulting axis will be labeled 0, 1, …, n - 1.
+        key : callable, optional
+            If not None, apply the key function to the index values
+            before sorting. This is similar to the `key` argument in the
+            builtin :meth:`sorted` function, with the notable difference that
+            this `key` function should be *vectorized*. It should expect an
+            ``Index`` and return an ``Index`` of the same shape. For MultiIndex
+            inputs, the key is applied *per level*.

-        # TODO: _shallow_copy(subset)?
- return subset[key] + Returns + ------- + DataFrame or None + The original DataFrame sorted by the labels or None if ``inplace=True``. - _agg_see_also_doc = dedent( - """ - See Also - -------- - DataFrame.apply : Perform any type of operations. - DataFrame.transform : Perform transformation type operations. - DataFrame.groupby : Perform operations over groups. - DataFrame.resample : Perform operations over resampled bins. - DataFrame.rolling : Perform operations over rolling window. - DataFrame.expanding : Perform operations over expanding window. - core.window.ewm.ExponentialMovingWindow : Perform operation over exponential - weighted window. - """ - ) + See Also + -------- + Series.sort_index : Sort Series by the index. + DataFrame.sort_values : Sort DataFrame by the value. + Series.sort_values : Sort Series by the value. - _agg_examples_doc = dedent( - """ - Examples - -------- - >>> df = pd.DataFrame([[1, 2, 3], - ... [4, 5, 6], - ... [7, 8, 9], - ... [np.nan, np.nan, np.nan]], - ... columns=['A', 'B', 'C']) + Examples + -------- + >>> df = pd.DataFrame( + ... [1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], columns=["A"] + ... ) + >>> df.sort_index() + A + 1 4 + 29 2 + 100 1 + 150 5 + 234 3 - Aggregate these functions over the rows. + By default, it sorts in ascending order, to sort in descending order, + use ``ascending=False`` - >>> df.agg(['sum', 'min']) - A B C - sum 12.0 15.0 18.0 - min 1.0 2.0 3.0 + >>> df.sort_index(ascending=False) + A + 234 3 + 150 5 + 100 1 + 29 2 + 1 4 - Different aggregations per column. + A key function can be specified which is applied to the index before + sorting. For a ``MultiIndex`` this is applied to each level separately. - >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) - A B - sum 12.0 NaN - min 1.0 2.0 - max NaN 8.0 + >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=["A", "b", "C", "d"]) + >>> df.sort_index(key=lambda x: x.str.lower()) + a + A 1 + b 2 + C 3 + d 4 + """ + return super().sort_index( + axis=axis, + level=level, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + sort_remaining=sort_remaining, + ignore_index=ignore_index, + key=key, + ) - Aggregate different functions over the columns and rename the index of the resulting - DataFrame. + def value_counts( + self, + subset: IndexLabel | None = None, + normalize: bool = False, + sort: bool = True, + ascending: bool = False, + dropna: bool = True, + ) -> Series: + """ + Return a Series containing the frequency of each distinct row in the Dataframe. - >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean')) - A B C - x 7.0 NaN NaN - y NaN 2.0 NaN - z NaN NaN 6.0 + Parameters + ---------- + subset : label or list of labels, optional + Columns to use when counting unique combinations. + normalize : bool, default False + Return proportions rather than frequencies. + sort : bool, default True + Sort by frequencies when True. Sort by DataFrame column values when False. + ascending : bool, default False + Sort in ascending order. + dropna : bool, default True + Don't include counts of rows that contain NA values. - Aggregate over the columns. + .. 
versionadded:: 1.3.0

-        >>> df.agg("mean", axis="columns")
-        0    2.0
-        1    5.0
-        2    8.0
-        3    NaN
-        dtype: float64
-        """
-    )
+        Returns
+        -------
+        Series

-    @doc(
-        _shared_docs["aggregate"],
-        klass=_shared_doc_kwargs["klass"],
-        axis=_shared_doc_kwargs["axis"],
-        see_also=_agg_see_also_doc,
-        examples=_agg_examples_doc,
-    )
-    def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
-        from pandas.core.apply import frame_apply
+        See Also
+        --------
+        Series.value_counts: Equivalent method on Series.

-        axis = self._get_axis_number(axis)
+        Notes
+        -----
+        The returned Series will have a MultiIndex with one level per input
+        column but an Index (non-multi) for a single label. By default, rows
+        that contain any NA values are omitted from the result. By default,
+        the resulting Series will be in descending order so that the first
+        element is the most frequently-occurring row.

-        op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
-        result = op.agg()
-        result = reconstruct_and_relabel_result(result, func, **kwargs)
-        return result
+        Examples
+        --------
+        >>> df = pd.DataFrame(
+        ...     {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
+        ...     index=["falcon", "dog", "cat", "ant"],
+        ... )
+        >>> df
+                num_legs  num_wings
+        falcon         2          2
+        dog            4          0
+        cat            4          0
+        ant            6          0

-    agg = aggregate
+        >>> df.value_counts()
+        num_legs  num_wings
+        4         0            2
+        2         2            1
+        6         0            1
+        Name: count, dtype: int64

-    @doc(
-        _shared_docs["transform"],
-        klass=_shared_doc_kwargs["klass"],
-        axis=_shared_doc_kwargs["axis"],
-    )
-    def transform(
-        self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
-    ) -> DataFrame:
-        from pandas.core.apply import frame_apply
+        >>> df.value_counts(sort=False)
+        num_legs  num_wings
+        2         2            1
+        4         0            2
+        6         0            1
+        Name: count, dtype: int64

-        op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
-        result = op.transform()
-        assert isinstance(result, DataFrame)
-        return result
+        >>> df.value_counts(ascending=True)
+        num_legs  num_wings
+        2         2            1
+        6         0            1
+        4         0            2
+        Name: count, dtype: int64

-    def apply(
-        self,
-        func: AggFuncType,
-        axis: Axis = 0,
-        raw: bool = False,
-        result_type: Literal["expand", "reduce", "broadcast"] | None = None,
-        args=(),
-        by_row: Literal[False, "compat"] = "compat",
-        engine: Callable | None | Literal["python", "numba"] = None,
-        engine_kwargs: dict[str, bool] | None = None,
-        **kwargs,
-    ):
-        """
-        Apply a function along an axis of the DataFrame.

-        Objects passed to the function are Series objects whose index is
-        either the DataFrame's index (``axis=0``) or the DataFrame's columns
-        (``axis=1``). By default (``result_type=None``), the final return type
-        is inferred from the return type of the applied function. Otherwise,
-        it depends on the `result_type` argument. The return type of the applied
-        function is inferred based on the first computed result obtained after
-        applying the function to a Series object.

-        Parameters
-        ----------
-        func : function
-            Function to apply to each column or row.
-        axis : {0 or 'index', 1 or 'columns'}, default 0
-            Axis along which the function is applied:

-            * 0 or 'index': apply function to each column.
-            * 1 or 'columns': apply function to each row.

-        raw : bool, default False
-            Determines if row or column is passed as a Series or ndarray object:

-            * ``False`` : passes each row or column as a Series to the
-              function.
-            * ``True`` : the passed function will receive ndarray objects
-              instead.
-              If you are just applying a NumPy reduction function this will
-              achieve much better performance.
+        >>> df = pd.DataFrame(
+        ...     {
+        ...         "first_name": ["John", "Anne", "John", "Beth"],
+        ...         "middle_name": ["Smith", pd.NA, pd.NA, "Louise"],
+        ...     }
+        ... )
+        >>> df
+          first_name middle_name
+        0       John       Smith
+        1       Anne        <NA>
+        2       John        <NA>
+        3       Beth      Louise

-        result_type : {'expand', 'reduce', 'broadcast', None}, default None
-            These only act when ``axis=1`` (columns):
+        >>> df.value_counts()
+        first_name  middle_name
+        Beth        Louise         1
+        John        Smith          1
+        Name: count, dtype: int64

-            * 'expand' : list-like results will be turned into columns.
-            * 'reduce' : returns a Series if possible rather than expanding
-              list-like results. This is the opposite of 'expand'.
-            * 'broadcast' : results will be broadcast to the original shape
-              of the DataFrame, the original index and columns will be
-              retained.
+        >>> df.value_counts(dropna=False)
+        first_name  middle_name
+        Anne        NaN            1
+        Beth        Louise         1
+        John        Smith          1
+                    NaN            1
+        Name: count, dtype: int64

-            The default behaviour (None) depends on the return value of the
-            applied function: list-like results will be returned as a Series
-            of those. However if the apply function returns a Series these
-            are expanded to columns.
-        args : tuple
-            Positional arguments to pass to `func` in addition to the
-            array/series.
-        by_row : False or "compat", default "compat"
-            Only has an effect when ``func`` is a listlike or dictlike of funcs
-            and the func isn't a string.
-            If "compat", will if possible first translate the func into pandas
-            methods (e.g. ``Series().apply(np.sum)`` will be translated to
-            ``Series().sum()``). If that doesn't work, will try call to apply again with
-            ``by_row=True`` and if that fails, will call apply again with
-            ``by_row=False`` (backward compatible).
-            If False, the funcs will be passed the whole Series at once.
+        >>> df.value_counts("first_name")
+        first_name
+        John    2
+        Anne    1
+        Beth    1
+        Name: count, dtype: int64
+        """
+        if subset is None:
+            subset = self.columns.tolist()

-            .. versionadded:: 2.1.0
+        name = "proportion" if normalize else "count"
+        counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size()
+        counts.name = name

-        engine : decorator or {'python', 'numba'}, optional
-            Choose the execution engine to use. If not provided the function
-            will be executed by the regular Python interpreter.
+        if sort:
+            counts = counts.sort_values(ascending=ascending)
+        if normalize:
+            counts /= counts.sum()

-            Other options include JIT compilers such as Numba and Bodo, which in some
-            cases can speed up the execution. To use an executor you can provide
-            the decorators ``numba.jit``, ``numba.njit`` or ``bodo.jit``. You can
-            also provide the decorator with parameters, like ``numba.jit(nogil=True)``.
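+        # Illustrative note (editorial sketch, not part of the original patch):
+        # with normalize=True the counts above are divided by their total, so
+        # for the first_name example df.value_counts("first_name", normalize=True)
+        # would give John 0.5, Anne 0.25, Beth 0.25.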
+ # Force MultiIndex for a list_like subset with a single column + if is_list_like(subset) and len(subset) == 1: # type: ignore[arg-type] + counts.index = MultiIndex.from_arrays( + [counts.index], names=[counts.index.name] + ) - Not all functions can be executed with all execution engines. In general, - JIT compilers will require type stability in the function (no variable - should change data type during the execution). And not all pandas and - NumPy APIs are supported. Check the engine documentation [1]_ and [2]_ - for limitations. + return counts - .. warning:: + def nlargest( + self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" + ) -> DataFrame: + """ + Return the first `n` rows ordered by `columns` in descending order. - String parameters will stop being supported in a future pandas version. + Return the first `n` rows with the largest values in `columns`, in + descending order. The columns that are not specified are returned as + well, but not used for ordering. - .. versionadded:: 2.2.0 + This method is equivalent to + ``df.sort_values(columns, ascending=False).head(n)``, but more + performant. - engine_kwargs : dict - Pass keyword arguments to the engine. - This is currently only used by the numba engine, - see the documentation for the engine argument for more information. + Parameters + ---------- + n : int + Number of rows to return. + columns : label or list of labels + Column label(s) to order by. + keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: - **kwargs - Additional keyword arguments to pass as keywords arguments to - `func`. + - ``first`` : prioritize the first occurrence(s) + - ``last`` : prioritize the last occurrence(s) + - ``all`` : keep all the ties of the smallest item even if it means + selecting more than ``n`` items. Returns ------- - Series or DataFrame - Result of applying ``func`` along the given axis of the - DataFrame. + DataFrame + The first `n` rows ordered by the given columns in descending + order. See Also -------- - DataFrame.map: For elementwise operations. - DataFrame.aggregate: Only perform aggregating type operations. - DataFrame.transform: Only perform transforming type operations. + DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in + ascending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. Notes ----- - Functions that mutate the passed object can produce unexpected - behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` - for more details. - - References - ---------- - .. [1] `Numba documentation - `_ - .. [2] `Bodo documentation - `/ + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. Examples -------- - >>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"]) + >>> df = pd.DataFrame( + ... { + ... "population": [ + ... 59000000, + ... 65000000, + ... 434000, + ... 434000, + ... 434000, + ... 337000, + ... 11300, + ... 11300, + ... 11300, + ... ], + ... "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], + ... "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], + ... }, + ... index=[ + ... "Italy", + ... "France", + ... "Malta", + ... "Maldives", + ... "Brunei", + ... "Iceland", + ... "Nauru", + ... "Tuvalu", + ... "Anguilla", + ... ], + ... 
) >>> df - A B - 0 4 9 - 1 4 9 - 2 4 9 + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI - Using a numpy universal function (in this case the same as - ``np.sqrt(df)``): + In the following example, we will use ``nlargest`` to select the three + rows having the largest values in column "population". - >>> df.apply(np.sqrt) - A B - 0 2.0 3.0 - 1 2.0 3.0 - 2 2.0 3.0 + >>> df.nlargest(3, "population") + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT - Using a reducing function on either axis + When using ``keep='last'``, ties are resolved in reverse order: - >>> df.apply(np.sum, axis=0) - A 12 - B 27 - dtype: int64 + >>> df.nlargest(3, "population", keep="last") + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN - >>> df.apply(np.sum, axis=1) - 0 13 - 1 13 - 2 13 - dtype: int64 + When using ``keep='all'``, the number of element kept can go beyond ``n`` + if there are duplicate values for the smallest element, all the + ties are kept: - Returning a list-like will result in a Series + >>> df.nlargest(3, "population", keep="all") + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN - >>> df.apply(lambda x: [1, 2], axis=1) - 0 [1, 2] - 1 [1, 2] - 2 [1, 2] - dtype: object + However, ``nlargest`` does not keep ``n`` distinct largest elements: - Passing ``result_type='expand'`` will expand list-like results - to columns of a Dataframe + >>> df.nlargest(5, "population", keep="all") + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN - >>> df.apply(lambda x: [1, 2], axis=1, result_type="expand") - 0 1 - 0 1 2 - 1 1 2 - 2 1 2 + To order by the largest values in column "population" and then "GDP", + we can specify multiple columns like in the next example. - Returning a Series inside the function is similar to passing - ``result_type='expand'``. The resulting column names - will be the Series index. + >>> df.nlargest(3, ["population", "GDP"]) + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN + """ + return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - >>> df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1) - foo bar - 0 1 2 - 1 1 2 - 2 1 2 + def nsmallest( + self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" + ) -> DataFrame: + """ + Return the first `n` rows ordered by `columns` in ascending order. - Passing ``result_type='broadcast'`` will ensure the same shape - result, whether list-like or scalar is returned by the function, - and broadcast it along the axis. The resulting column names will - be the originals. + Return the first `n` rows with the smallest values in `columns`, in + ascending order. The columns that are not specified are returned as + well, but not used for ordering. + + This method is equivalent to + ``df.sort_values(columns, ascending=True).head(n)``, but more + performant. + + Parameters + ---------- + n : int + Number of items to retrieve. + columns : list or str + Column name or names to order by. 
+ keep : {'first', 'last', 'all'}, default 'first' + Where there are duplicate values: + + - ``first`` : take the first occurrence. + - ``last`` : take the last occurrence. + - ``all`` : keep all the ties of the largest item even if it means + selecting more than ``n`` items. + + Returns + ------- + DataFrame + DataFrame with the first `n` rows ordered by `columns` in ascending order. + + See Also + -------- + DataFrame.nlargest : Return the first `n` rows ordered by `columns` in + descending order. + DataFrame.sort_values : Sort DataFrame by the values. + DataFrame.head : Return the first `n` rows without re-ordering. + + Examples + -------- + >>> df = pd.DataFrame( + ... { + ... "population": [ + ... 59000000, + ... 65000000, + ... 434000, + ... 434000, + ... 434000, + ... 337000, + ... 337000, + ... 11300, + ... 11300, + ... ], + ... "GDP": [1937894, 2583560, 12011, 4520, 12128, 17036, 182, 38, 311], + ... "alpha-2": ["IT", "FR", "MT", "MV", "BN", "IS", "NR", "TV", "AI"], + ... }, + ... index=[ + ... "Italy", + ... "France", + ... "Malta", + ... "Maldives", + ... "Brunei", + ... "Iceland", + ... "Nauru", + ... "Tuvalu", + ... "Anguilla", + ... ], + ... ) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 337000 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI - >>> df.apply(lambda x: [1, 2], axis=1, result_type="broadcast") - A B - 0 1 2 - 1 1 2 - 2 1 2 + In the following example, we will use ``nsmallest`` to select the + three rows having the smallest values in column "population". - Advanced users can speed up their code by using a Just-in-time (JIT) compiler - with ``apply``. The main JIT compilers available for pandas are Numba and Bodo. - In general, JIT compilation is only possible when the function passed to - ``apply`` has type stability (variables in the function do not change their - type during the execution). + >>> df.nsmallest(3, "population") + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS - >>> import bodo - >>> df.apply(lambda x: x.A + x.B, axis=1, engine=bodo.jit) + When using ``keep='last'``, ties are resolved in reverse order: - Note that JIT compilation is only recommended for functions that take a - significant amount of time to run. Fast functions are unlikely to run faster - with JIT compilation. - """ - if engine is None or isinstance(engine, str): - from pandas.core.apply import frame_apply + >>> df.nsmallest(3, "population", keep="last") + population GDP alpha-2 + Anguilla 11300 311 AI + Tuvalu 11300 38 TV + Nauru 337000 182 NR - if engine is None: - engine = "python" + When using ``keep='all'``, the number of element kept can go beyond ``n`` + if there are duplicate values for the largest element, all the + ties are kept. 
- if engine not in ["python", "numba"]: - raise ValueError(f"Unknown engine '{engine}'") + >>> df.nsmallest(3, "population", keep="all") + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + Nauru 337000 182 NR - op = frame_apply( - self, - func=func, - axis=axis, - raw=raw, - result_type=result_type, - by_row=by_row, - engine=engine, - engine_kwargs=engine_kwargs, - args=args, - kwargs=kwargs, - ) - return op.apply().__finalize__(self, method="apply") - elif hasattr(engine, "__pandas_udf__"): - if result_type is not None: - raise NotImplementedError( - f"{result_type=} only implemented for the default engine" - ) + However, ``nsmallest`` does not keep ``n`` distinct + smallest elements: - agg_axis = self._get_agg_axis(self._get_axis_number(axis)) + >>> df.nsmallest(4, "population", keep="all") + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + Nauru 337000 182 NR - # one axis is empty - if not all(self.shape): - func = cast(Callable, func) - try: - if axis == 0: - r = func(Series([], dtype=np.float64), *args, **kwargs) - else: - r = func( - Series(index=self.columns, dtype=np.float64), - *args, - **kwargs, - ) - except Exception: - pass - else: - if not isinstance(r, Series): - if len(agg_axis): - r = func(Series([], dtype=np.float64), *args, **kwargs) - else: - r = np.nan - - return self._constructor_sliced(r, index=agg_axis) - return self.copy() - - data: DataFrame | np.ndarray = self - if raw: - # This will upcast the whole DataFrame to the same type, - # and likely result in an object 2D array. - # We should probably pass a list of 1D arrays instead, at - # lest for ``axis=0`` - data = self.values - result = engine.__pandas_udf__.apply( - data=data, - func=func, - args=args, - kwargs=kwargs, - decorator=engine, - axis=axis, - ) - if raw: - if result.ndim == 2: - return self._constructor( - result, index=self.index, columns=self.columns - ) - else: - return self._constructor_sliced(result, index=agg_axis) - return result - else: - raise ValueError(f"Unknown engine {engine}") + To order by the smallest values in column "population" and then "GDP", we can + specify multiple columns like in the next example. - def map( - self, func: PythonFuncType, na_action: Literal["ignore"] | None = None, **kwargs - ) -> DataFrame: + >>> df.nsmallest(3, ["population", "GDP"]) + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Nauru 337000 182 NR """ - Apply a function to a Dataframe elementwise. - - .. versionadded:: 2.1.0 - - DataFrame.applymap was deprecated and renamed to DataFrame.map. - - This method applies a function that accepts and returns a scalar - to every element of a DataFrame. - - Parameters - ---------- - func : callable - Python function, returns a single value from a single value. - na_action : {None, 'ignore'}, default None - If 'ignore', propagate NaN values, without passing them to func. - **kwargs - Additional keyword arguments to pass as keywords arguments to - `func`. - - Returns - ------- - DataFrame - Transformed DataFrame. - - See Also - -------- - DataFrame.apply : Apply a function along input axis of DataFrame. - DataFrame.replace: Replace values given in `to_replace` with `value`. - Series.map : Apply a function elementwise on a Series. 
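+        # Editorial note (sketch): as with ``nlargest`` above, this delegates to
+        # the shared selectn.SelectNFrame helper, which selects the n extreme
+        # values without fully sorting the frame -- the reason the docstring
+        # describes it as more performant than sort_values().head(n).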
+ return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest() + @doc( + Series.swaplevel, + klass=_shared_doc_kwargs["klass"], + extra_params=dedent( + """axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to swap levels on. 0 or 'index' for row-wise, 1 or + 'columns' for column-wise.""" + ), + examples=dedent( + """\ Examples -------- - >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) + >>> df = pd.DataFrame( + ... {"Grade": ["A", "B", "A", "C"]}, + ... index=[ + ... ["Final exam", "Final exam", "Coursework", "Coursework"], + ... ["History", "Geography", "History", "Geography"], + ... ["January", "February", "March", "April"], + ... ], + ... ) >>> df - 0 1 - 0 1.000 2.120 - 1 3.356 4.567 - - >>> df.map(lambda x: len(str(x))) - 0 1 - 0 3 4 - 1 5 5 + Grade + Final exam History January A + Geography February B + Coursework History March A + Geography April C - Like Series.map, NA values can be ignored: + In the following example, we will swap the levels of the indices. + Here, we will swap the levels column-wise, but levels can be swapped row-wise + in a similar manner. Note that column-wise is the default behaviour. + By not supplying any arguments for i and j, we swap the last and second to + last indices. - >>> df_copy = df.copy() - >>> df_copy.iloc[0, 0] = pd.NA - >>> df_copy.map(lambda x: len(str(x)), na_action="ignore") - 0 1 - 0 NaN 4 - 1 5.0 5 + >>> df.swaplevel() + Grade + Final exam January History A + February Geography B + Coursework March History A + April Geography C - It is also possible to use `map` with functions that are not - `lambda` functions: + By supplying one argument, we can choose which index to swap the last + index with. We can for example swap the first index with the last one as + follows. - >>> df.map(round, ndigits=1) - 0 1 - 0 1.0 2.1 - 1 3.4 4.6 + >>> df.swaplevel(0) + Grade + January History Final exam A + February Geography Final exam B + March History Coursework A + April Geography Coursework C - Note that a vectorized version of `func` often exists, which will - be much faster. You could square each number elementwise. + We can also define explicitly which indices we want to swap by supplying values + for both i and j. Here, we for example swap the first and second indices. - >>> df.map(lambda x: x**2) - 0 1 - 0 1.000000 4.494400 - 1 11.262736 20.857489 + >>> df.swaplevel(0, 1) + Grade + History Final exam January A + Geography Final exam February B + History Coursework March A + Geography Coursework April C""" + ), + ) + def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: + result = self.copy(deep=False) - But it's better to avoid map in that case. + axis = self._get_axis_number(axis) - >>> df**2 - 0 1 - 0 1.000000 4.494400 - 1 11.262736 20.857489 - """ - if na_action not in {"ignore", None}: - raise ValueError(f"na_action must be 'ignore' or None. 
Got {na_action!r}")

-        if self.empty:
-            return self.copy()
+        if not isinstance(result._get_axis(axis), MultiIndex):  # pragma: no cover
+            raise TypeError("Can only swap levels on a hierarchical axis.")

-        func = functools.partial(func, **kwargs)
+        if axis == 0:
+            assert isinstance(result.index, MultiIndex)
+            result.index = result.index.swaplevel(i, j)
+        else:
+            assert isinstance(result.columns, MultiIndex)
+            result.columns = result.columns.swaplevel(i, j)
+        return result

-        def infer(x):
-            return x._map_values(func, na_action=na_action)

-        return self.apply(infer).__finalize__(self, "map")

     # ----------------------------------------------------------------------
-    # Merging / joining methods
-
-    def _append(
-        self,
-        other,
-        ignore_index: bool = False,
-        verify_integrity: bool = False,
-        sort: bool = False,
-    ) -> DataFrame:
-        if isinstance(other, (Series, dict)):
-            if isinstance(other, dict):
-                if not ignore_index:
-                    raise TypeError("Can only append a dict if ignore_index=True")
-                other = Series(other)
-            if other.name is None and not ignore_index:
-                raise TypeError(
-                    "Can only append a Series if ignore_index=True "
-                    "or if the Series has a name"
-                )

-            index = Index(
-                [other.name],
-                name=(
-                    self.index.names
-                    if isinstance(self.index, MultiIndex)
-                    else self.index.name
-                ),
-            )
-            row_df = other.to_frame().T
-            # infer_objects is needed for
-            # test_append_empty_frame_to_series_with_dateutil_tz
-            other = row_df.infer_objects().rename_axis(index.names)
-        elif isinstance(other, list):
-            if not other:
-                pass
-            elif not isinstance(other[0], DataFrame):
-                other = DataFrame(other)
-                if self.index.name is not None and not ignore_index:
-                    other.index.name = self.index.name

+    def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
+        """
+        Rearrange index or column levels using input ``order``.

-        from pandas.core.reshape.concat import concat

-        if isinstance(other, (list, tuple)):
-            to_concat = [self, *other]
-        else:
-            to_concat = [self, other]

-        result = concat(
-            to_concat,
-            ignore_index=ignore_index,
-            verify_integrity=verify_integrity,
-            sort=sort,
-        )
-        return result.__finalize__(self, method="append")

-    def join(
-        self,
-        other: DataFrame | Series | Iterable[DataFrame | Series],
-        on: IndexLabel | None = None,
-        how: MergeHow = "left",
-        lsuffix: str = "",
-        rsuffix: str = "",
-        sort: bool = False,
-        validate: JoinValidate | None = None,
-    ) -> DataFrame:
-        """
-        Join columns of another DataFrame.

-        Join columns with `other` DataFrame either on index or on a key
-        column. Efficiently join multiple DataFrame objects by index at once by
-        passing a list.

         Parameters
         ----------
-        other : DataFrame, Series, or a list containing any combination of them
-            Index should be similar to one of the columns in this one. If a
-            Series is passed, its name attribute must be set, and that will be
-            used as the column name in the resulting joined DataFrame.
-        on : str, list of str, or array-like, optional
-            Column or index level name(s) in the caller to join on the index
-            in `other`, otherwise joins index-on-index. If multiple
-            values given, the `other` DataFrame must have a MultiIndex. Can
-            pass an array as the join key if it is not already contained in
-            the calling DataFrame. Like an Excel VLOOKUP operation.
-        how : {'left', 'right', 'outer', 'inner', 'cross', 'left_anti', 'right_anti'},
-            default 'left'
-            How to handle the operation of the two objects.
-
-            * left: use calling frame's index (or column if on is specified)
-            * right: use `other`'s index.
-            * outer: form union of calling frame's index (or column if on is
-              specified) with `other`'s index, and sort it lexicographically.
-            * inner: form intersection of calling frame's index (or column if
-              on is specified) with `other`'s index, preserving the order
-              of the calling's one.
-            * cross: creates the cartesian product from both frames, preserves the order
-              of the left keys.
-            * left_anti: use set difference of calling frame's index and `other`'s
-              index.
-            * right_anti: use set difference of `other`'s index and calling frame's
-              index.
-        lsuffix : str, default ''
-            Suffix to use from left frame's overlapping columns.
-        rsuffix : str, default ''
-            Suffix to use from right frame's overlapping columns.
-        sort : bool, default False
-            Order result DataFrame lexicographically by the join key. If False,
-            the order of the join key depends on the join type (how keyword).
-        validate : str, optional
-            If specified, checks if join is of specified type.
-
-            * "one_to_one" or "1:1": check if join keys are unique in both left
-              and right datasets.
-            * "one_to_many" or "1:m": check if join keys are unique in left dataset.
-            * "many_to_one" or "m:1": check if join keys are unique in right dataset.
-            * "many_to_many" or "m:m": allowed, but does not result in checks.
-
-            .. versionadded:: 1.5.0
+        order : list of int or list of str
+            List representing new level order. Reference level by number
+            (position) or by key (label).
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Where to reorder levels.

         Returns
         -------
         DataFrame
-            A dataframe containing columns from both the caller and `other`.
+            DataFrame with indices or columns with reordered levels.

-        See Also
-        --------
-        DataFrame.merge : For column(s)-on-column(s) operations.

+        Examples
+        --------
+        >>> data = {
+        ...     "class": ["Mammals", "Mammals", "Reptiles"],
+        ...     "diet": ["Omnivore", "Carnivore", "Carnivore"],
+        ...     "species": ["Humans", "Dogs", "Snakes"],
+        ... }
+        >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
+        >>> df = df.set_index(["class", "diet"])
+        >>> df
+                                          species
+        class      diet
+        Mammals    Omnivore                Humans
+                   Carnivore                 Dogs
+        Reptiles   Carnivore               Snakes

-        Notes
-        -----
-        Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
-        passing a list of `DataFrame` objects.

+        Let's reorder the levels of the index:

-        Examples
-        --------
-        >>> df = pd.DataFrame(
-        ...     {
-        ...         "key": ["K0", "K1", "K2", "K3", "K4", "K5"],
-        ...         "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
-        ...     }
-        ... )

+        >>> df.reorder_levels(["diet", "class"])
+                                          species
+        diet      class
+        Omnivore  Mammals                  Humans
+        Carnivore Mammals                    Dogs
+                  Reptiles                 Snakes
+        """
+        axis = self._get_axis_number(axis)
+        if not isinstance(self._get_axis(axis), MultiIndex):  # pragma: no cover
+            raise TypeError("Can only reorder levels on a hierarchical axis.")

-        >>> df
-          key   A
-        0  K0  A0
-        1  K1  A1
-        2  K2  A2
-        3  K3  A3
-        4  K4  A4
-        5  K5  A5

+        result = self.copy(deep=False)

-        >>> other = pd.DataFrame({"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]})

+        if axis == 0:
+            assert isinstance(result.index, MultiIndex)
+            result.index = result.index.reorder_levels(order)
+        else:
+            assert isinstance(result.columns, MultiIndex)
+            result.columns = result.columns.reorder_levels(order)
+        return result

-        >>> other
-          key   B
-        0  K0  B0
-        1  K1  B1
-        2  K2  B2

+    # ----------------------------------------------------------------------
+    # Arithmetic Methods

-        Join DataFrames using their indexes.

-        >>> df.join(other, lsuffix="_caller", rsuffix="_other")
-          key_caller   A key_other    B
-        0         K0  A0        K0   B0
-        1         K1  A1        K1   B1
-        2         K2  A2        K2   B2
-        3         K3  A3       NaN  NaN
-        4         K4  A4       NaN  NaN
-        5         K5  A5       NaN  NaN

-        If we want to join using the key columns, we need to set key to be
-        the index in both `df` and `other`. The joined DataFrame will have
-        key as its index.

-        >>> df.set_index("key").join(other.set_index("key"))
-              A    B
-        key
-        K0   A0   B0
-        K1   A1   B1
-        K2   A2   B2
-        K3   A3  NaN
-        K4   A4  NaN
-        K5   A5  NaN

-        Another option to join using the key columns is to use the `on`
-        parameter. DataFrame.join always uses `other`'s index but we can use
-        any column in `df`. This method preserves the original DataFrame's
-        index in the result.

+    def _cmp_method(self, other, op):
+        axis: Literal[1] = 1  # only relevant for Series other case
+
+        self, other = self._align_for_op(other, axis, flex=False, level=None)

-        >>> df.join(other.set_index("key"), on="key")
-          key   A    B
-        0  K0  A0   B0
-        1  K1  A1   B1
-        2  K2  A2   B2
-        3  K3  A3  NaN
-        4  K4  A4  NaN
-        5  K5  A5  NaN

+        with np.errstate(all="ignore"):
+            new_data = self._dispatch_frame_op(other, op, axis=axis)
+        return self._construct_result(new_data)

-        Using non-unique key values shows how they are matched.

-        >>> df = pd.DataFrame(
-        ...     {
-        ...         "key": ["K0", "K1", "K1", "K3", "K0", "K1"],
-        ...         "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
-        ...     }
-        ... )

-        >>> df
-          key   A
-        0  K0  A0
-        1  K1  A1
-        2  K1  A2
-        3  K3  A3
-        4  K0  A4
-        5  K1  A5

-        >>> df.join(other.set_index("key"), on="key", validate="m:1")
-          key   A    B
-        0  K0  A0   B0
-        1  K1  A1   B1
-        2  K1  A2   B1
-        3  K3  A3  NaN
-        4  K0  A4   B0
-        5  K1  A5   B1

+    def _dispatch_frame_op(
+        self, right, func: Callable, axis: AxisInt | None = None
+    ) -> DataFrame:
+        """
+        Evaluate the frame operation func(left, right) by evaluating
+        column-by-column, dispatching to the Series implementation.
+
+        Parameters
+        ----------
+        right : scalar, Series, or DataFrame
+        func : arithmetic or comparison operator
+        axis : {None, 0, 1}
+
+        Returns
+        -------
+        DataFrame
+
+        Notes
+        -----
+        Caller is responsible for setting np.errstate where relevant.
         """
-        from pandas.core.reshape.concat import concat
-        from pandas.core.reshape.merge import merge
+        # Get the appropriate array-op to apply to each column/block's values.
+        array_op = ops.get_array_op(func)

-        if isinstance(other, Series):
-            if other.name is None:
-                raise ValueError("Other Series must have a name")
-            other = DataFrame({other.name: other})

+        right = lib.item_from_zerodim(right)
+        if not is_list_like(right):
+            # i.e. scalar, faster than checking np.ndim(right) == 0
+            bm = self._mgr.apply(array_op, right=right)
+            return self._constructor_from_mgr(bm, axes=bm.axes)

-        if isinstance(other, DataFrame):
-            if how == "cross":
-                return merge(
-                    self,
-                    other,
-                    how=how,
-                    on=on,
-                    suffixes=(lsuffix, rsuffix),
-                    sort=sort,
-                    validate=validate,
-                )
-            return merge(
-                self,
-                other,
-                left_on=on,
-                how=how,
-                left_index=on is None,
-                right_index=True,
-                suffixes=(lsuffix, rsuffix),
-                sort=sort,
-                validate=validate,
-            )

+        elif isinstance(right, DataFrame):
+            assert self.index.equals(right.index)
+            assert self.columns.equals(right.columns)
+            # TODO: The previous assertion `assert right._indexed_same(self)`
+            #  fails in cases with empty columns reached via
+            #  _frame_arith_method_with_reindex
+
+            # TODO operate_blockwise expects a manager of the same type
+            bm = self._mgr.operate_blockwise(
+                right._mgr,
+                array_op,
+            )
+            return self._constructor_from_mgr(bm, axes=bm.axes)

-        else:
-            if on is not None:
-                raise ValueError(
-                    "Joining multiple DataFrames only supported for joining on index"
-                )

+        elif isinstance(right, Series) and axis == 1:
+            # axis=1 means we want to operate row-by-row
+            assert right.index.equals(self.columns)
+
+            right = right._values
+            # maybe_align_as_frame ensures we do not have an ndarray here
+            assert not isinstance(right, np.ndarray)

-            if rsuffix or lsuffix:
-                raise ValueError(
-                    "Suffixes not supported when joining multiple DataFrames"
-                )

+            arrays = [
+                array_op(_left, _right)
+                for _left, _right in zip(self._iter_column_arrays(), right)
+            ]
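+            # Illustration (editorial sketch): with axis=1 the Series is aligned
+            # to the columns, so df + pd.Series([10, 20], index=df.columns)
+            # pairs 10 with the first column and 20 with the second, applying
+            # array_op column by column.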
"Iterable[DataFrame]", but in reality both types are - # "Iterable[Union[DataFrame, Series]]" due to the if statements - frames = [cast("DataFrame | Series", self)] + list(other) + elif isinstance(right, Series): + assert right.index.equals(self.index) + right = right._values - can_concat = all(df.index.is_unique for df in frames) + arrays = [array_op(left, right) for left in self._iter_column_arrays()] - # join indexes only using concat - if can_concat: - if how == "left": - res = concat( - frames, axis=1, join="outer", verify_integrity=True, sort=sort - ) - return res.reindex(self.index) - else: - return concat( - frames, axis=1, join=how, verify_integrity=True, sort=sort - ) + else: + raise NotImplementedError(right) - joined = frames[0] + return type(self)._from_arrays( + arrays, self.columns, self.index, verify_integrity=False + ) - for frame in frames[1:]: - joined = merge( - joined, - frame, - how=how, - left_index=True, - right_index=True, - validate=validate, - ) + def _combine_frame(self, other: DataFrame, func, fill_value=None): + # at this point we have `self._indexed_same(other)` - return joined + if fill_value is None: + # since _arith_op may be called in a loop, avoid function call + # overhead if possible by doing this check once + _arith_op = func - @Substitution("") - @Appender(_merge_doc, indents=2) - def merge( - self, - right: DataFrame | Series, - how: MergeHow = "inner", - on: IndexLabel | AnyArrayLike | None = None, - left_on: IndexLabel | AnyArrayLike | None = None, - right_on: IndexLabel | AnyArrayLike | None = None, - left_index: bool = False, - right_index: bool = False, - sort: bool = False, - suffixes: Suffixes = ("_x", "_y"), - copy: bool | lib.NoDefault = lib.no_default, - indicator: str | bool = False, - validate: MergeValidate | None = None, - ) -> DataFrame: - self._check_copy_deprecation(copy) + else: - from pandas.core.reshape.merge import merge + idx_diff = result_index.difference(correl.index) - return merge( - self, - right, - how=how, - on=on, - left_on=left_on, - right_on=right_on, - left_index=left_index, - right_index=right_index, - sort=sort, - suffixes=suffixes, - indicator=indicator, - validate=validate, - ) + if len(idx_diff) > 0: + correl = correl._append( + Series([np.nan] * len(idx_diff), index=idx_diff) + ) - def round( - self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs - ) -> DataFrame: - """ - Round numeric columns in a DataFrame to a variable number of decimal places. + return correl - Parameters - ---------- - decimals : int, dict, Series - Number of decimal places to round each column to. If an int is - given, round each column to the same number of places. - Otherwise dict and Series round to variable numbers of places. - Column names should be in the keys if `decimals` is a - dict-like, or in the index if `decimals` is a Series. Any - columns not included in `decimals` will be left as is. Elements - of `decimals` which are not columns of the input will be - ignored. - *args - Additional keywords have no effect but might be accepted for - compatibility with numpy. - **kwargs - Additional keywords have no effect but might be accepted for - compatibility with numpy. 
+ # ---------------------------------------------------------------------- + # ndarray-like stats methods + + # ---------------------------------------------------------------------- + # Add index and columns + _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"] + _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { + **NDFrame._AXIS_TO_AXIS_NUMBER, + 1: 1, + "columns": 1, + } + _AXIS_LEN = len(_AXIS_ORDERS) + _info_axis_number: Literal[1] = 1 + _info_axis_name: Literal["columns"] = "columns" Returns ------- DataFrame - A DataFrame with the affected columns rounded to the specified - number of decimal places. - - See Also - -------- - numpy.around : Round a numpy array to the given number of decimals. - Series.round : Round a Series to the given number of decimals. - - Notes - ----- - For values exactly halfway between rounded decimal values, pandas rounds - to the nearest even value (e.g. -0.5 and 0.5 round to 0.0, 1.5 and 2.5 - round to 2.0, etc.). - - Examples - -------- - >>> df = pd.DataFrame( - ... [(0.21, 0.32), (0.01, 0.67), (0.66, 0.03), (0.21, 0.18)], - ... columns=["dogs", "cats"], - ... ) - >>> df - dogs cats - 0 0.21 0.32 - 1 0.01 0.67 - 2 0.66 0.03 - 3 0.21 0.18 + """ + left = self - By providing an integer each column is rounded to the same number - of decimal places + # GH#31623, only operate on shared columns + cols, lcol_indexer, rcol_indexer = left.columns.join( + right.columns, how="inner", return_indexers=True + ) - >>> df.round(1) - dogs cats - 0 0.2 0.3 - 1 0.0 0.7 - 2 0.7 0.0 - 3 0.2 0.2 + new_left = left if lcol_indexer is None else left.iloc[:, lcol_indexer] + new_right = right if rcol_indexer is None else right.iloc[:, rcol_indexer] + result = op(new_left, new_right) - With a dict, the number of places for specific columns can be - specified with the column names as key and the number of decimal - places as value + # Do the join on the columns instead of using left._align_for_op + # to avoid constructing two potentially large/sparse DataFrames + join_columns = left.columns.join(right.columns, how="outer") - >>> df.round({"dogs": 1, "cats": 0}) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 + if result.columns.has_duplicates: + # Avoid reindexing with a duplicate axis. + # https://github.com/pandas-dev/pandas/issues/35194 + indexer, _ = result.columns.get_indexer_non_unique(join_columns) + indexer = algorithms.unique1d(indexer) + result = result._reindex_with_indexers( + {1: [join_columns, indexer]}, allow_dups=True + ) + else: + result = result.reindex(join_columns, axis=1) - Using a Series, the number of places for specific columns can be - specified with the column names as index and the number of - decimal places as value + return result - >>> decimals = pd.Series([0, 1], index=["cats", "dogs"]) - >>> df.round(decimals) - dogs cats - 0 0.2 0.0 - 1 0.0 1.0 - 2 0.7 0.0 - 3 0.2 0.0 + def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool: """ - from pandas.core.reshape.concat import concat - - def _dict_round(df: DataFrame, decimals) -> Iterator[Series]: - for col, vals in df.items(): - try: - yield _series_round(vals, decimals[col]) - except KeyError: - yield vals + Check if this is an operation between DataFrames that will need to reindex. 
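+
+        For example (editorial sketch, not part of the original patch):
+
+        >>> df1 = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
+        >>> df2 = pd.DataFrame({"B": [5, 6], "C": [7, 8]})
+        >>> df1 + df2  # only "B" is shared, so the op reindexes to A, B, C
+            A   B   C
+        0 NaN   8 NaN
+        1 NaN  10 NaN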
+ """ + if op is operator.pow or op is roperator.rpow: + # GH#32685 pow has special semantics for operating with null values + return False - def _series_round(ser: Series, decimals: int) -> Series: - if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): - return ser.round(decimals) - return ser + if not isinstance(right, DataFrame): + return False - nv.validate_round(args, kwargs) + if fill_value is None and level is None and axis == 1: + # TODO: any other cases we should handle here? - if isinstance(decimals, (dict, Series)): - if isinstance(decimals, Series) and not decimals.index.is_unique: - raise ValueError("Index of decimals must be unique") - if is_dict_like(decimals) and not all( - is_integer(value) for _, value in decimals.items() + # Intersection is always unique so we have to check the unique columns + left_uniques = self.columns.unique() + right_uniques = right.columns.unique() + cols = left_uniques.intersection(right_uniques) + if len(cols) and not ( + len(cols) == len(left_uniques) and len(cols) == len(right_uniques) ): - raise TypeError("Values in decimals must be integers") - new_cols = list(_dict_round(self, decimals)) - elif is_integer(decimals): - # Dispatch to Block.round - # Argument "decimals" to "round" of "BaseBlockManager" has incompatible - # type "Union[int, integer[Any]]"; expected "int" - new_mgr = self._mgr.round( - decimals=decimals, # type: ignore[arg-type] - ) - return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( - self, method="round" - ) - else: - raise TypeError("decimals must be an integer, a dict-like or a Series") - - if new_cols is not None and len(new_cols) > 0: - return self._constructor( - concat(new_cols, axis=1), index=self.index, columns=self.columns - ).__finalize__(self, method="round") - else: - return self.copy(deep=False) + # TODO: is there a shortcut available when len(cols) == 0? + return True - # ---------------------------------------------------------------------- - # Statistical methods, etc. + return False - def corr( + def _align_for_op( self, - method: CorrelationMethod = "pearson", - min_periods: int = 1, - numeric_only: bool = False, - ) -> DataFrame: + other, + axis: AxisInt, + flex: bool | None = False, + level: Level | None = None, + ): """ - Compute pairwise correlation of columns, excluding NA/null values. + Convert rhs to meet lhs dims if input is list, tuple or np.ndarray. Parameters ---------- - method : {'pearson', 'kendall', 'spearman'} or callable - Method of correlation: + left : DataFrame + right : Any + axis : int + flex : bool or None, default False + Whether this is a flex op, in which case we reindex. + None indicates not to check for alignment. + level : int or level name, default None - * pearson : standard correlation coefficient - * kendall : Kendall Tau correlation coefficient - * spearman : Spearman rank correlation - * callable: callable with input two 1d ndarrays - and returning a float. Note that the returned matrix from corr - will have 1 along the diagonals and will be symmetric - regardless of the callable's behavior. - min_periods : int, optional - Minimum number of observations required per pair of columns - to have a valid result. Currently only available for Pearson - and Spearman correlation. - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. + Returns + ------- + left : DataFrame + right : Any + """ + left, right = self, other - .. 
versionadded:: 1.5.0 + def to_series(right): + msg = ( + "Unable to coerce to Series, " + "length must be {req_len}: given {given_len}" + ) - .. versionchanged:: 2.0.0 - The default value of ``numeric_only`` is now ``False``. + # pass dtype to avoid doing inference, which would break consistency + # with Index/Series ops + dtype = None + if getattr(right, "dtype", None) == object: + # can't pass right.dtype unconditionally as that would break on e.g. + # datetime64[h] ndarray + dtype = object - Returns - ------- - DataFrame - Correlation matrix. + if axis == 0: + if len(left.index) != len(right): + raise ValueError( + msg.format(req_len=len(left.index), given_len=len(right)) + ) + right = left._constructor_sliced(right, index=left.index, dtype=dtype) + else: + if len(left.columns) != len(right): + raise ValueError( + msg.format(req_len=len(left.columns), given_len=len(right)) + ) + right = left._constructor_sliced(right, index=left.columns, dtype=dtype) + return right - See Also - -------- - DataFrame.corrwith : Compute pairwise correlation with another - DataFrame or Series. - Series.corr : Compute the correlation between two Series. + if isinstance(right, np.ndarray): + if right.ndim == 1: + right = to_series(right) - Notes - ----- - Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. + elif right.ndim == 2: + # We need to pass dtype=right.dtype to retain object dtype + # otherwise we lose consistency with Index and array ops + dtype = None + if right.dtype == object: + # can't pass right.dtype unconditionally as that would break on e.g. + # datetime64[h] ndarray + dtype = object - * `Pearson correlation coefficient `_ - * `Kendall rank correlation coefficient `_ - * `Spearman's rank correlation coefficient `_ + if right.shape == left.shape: + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) - Examples - -------- - >>> def histogram_intersection(a, b): - ... v = np.minimum(a, b).sum().round(decimals=1) - ... return v - >>> df = pd.DataFrame( - ... [(0.2, 0.3), (0.0, 0.6), (0.6, 0.0), (0.2, 0.1)], - ... columns=["dogs", "cats"], - ... ) - >>> df.corr(method=histogram_intersection) - dogs cats - dogs 1.0 0.3 - cats 0.3 1.0 + elif right.shape[0] == left.shape[0] and right.shape[1] == 1: + # Broadcast across columns + right = np.broadcast_to(right, left.shape) + right = left._constructor( + right, index=left.index, columns=left.columns, dtype=dtype + ) - >>> df = pd.DataFrame( - ... [(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], columns=["dogs", "cats"] - ... 
)
-        >>> df.corr(min_periods=3)
-              dogs  cats
-        dogs   1.0   NaN
-        cats   NaN   1.0
-        """  # noqa: E501
-        data = self._get_numeric_data() if numeric_only else self
-        cols = data.columns
-        idx = cols.copy()
-        mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

+            elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
+                # Broadcast along rows
+                right = to_series(right[0, :])

-        if method == "pearson":
-            correl = libalgos.nancorr(mat, minp=min_periods)
-        elif method == "spearman":
-            correl = libalgos.nancorr_spearman(mat, minp=min_periods)
-        elif method == "kendall" or callable(method):
-            if min_periods is None:
-                min_periods = 1
-            mat = mat.T
-            corrf = nanops.get_corr_func(method)
-            K = len(cols)
-            correl = np.empty((K, K), dtype=float)
-            mask = np.isfinite(mat)
-            for i, ac in enumerate(mat):
-                for j, bc in enumerate(mat):
-                    if i > j:
-                        continue

+            else:
+                raise ValueError(
+                    "Unable to coerce to DataFrame, shape "
+                    f"must be {left.shape}: given {right.shape}"
+                )

-                    valid = mask[i] & mask[j]
-                    if valid.sum() < min_periods:
-                        c = np.nan
-                    elif i == j:
-                        c = 1.0
-                    elif not valid.all():
-                        c = corrf(ac[valid], bc[valid])
-                    else:
-                        c = corrf(ac, bc)
-                    correl[i, j] = c
-                    correl[j, i] = c
-        else:
-            raise ValueError(
-                "method must be either 'pearson', "
-                "'spearman', 'kendall', or a callable, "
-                f"'{method}' was supplied"
-            )

+        elif right.ndim > 2:
+            raise ValueError(
+                "Unable to coerce to Series/DataFrame, "
+                f"dimension must be <= 2: {right.shape}"
+            )

+        elif is_list_like(right) and not isinstance(right, (Series, DataFrame)):
+            # GH#36702. Raise when attempting arithmetic with a list of more
+            #  than one element.
+            if any(is_list_like(el) for el in right):
+                raise ValueError(
+                    f"Unable to coerce list of {type(right[0])} to Series/DataFrame"
+                )
+            # GH#17901
+            right = to_series(right)

+        if flex is not None and isinstance(right, DataFrame):
+            if not left._indexed_same(right):
+                if flex:
+                    left, right = left.align(right, join="outer", level=level)
+                else:
+                    raise ValueError(
+                        "Can only compare identically-labeled (both index and "
+                        "columns) DataFrame objects"
+                    )
+        elif isinstance(right, Series):
+            # axis=1 is default for DataFrame-with-Series op
+            axis = axis if axis is not None else 1
+            left, right = left.align(
+                right,
+                join="outer",
+                axis=axis,
+                level=level,
+            )
+            right = left._maybe_align_series_as_frame(right, axis)

-        result = self._constructor(correl, index=idx, columns=cols, copy=False)
-        return result.__finalize__(self, method="corr")

+        return left, right

-    def cov(
-        self,
-        min_periods: int | None = None,
-        ddof: int | None = 1,
-        numeric_only: bool = False,
-    ) -> DataFrame:
+    def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt):
         """
-        Compute pairwise covariance of columns, excluding NA/null values.
+        If the Series operand is not EA-dtype, we can broadcast to 2D and operate
+        blockwise.
+        """
+        rvalues = series._values
+        if not isinstance(rvalues, np.ndarray):
+            # TODO(EA2D): no need to special-case with 2D EAs
+            if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"):
+                # We can losslessly+cheaply cast to ndarray
+                rvalues = np.asarray(rvalues)
+            else:
+                return series

-        Compute the pairwise covariance among the series of a DataFrame.
-        The returned data frame is the `covariance matrix
-        `__ of the columns
-        of the DataFrame.

+        if axis == 0:
+            rvalues = rvalues.reshape(-1, 1)
+        else:
+            rvalues = rvalues.reshape(1, -1)

-        Both NA and null values are automatically excluded from the
-        calculation. (See the note below about bias from missing values.)
-        A threshold can be set for the minimum number of
-        observations for each value created. Comparisons with observations
-        below this threshold will be returned as ``NaN``.

+        rvalues = np.broadcast_to(rvalues, self.shape)
+        # pass dtype to avoid doing inference
+        return self._constructor(
+            rvalues,
+            index=self.index,
+            columns=self.columns,
+            dtype=rvalues.dtype,
+        )

-        This method is generally used for the analysis of time series data to
-        understand the relationship between different measures
-        across time.

+    def _flex_arith_method(
+        self, other, op, *, axis: Axis = "columns", level=None, fill_value=None
+    ):
+        axis = self._get_axis_number(axis) if axis is not None else 1

-        Parameters
-        ----------
-        min_periods : int, optional
-            Minimum number of observations required per pair of columns
-            to have a valid result.

+        if self._should_reindex_frame_op(other, op, axis, fill_value, level):
+            return self._arith_method_with_reindex(other, op)

-        ddof : int, default 1
-            Delta degrees of freedom. The divisor used in calculations
-            is ``N - ddof``, where ``N`` represents the number of elements.
-            This argument is applicable only when no ``nan`` is in the dataframe.

+        if isinstance(other, Series) and fill_value is not None:
+            # TODO: We could allow this in cases where we end up going
+            #  through the DataFrame path
+            raise NotImplementedError(f"fill_value {fill_value} not supported.")

-        numeric_only : bool, default False
-            Include only `float`, `int` or `boolean` data.

-            .. versionadded:: 1.5.0

+        other = ops.maybe_prepare_scalar_for_op(other, self.shape)
+        self, other = self._align_for_op(other, axis, flex=True, level=level)

-            .. versionchanged:: 2.0.0
-                The default value of ``numeric_only`` is now ``False``.

+        with np.errstate(all="ignore"):
+            if isinstance(other, DataFrame):
+                # Another DataFrame
+                new_data = self._combine_frame(other, op, fill_value)
+            elif isinstance(other, Series):
+                new_data = self._dispatch_frame_op(other, op, axis=axis)
+            else:
+                new_data = self._dispatch_frame_op(other, op)
+
+        return self._construct_result(new_data)

+    def _construct_result(self, result) -> DataFrame:
+        """
+        Wrap the result of an arithmetic, comparison, or logical operation.
+
+        Parameters
+        ----------
+        result : DataFrame

         Returns
         -------
         DataFrame
-            The covariance matrix of the series of the DataFrame.

-        See Also
-        --------
-        Series.cov : Compute covariance with another Series.
-        core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
-            covariance.
-        core.window.expanding.Expanding.cov : Expanding sample covariance.
-        core.window.rolling.Rolling.cov : Rolling sample covariance.

-        Notes
-        -----
-        Returns the covariance matrix of the DataFrame's time series.
-        The covariance is normalized by N-ddof.

-        For DataFrames that have Series that are missing data (assuming that
-        data is `missing at random
-        `__)
-        the returned covariance matrix will be an unbiased estimate
-        of the variance and covariance between the member Series.

-        However, for many applications this estimate may not be acceptable
-        because the estimate covariance matrix is not guaranteed to be positive
-        semi-definite. This could lead to estimate correlations having
-        absolute values which are greater than one, and/or a non-invertible
-        covariance matrix. See `Estimation of covariance matrices
-        `__ for more details.

-        Examples
-        --------
-        >>> df = pd.DataFrame(
-        ...     [(1, 2), (0, 3), (2, 0), (1, 1)], columns=["dogs", "cats"]
-        ...
) - >>> df.cov() - dogs cats - dogs 0.666667 -1.000000 - cats -1.000000 1.666667 + """ + out = self._constructor(result, copy=False).__finalize__(self) + # Pin columns instead of passing to constructor for compat with + # non-unique columns case + out.columns = self.columns + out.index = self.index + return out - >>> np.random.seed(42) - >>> df = pd.DataFrame( - ... np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"] - ... ) - >>> df.cov() - a b c d e - a 0.998438 -0.020161 0.059277 -0.008943 0.014144 - b -0.020161 1.059352 -0.008543 -0.024738 0.009826 - c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 - d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 - e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + def __divmod__(self, other) -> tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = self // other + mod = self - div * other + return div, mod - **Minimum number of periods** + def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: + # Naive implementation, room for optimization + div = other // self + mod = other - div * self + return div, mod - This method also supports an optional ``min_periods`` keyword - that specifies the required minimum number of non-NA observations for - each column pair in order to have a valid result: + def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None): + axis = self._get_axis_number(axis) if axis is not None else 1 - >>> np.random.seed(42) - >>> df = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) - >>> df.loc[df.index[:5], "a"] = np.nan - >>> df.loc[df.index[5:10], "b"] = np.nan - >>> df.cov(min_periods=12) - a b c - a 0.316741 NaN -0.150812 - b NaN 1.248003 0.191417 - c -0.150812 0.191417 0.895202 - """ - data = self._get_numeric_data() if numeric_only else self - if any(blk.dtype.kind in "mM" for blk in self._mgr.blocks): - msg = ( - "DataFrame contains columns with dtype datetime64 " - "or timedelta64, which are not supported for cov." - ) - raise TypeError(msg) - cols = data.columns - idx = cols.copy() - mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) + self, other = self._align_for_op(other, axis, flex=True, level=level) - if notna(mat).all(): - if min_periods is not None and min_periods > len(mat): - base_cov = np.empty((mat.shape[1], mat.shape[1])) - base_cov.fill(np.nan) - else: - base_cov = np.cov(mat.T, ddof=ddof) - base_cov = base_cov.reshape((len(cols), len(cols))) - else: - base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) + new_data = self._dispatch_frame_op(other, op, axis=axis) + return self._construct_result(new_data) - result = self._constructor(base_cov, index=idx, columns=cols, copy=False) - return result.__finalize__(self, method="cov") + @Appender(ops.make_flex_doc("eq", "dataframe")) + def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.eq, axis=axis, level=level) - def corrwith( - self, - other: DataFrame | Series, - axis: Axis = 0, - drop: bool = False, - method: CorrelationMethod = "pearson", - numeric_only: bool = False, - min_periods: int | None = None, - ) -> Series: - """ - Compute pairwise correlation. + @Appender(ops.make_flex_doc("ne", "dataframe")) + def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.ne, axis=axis, level=level) - Pairwise correlation is computed between rows or columns of - DataFrame with rows or columns of Series or DataFrame. 
DataFrames - are first aligned along both axes before computing the - correlations. + @Appender(ops.make_flex_doc("le", "dataframe")) + def le(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.le, axis=axis, level=level) - Parameters - ---------- - other : DataFrame, Series - Object with which to compute correlations. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for - column-wise. - drop : bool, default False - Drop missing indices from result. - method : {'pearson', 'kendall', 'spearman'} or callable - Method of correlation: + @Appender(ops.make_flex_doc("lt", "dataframe")) + def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.lt, axis=axis, level=level) - * pearson : standard correlation coefficient - * kendall : Kendall Tau correlation coefficient - * spearman : Spearman rank correlation - * callable: callable with input two 1d ndarrays - and returning a float. + @Appender(ops.make_flex_doc("ge", "dataframe")) + def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.ge, axis=axis, level=level) - numeric_only : bool, default False - Include only `float`, `int` or `boolean` data. + @Appender(ops.make_flex_doc("gt", "dataframe")) + def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame: + return self._flex_cmp_method(other, operator.gt, axis=axis, level=level) - min_periods : int, optional - Minimum number of observations needed to have a valid result. + @Appender(ops.make_flex_doc("add", "dataframe")) + def add( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.add, level=level, fill_value=fill_value, axis=axis + ) - .. versionadded:: 1.5.0 + @Appender(ops.make_flex_doc("radd", "dataframe")) + def radd( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.radd, level=level, fill_value=fill_value, axis=axis + ) - .. versionchanged:: 2.0.0 - The default value of ``numeric_only`` is now ``False``. + @Appender(ops.make_flex_doc("sub", "dataframe")) + def sub( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, operator.sub, level=level, fill_value=fill_value, axis=axis + ) - Returns - ------- - Series - Pairwise correlations. + subtract = sub - See Also - -------- - DataFrame.corr : Compute pairwise correlation of columns. + @Appender(ops.make_flex_doc("rsub", "dataframe")) + def rsub( + self, other, axis: Axis = "columns", level=None, fill_value=None + ) -> DataFrame: + return self._flex_arith_method( + other, roperator.rsub, level=level, fill_value=fill_value, axis=axis + ) - Examples - -------- - >>> index = ["a", "b", "c", "d", "e"] - >>> columns = ["one", "two", "three", "four"] - >>> df1 = pd.DataFrame( - ... np.arange(20).reshape(5, 4), index=index, columns=columns - ... ) - >>> df2 = pd.DataFrame( - ... np.arange(16).reshape(4, 4), index=index[:4], columns=columns - ... 
 )
-        >>> df1.corrwith(df2)
-        one      1.0
-        two      1.0
-        three    1.0
-        four     1.0
-        dtype: float64
+    @Appender(ops.make_flex_doc("mul", "dataframe"))
+    def mul(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.mul, level=level, fill_value=fill_value, axis=axis
+        )

-        >>> df2.corrwith(df1, axis=1)
-        a    1.0
-        b    1.0
-        c    1.0
-        d    1.0
-        e    NaN
-        dtype: float64
-        """
-        axis = self._get_axis_number(axis)
-        this = self._get_numeric_data() if numeric_only else self
+    multiply = mul

-        if isinstance(other, Series):
-            return this.apply(
-                lambda x: other.corr(x, method=method, min_periods=min_periods),
-                axis=axis,
-            )
+    @Appender(ops.make_flex_doc("rmul", "dataframe"))
+    def rmul(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rmul, level=level, fill_value=fill_value, axis=axis
+        )

-        if numeric_only:
-            other = other._get_numeric_data()
-        left, right = this.align(other, join="inner")
+    @Appender(ops.make_flex_doc("truediv", "dataframe"))
+    def truediv(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.truediv, level=level, fill_value=fill_value, axis=axis
+        )

-        if axis == 1:
-            left = left.T
-            right = right.T
+    div = truediv
+    divide = truediv

-        if method == "pearson":
-            # mask missing values
-            left = left + right * 0
-            right = right + left * 0

-            # demeaned data
-            ldem = left - left.mean(numeric_only=numeric_only)
-            rdem = right - right.mean(numeric_only=numeric_only)

-            num = (ldem * rdem).sum()
-            dom = (
-                (left.count() - 1)
-                * left.std(numeric_only=numeric_only)
-                * right.std(numeric_only=numeric_only)
-            )

-            correl = num / dom

-        elif method in ["kendall", "spearman"] or callable(method):

-            def c(x):
-                return nanops.nancorr(x[0], x[1], method=method)
+    @Appender(ops.make_flex_doc("rtruediv", "dataframe"))
+    def rtruediv(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis
+        )

+    rdiv = rtruediv

-            correl = self._constructor_sliced(
-                map(c, zip(left.values.T, right.values.T)),
-                index=left.columns,
-                copy=False,
-            )
+    @Appender(ops.make_flex_doc("floordiv", "dataframe"))
+    def floordiv(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.floordiv, level=level, fill_value=fill_value, axis=axis
+        )

-        else:
-            raise ValueError(
-                f"Invalid method {method} was passed, "
-                "valid methods are: 'pearson', 'kendall', "
-                "'spearman', or callable"
-            )
+    @Appender(ops.make_flex_doc("rfloordiv", "dataframe"))
+    def rfloordiv(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis
+        )
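Aside (not part of the patch): what these flex methods add over the plain operators is the `axis`, `level` and `fill_value` hooks. A minimal sketch with made-up data, assuming current pandas alignment semantics:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})  # invented toy frame

# axis="columns" (the default): the Series index is matched against df's columns
print(df.sub(pd.Series({"a": 1, "b": 2})))
# axis="index": the Series index is matched against df's rows instead
print(df.sub(pd.Series([10, 20, 30]), axis="index"))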
-        if not drop:
-            # Find non-matching labels along the given axis
-            # and append missing correlations (GH 22375)
-            raxis: AxisInt = 1 if axis == 0 else 0
-            result_index = this._get_axis(raxis).union(other._get_axis(raxis))
+    @Appender(ops.make_flex_doc("mod", "dataframe"))
+    def mod(
+        self, other, axis: Axis = "columns", level=None, fill_value=None
+    ) -> DataFrame:
+        return self._flex_arith_method(
+            other, operator.mod, level=level, fill_value=fill_value, axis=axis
+        )
+
+    def melt(
+        self,
+        id_vars=None,
+        value_vars=None,
+        var_name=None,
+        value_name="value",
+        col_level=None,
+        ignore_index=True,
+    ) -> DataFrame:
+        """
         idx_diff = result_index.difference(correl.index)

         if len(idx_diff) > 0:
@@ -11607,2136 +19315,1758 @@ def c(x):
     # ----------------------------------------------------------------------
     # ndarray-like stats methods

-    def count(self, axis: Axis = 0, numeric_only: bool = False) -> Series:
-        """
-        Count non-NA cells for each column or row.
-
-        The values `None`, `NaN`, `NaT`, ``pandas.NA`` are considered NA.

         Parameters
         ----------
-        axis : {0 or 'index', 1 or 'columns'}, default 0
-            If 0 or 'index' counts are generated for each column.
-            If 1 or 'columns' counts are generated for each row.
-        numeric_only : bool, default False
-            Include only `float`, `int` or `boolean` data.
+        id_vars : scalar, tuple, list, or ndarray, optional
+            Column(s) to use as identifier variables.
+        value_vars : scalar, tuple, list, or ndarray, optional
+            Column(s) to unpivot. If not specified, uses all columns that
+            are not set as `id_vars`.
+        var_name : scalar, default None
+            Name to use for the 'variable' column. If None it uses
+            ``frame.columns.name`` or 'variable'.
+        value_name : scalar, default 'value'
+            Name to use for the 'value' column, can't be an existing column label.
+        col_level : scalar, optional
+            If columns are a MultiIndex then use this level to melt.
+        ignore_index : bool, default True
+            If True, original index is ignored. If False, original index is retained.
+            Index labels will be repeated as necessary.

         Returns
         -------
-        Series
-            For each column/row the number of non-NA/null entries.
+        DataFrame
+            Unpivoted DataFrame.

         See Also
         --------
-        Series.count: Number of non-NA elements in a Series.
-        DataFrame.value_counts: Count unique combinations of columns.
-        DataFrame.shape: Number of DataFrame rows and columns (including NA
-            elements).
-        DataFrame.isna: Boolean same-sized DataFrame showing places of NA
-            elements.
+        melt : Identical method.
+        pivot_table : Create a spreadsheet-style pivot table as a DataFrame.
+        DataFrame.pivot : Return reshaped DataFrame organized
+            by given index / column values.
+        DataFrame.explode : Explode a DataFrame from list-like
+            columns to long format.
+
+        Notes
+        -----
+        Reference :ref:`the user guide <reshaping.melt>` for more examples.

         Examples
         --------
-        Constructing DataFrame from a dictionary:
-
         >>> df = pd.DataFrame(
         ...     {
-        ...         "Person": ["John", "Myla", "Lewis", "John", "Myla"],
-        ...         "Age": [24.0, np.nan, 21.0, 33, 26],
-        ...         "Single": [False, True, True, True, False],
+        ...         "A": {0: "a", 1: "b", 2: "c"},
+        ...         "B": {0: 1, 1: 3, 2: 5},
+        ...         "C": {0: 2, 1: 4, 2: 6},
         ...     }
         ...
) >>> df - Person Age Single - 0 John 24.0 False - 1 Myla NaN True - 2 Lewis 21.0 True - 3 John 33.0 True - 4 Myla 26.0 False - - Notice the uncounted NA values: - - >>> df.count() - Person 5 - Age 4 - Single 5 - dtype: int64 - - Counts for each **row**: - - >>> df.count(axis="columns") - 0 3 - 1 2 - 2 3 - 3 3 - 4 3 - dtype: int64 - """ - axis = self._get_axis_number(axis) - - if numeric_only: - frame = self._get_numeric_data() - else: - frame = self - - # GH #423 - if len(frame._get_axis(axis)) == 0: - result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) - else: - result = notna(frame).sum(axis=axis) - - return result.astype("int64").__finalize__(self, method="count") + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 - def _reduce( - self, - op, - name: str, - *, - axis: Axis = 0, - skipna: bool = True, - numeric_only: bool = False, - filter_type=None, - **kwds, - ): - assert filter_type is None or filter_type == "bool", filter_type - out_dtype = "bool" if filter_type == "bool" else None + >>> df.melt(id_vars=["A"], value_vars=["B"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 - if axis is not None: - axis = self._get_axis_number(axis) + >>> df.melt(id_vars=["A"], value_vars=["B", "C"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 - def func(values: np.ndarray): - # We only use this in the case that operates on self.values - return op(values, axis=axis, skipna=skipna, **kwds) + The names of 'variable' and 'value' columns can be customized: - def blk_func(values, axis: Axis = 1): - if isinstance(values, ExtensionArray): - if not is_1d_only_ea_dtype(values.dtype): - return values._reduce(name, axis=1, skipna=skipna, **kwds) - return values._reduce(name, skipna=skipna, keepdims=True, **kwds) - else: - return op(values, axis=axis, skipna=skipna, **kwds) + >>> df.melt( + ... id_vars=["A"], + ... value_vars=["B"], + ... var_name="myVarname", + ... value_name="myValname", + ... ) + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 - def _get_data() -> DataFrame: - if filter_type is None: - data = self._get_numeric_data() - else: - # GH#25101, GH#24434 - assert filter_type == "bool" - data = self._get_bool_data() - return data + Original index values can be kept around: - # Case with EAs see GH#35881 - df = self - if numeric_only: - df = _get_data() - if axis is None: - dtype = find_common_type([block.values.dtype for block in df._mgr.blocks]) - if isinstance(dtype, ExtensionDtype): - df = df.astype(dtype) - arr = concat_compat(list(df._iter_column_arrays())) - return arr._reduce(name, skipna=skipna, keepdims=False, **kwds) - return func(df.values) - elif axis == 1: - if len(df.index) == 0: - # Taking a transpose would result in no columns, losing the dtype. 
- # In the empty case, reducing along axis 0 or 1 gives the same - # result dtype, so reduce with axis=0 and ignore values - result = df._reduce( - op, - name, - axis=0, - skipna=skipna, - numeric_only=False, - filter_type=filter_type, - **kwds, - ).iloc[:0] - result.index = df.index - return result + >>> df.melt(id_vars=["A"], value_vars=["B", "C"], ignore_index=False) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 0 a C 2 + 1 b C 4 + 2 c C 6 - # kurtosis excluded since groupby does not implement it - if df.shape[1] and name != "kurt": - dtype = find_common_type( - [block.values.dtype for block in df._mgr.blocks] - ) - if isinstance(dtype, ExtensionDtype): - # GH 54341: fastpath for EA-backed axis=1 reductions - # This flattens the frame into a single 1D array while keeping - # track of the row and column indices of the original frame. Once - # flattened, grouping by the row indices and aggregating should - # be equivalent to transposing the original frame and aggregating - # with axis=0. - name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) - df = df.astype(dtype) - arr = concat_compat(list(df._iter_column_arrays())) - nrows, ncols = df.shape - row_index = np.tile(np.arange(nrows), ncols) - col_index = np.repeat(np.arange(ncols), nrows) - ser = Series(arr, index=col_index, copy=False) - # GroupBy will raise a warning with SeriesGroupBy as the object, - # likely confusing users - with rewrite_warning( - target_message=( - f"The behavior of SeriesGroupBy.{name} with all-NA values" - ), - target_category=FutureWarning, - new_message=( - f"The behavior of {type(self).__name__}.{name} with all-NA " - "values, or any-NA and skipna=False, is deprecated. In " - "a future version this will raise ValueError" - ), - ): - result = ser.groupby(row_index).agg(name, **kwds) - result.index = df.index - if not skipna and name not in ("any", "all"): - mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) - other = -1 if name in ("idxmax", "idxmin") else lib.no_default - result = result.mask(mask, other) - return result - - df = df.T - - # After possibly _get_data and transposing, we are now in the - # simple case where we can use BlockManager.reduce - res = df._mgr.reduce(blk_func) - out = df._constructor_from_mgr(res, axes=res.axes).iloc[0] - if out_dtype is not None and out.dtype != "boolean": - out = out.astype(out_dtype) - elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]: - out = out.astype(object) - elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"): - # Even if we are object dtype, follow numpy and return - # float64, see test_apply_funcs_over_empty - out = out.astype(np.float64) + If you have multi-index columns: - return out + >>> df.columns = [list("ABC"), list("DEF")] + >>> df + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 - def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: - """ - Special case for _reduce to try to avoid a potentially-expensive transpose. + >>> df.melt(col_level=0, id_vars=["A"], value_vars=["B"]) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 - Apply the reduction block-wise along axis=1 and then reduce the resulting - 1D arrays. 
+ >>> df.melt(id_vars=[("A", "D")], value_vars=[("B", "E")]) + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 """ - if name == "all": - result = np.ones(len(self), dtype=bool) - ufunc = np.logical_and - elif name == "any": - result = np.zeros(len(self), dtype=bool) - # error: Incompatible types in assignment - # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'], - # Literal[20], Literal[False]]", variable has type - # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20], - # Literal[True]]") - ufunc = np.logical_or # type: ignore[assignment] - else: - raise NotImplementedError(name) - - for blocks in self._mgr.blocks: - middle = func(blocks.values, axis=0, skipna=skipna) - result = ufunc(result, middle) - - res_ser = self._constructor_sliced(result, index=self.index, copy=False) - return res_ser - - # error: Signature of "any" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def any( - self, - *, - axis: Axis = ..., - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> Series: ... + return melt( + self, + id_vars=id_vars, + value_vars=value_vars, + var_name=var_name, + value_name=value_name, + col_level=col_level, + ignore_index=ignore_index, + ).__finalize__(self, method="melt") - @overload - def any( - self, - *, - axis: None, - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> bool: ... + # ---------------------------------------------------------------------- + # Time series-related - @overload - def any( - self, - *, - axis: Axis | None, - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> Series | bool: ... + @doc( + Series.diff, + klass="DataFrame", + extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " + "Take difference over rows (0) or columns (1).\n", + other_klass="Series", + examples=dedent( + """ + Difference with previous row - @doc(make_doc("any", ndim=1)) - def any( - self, - *, - axis: Axis | None = 0, - bool_only: bool = False, - skipna: bool = True, - **kwargs, - ) -> Series | bool: - result = self._logical_func( - "any", nanops.nanany, axis, bool_only, skipna, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="any") - return result + >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], + ... 'b': [1, 1, 2, 3, 5, 8], + ... 'c': [1, 4, 9, 16, 25, 36]}) + >>> df + a b c + 0 1 1 1 + 1 2 1 4 + 2 3 2 9 + 3 4 3 16 + 4 5 5 25 + 5 6 8 36 - @overload - def all( - self, - *, - axis: Axis = ..., - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> Series: ... + >>> df.diff() + a b c + 0 NaN NaN NaN + 1 1.0 0.0 3.0 + 2 1.0 1.0 5.0 + 3 1.0 1.0 7.0 + 4 1.0 2.0 9.0 + 5 1.0 3.0 11.0 - @overload - def all( - self, - *, - axis: None, - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> bool: ... + Difference with previous column - @overload - def all( - self, - *, - axis: Axis | None, - bool_only: bool = ..., - skipna: bool = ..., - **kwargs, - ) -> Series | bool: ... 
+ >>> df.diff(axis=1) + a b c + 0 NaN 0 0 + 1 NaN -1 3 + 2 NaN -1 7 + 3 NaN -1 13 + 4 NaN 0 20 + 5 NaN 2 28 - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="all") - @doc(make_doc("all", ndim=1)) - def all( - self, - axis: Axis | None = 0, - bool_only: bool = False, - skipna: bool = True, - **kwargs, - ) -> Series | bool: - result = self._logical_func( - "all", nanops.nanall, axis, bool_only, skipna, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="all") - return result + Difference with 3rd previous row - # error: Signature of "min" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def min( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... + >>> df.diff(periods=3) + a b c + 0 NaN NaN NaN + 1 NaN NaN NaN + 2 NaN NaN NaN + 3 3.0 2.0 15.0 + 4 3.0 4.0 21.0 + 5 3.0 6.0 27.0 - @overload - def min( - self, - *, - axis: None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... + Difference with following row - @overload - def min( - self, - *, - axis: Axis | None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... + >>> df.diff(periods=-1) + a b c + 0 -1.0 0.0 -3.0 + 1 -1.0 -1.0 -5.0 + 2 -1.0 -1.0 -7.0 + 3 -1.0 -2.0 -9.0 + 4 -1.0 -3.0 -11.0 + 5 NaN NaN NaN - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="min") - @doc(make_doc("min", ndim=2)) - def min( - self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - result = super().min( - axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="min") - return result + Overflow in input dtype - # error: Signature of "max" incompatible with supertype "NDFrame" - @overload # type: ignore[override] - def max( - self, - *, - axis: Axis = ..., - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series: ... + >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8) + >>> df.diff() + a + 0 NaN + 1 255.0""" + ), + ) + def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: + if not lib.is_integer(periods): + if not (is_float(periods) and periods.is_integer()): + raise ValueError("periods must be an integer") + periods = int(periods) - @overload - def max( - self, - *, - axis: None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Any: ... + axis = self._get_axis_number(axis) + if axis == 1: + if periods != 0: + # in the periods == 0 case, this is equivalent diff of 0 periods + # along axis=0, and the Manager method may be somewhat more + # performant, so we dispatch in that case. + return self - self.shift(periods, axis=axis) + # With periods=0 this is equivalent to a diff with axis=0 + axis = 0 - @overload - def max( - self, - *, - axis: Axis | None, - skipna: bool = ..., - numeric_only: bool = ..., - **kwargs, - ) -> Series | Any: ... 
+ new_data = self._mgr.diff(n=periods) + res_df = self._constructor_from_mgr(new_data, axes=new_data.axes) + return res_df.__finalize__(self, "diff") - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="max") - @doc(make_doc("max", ndim=2)) - def max( - self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - **kwargs, - ) -> Series | Any: - result = super().max( - axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs - ) - if isinstance(result, Series): - result = result.__finalize__(self, method="max") - return result + # ---------------------------------------------------------------------- + # Function application - @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sum") - def sum( + def _gotitem( self, - axis: Axis | None = 0, - skipna: bool = True, - numeric_only: bool = False, - min_count: int = 0, - **kwargs, - ) -> Series: + key: IndexLabel, + ndim: int, + subset: DataFrame | Series | None = None, + ) -> DataFrame | Series: """ - Return the sum of the values over the requested axis. - - This is equivalent to the method ``numpy.sum``. + Sub-classes to define. Return a sliced object. Parameters ---------- - axis : {index (0), columns (1)} - Axis for the function to be applied on. - For `Series` this parameter is unused and defaults to 0. - - .. warning:: + key : string / list of selections + ndim : {1, 2} + requested ndim of result + subset : object, default None + subset to act on + """ + if subset is None: + subset = self + elif subset.ndim == 1: # is Series + return subset - The behavior of DataFrame.sum with ``axis=None`` is deprecated, - in a future version this will reduce over both axes and return a scalar - To retain the old behavior, pass axis=0 (or do not pass axis). + # TODO: _shallow_copy(subset)? + return subset[key] - .. versionadded:: 2.0.0 + _agg_see_also_doc = dedent( + """ + See Also + -------- + DataFrame.apply : Perform any type of operations. + DataFrame.transform : Perform transformation type operations. + DataFrame.groupby : Perform operations over groups. + DataFrame.resample : Perform operations over resampled bins. + DataFrame.rolling : Perform operations over rolling window. + DataFrame.expanding : Perform operations over expanding window. + core.window.ewm.ExponentialMovingWindow : Perform operation over exponential + weighted window. + """ + ) - skipna : bool, default True - Exclude NA/null values when computing the result. - numeric_only : bool, default False - Include only float, int, boolean columns. Not implemented for Series. - min_count : int, default 0 - The required number of valid values to perform the operation. If fewer than - ``min_count`` non-NA values are present the result will be NA. - **kwargs - Additional keyword arguments to be passed to the function. + _agg_examples_doc = dedent( + """ + Examples + -------- + >>> df = pd.DataFrame([[1, 2, 3], + ... [4, 5, 6], + ... [7, 8, 9], + ... [np.nan, np.nan, np.nan]], + ... columns=['A', 'B', 'C']) - Returns - ------- - Series or scalar - Sum over requested axis. + Aggregate these functions over the rows. - See Also - -------- - Series.sum : Return the sum over Series values. - DataFrame.mean : Return the mean of the values over the requested axis. - DataFrame.median : Return the median of the values over the requested axis. - DataFrame.mode : Get the mode(s) of each element along the requested axis. - DataFrame.std : Return the standard deviation of the values over the - requested axis. 
+        >>> df.agg(['sum', 'min'])
+                A     B     C
+        sum  12.0  15.0  18.0
+        min   1.0   2.0   3.0

-        Examples
-        --------
-        >>> idx = pd.MultiIndex.from_arrays(
-        ...     [["warm", "warm", "cold", "cold"], ["dog", "falcon", "fish", "spider"]],
-        ...     names=["blooded", "animal"],
-        ... )
-        >>> s = pd.Series([4, 2, 0, 8], name="legs", index=idx)
-        >>> s
-        blooded  animal
-        warm     dog       4
-                 falcon    2
-        cold     fish      0
-                 spider    8
-        Name: legs, dtype: int64
+        Different aggregations per column.

-        >>> s.sum()
-        14
+        >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
+                A    B
+        sum  12.0  NaN
+        min   1.0  2.0
+        max   NaN  8.0

-        By default, the sum of an empty or all-NA Series is ``0``.
+        Aggregate different functions over the columns and rename the index of the resulting
+        DataFrame.

-        >>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
-        0.0
+        >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))
+             A    B    C
+        x  7.0  NaN  NaN
+        y  NaN  2.0  NaN
+        z  NaN  NaN  6.0

-        This can be controlled with the ``min_count`` parameter. For example, if
-        you'd like the sum of an empty series to be NaN, pass ``min_count=1``.
+        Aggregate over the columns.

-        >>> pd.Series([], dtype="float64").sum(min_count=1)
-        nan
+        >>> df.agg("mean", axis="columns")
+        0    2.0
+        1    5.0
+        2    8.0
+        3    NaN
+        dtype: float64
+        """
+    )

-        Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
-        empty series identically.
+    @doc(
+        _shared_docs["aggregate"],
+        klass=_shared_doc_kwargs["klass"],
+        axis=_shared_doc_kwargs["axis"],
+        see_also=_agg_see_also_doc,
+        examples=_agg_examples_doc,
+    )
+    def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
+        from pandas.core.apply import frame_apply

-        >>> pd.Series([np.nan]).sum()
-        0.0
+        axis = self._get_axis_number(axis)

-        >>> pd.Series([np.nan]).sum(min_count=1)
-        nan
-        """
-        result = super().sum(
-            axis=axis,
-            skipna=skipna,
-            numeric_only=numeric_only,
-            min_count=min_count,
-            **kwargs,
-        )
-        if isinstance(result, Series):
-            result = result.__finalize__(self, method="sum")
+        op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
+        result = op.agg()
+        result = reconstruct_and_relabel_result(result, func, **kwargs)
         return result
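Aside (not part of the patch): `aggregate` accepts lists and named keyword aggregations, as the docstring above shows. A small runnable sketch with invented data:

import pandas as pd

df = pd.DataFrame({"A": [1, 4], "B": [2, 8]})  # invented toy frame
# A list of reducers yields one labelled row per function ...
print(df.agg(["sum", "max"]))
# ... while keyword (named) aggregation relabels the result index,
# as in the x/y/z docstring example above.
print(df.agg(total=("A", "sum"), largest=("B", "max")))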
-    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="prod")
-    def prod(
-        self,
-        axis: Axis | None = 0,
-        skipna: bool = True,
-        numeric_only: bool = False,
-        min_count: int = 0,
-        **kwargs,
-    ) -> Series:
-        """
-        Return the product of the values over the requested axis.

-        Parameters
-        ----------
-        axis : {index (0), columns (1)}
-            Axis for the function to be applied on.
-            For `Series` this parameter is unused and defaults to 0.

-            .. warning::

-                The behavior of DataFrame.prod with ``axis=None`` is deprecated,
-                in a future version this will reduce over both axes and return a scalar
-                To retain the old behavior, pass axis=0 (or do not pass axis).

-            .. versionadded:: 2.0.0

-        skipna : bool, default True
-            Exclude NA/null values when computing the result.
-        numeric_only : bool, default False
-            Include only float, int, boolean columns. Not implemented for Series.
+    @doc(
+        _shared_docs["transform"],
+        klass=_shared_doc_kwargs["klass"],
+        axis=_shared_doc_kwargs["axis"],
+    )
+    def transform(
+        self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
+    ) -> DataFrame:
+        from pandas.core.apply import frame_apply

-        min_count : int, default 0
-            The required number of valid values to perform the operation. If fewer than
-            ``min_count`` non-NA values are present the result will be NA.
-        **kwargs
-            Additional keyword arguments to be passed to the function.
+        op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
+        result = op.transform()
+        assert isinstance(result, DataFrame)
+        return result
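Aside (not part of the patch): unlike `apply`/`agg`, `transform` must return output of the same shape as its input. A sketch with invented data:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})  # invented toy frame
# transform must return values of the same length as the input ...
print(df.transform(lambda col: col + 1))
# ... so a reducer such as df.transform("sum") raises a ValueError,
# whereas df.agg("sum") is the supported spelling for reductions.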
-        Returns
-        -------
-        Series or scalar
-            The product of the values over the requested axis.
+    def apply(
+        self,
+        func: AggFuncType,
+        axis: Axis = 0,
+        raw: bool = False,
+        result_type: Literal["expand", "reduce", "broadcast"] | None = None,
+        args=(),
+        by_row: Literal[False, "compat"] = "compat",
+        engine: Literal["python", "numba"] = "python",
+        engine_kwargs: dict[str, bool] | None = None,
+        **kwargs,
+    ):

-        See Also
-        --------
-        Series.sum : Return the sum.
-        Series.min : Return the minimum.
-        Series.max : Return the maximum.
-        Series.idxmin : Return the index of the minimum.
-        Series.idxmax : Return the index of the maximum.
-        DataFrame.sum : Return the sum over the requested axis.
-        DataFrame.min : Return the minimum over the requested axis.
-        DataFrame.max : Return the maximum over the requested axis.
-        DataFrame.idxmin : Return the index of the minimum over the requested axis.
-        DataFrame.idxmax : Return the index of the maximum over the requested axis.
+        """
+        Apply a function along an axis of the DataFrame.

-        Examples
-        --------
-        By default, the product of an empty or all-NA Series is ``1``
+        Objects passed to the function are Series objects whose index is
+        either the DataFrame's index (``axis=0``) or the DataFrame's columns
+        (``axis=1``). By default (``result_type=None``), the final return type
+        is inferred from the return type of the applied function. Otherwise,
+        it depends on the `result_type` argument.

-        >>> pd.Series([], dtype="float64").prod()
-        1.0
+        Parameters
+        ----------
+        func : function
+            Function to apply to each column or row.
+        axis : {0 or 'index', 1 or 'columns'}, default 0

-        This can be controlled with the ``min_count`` parameter

-        >>> pd.Series([], dtype="float64").prod(min_count=1)
-        nan

-        Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
-        empty series identically.

-        >>> pd.Series([np.nan]).prod()
-        1.0

-        >>> pd.Series([np.nan]).prod(min_count=1)
-        nan
-        """
-        result = super().prod(
-            axis=axis,
-            skipna=skipna,
-            numeric_only=numeric_only,
-            min_count=min_count,
-            **kwargs,
-        )
-        if isinstance(result, Series):
-            result = result.__finalize__(self, method="prod")
-        return result
+            * 0 or 'index': apply function to each column.
+            * 1 or 'columns': apply function to each row.

-    # error: Signature of "mean" incompatible with supertype "NDFrame"
-    @overload  # type: ignore[override]
-    def mean(
-        self,
-        *,
-        axis: Axis = ...,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series: ...

-    @overload
-    def mean(
-        self,
-        *,
-        axis: None,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Any: ...

-    @overload
-    def mean(
-        self,
-        *,
-        axis: Axis | None,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series | Any: ...

-    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="mean")
-    @doc(make_doc("mean", ndim=2))
-    def mean(
-        self,
-        axis: Axis | None = 0,
-        skipna: bool = True,
-        numeric_only: bool = False,
-        **kwargs,
-    ) -> Series | Any:
-        result = super().mean(
-            axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
-        )
-        if isinstance(result, Series):
-            result = result.__finalize__(self, method="mean")
-        return result
+        raw : bool, default False
+            Determines if row or column is passed as a Series or ndarray object:

-    # error: Signature of "median" incompatible with supertype "NDFrame"
-    @overload  # type: ignore[override]
-    def median(
-        self,
-        *,
-        axis: Axis = ...,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series: ...
+            * ``False`` : passes each row or column as a Series to the
+              function.
+            * ``True`` : the passed function will receive ndarray objects
+              instead.
+              If you are just applying a NumPy reduction function this will
+              achieve much better performance.

-    @overload
-    def median(
-        self,
-        *,
-        axis: None,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Any: ...
+        result_type : {'expand', 'reduce', 'broadcast', None}, default None
+            These only act when ``axis=1`` (columns):

-    @overload
-    def median(
-        self,
-        *,
-        axis: Axis | None,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series | Any: ...
+            * 'expand' : list-like results will be turned into columns.
+            * 'reduce' : returns a Series if possible rather than expanding
+              list-like results. This is the opposite of 'expand'.
+            * 'broadcast' : results will be broadcast to the original shape
+              of the DataFrame, the original index and columns will be
+              retained.
-    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="median")
-    @doc(make_doc("median", ndim=2))
-    def median(
-        self,
-        axis: Axis | None = 0,
-        skipna: bool = True,
-        numeric_only: bool = False,
-        **kwargs,
-    ) -> Series | Any:
-        result = super().median(
-            axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
-        )
-        if isinstance(result, Series):
-            result = result.__finalize__(self, method="median")
-        return result
+            The default behaviour (None) depends on the return value of the
+            applied function: list-like results will be returned as a Series
+            of those. However if the apply function returns a Series these
+            are expanded to columns.
+        args : tuple
+            Positional arguments to pass to `func` in addition to the
+            array/series.
+        by_row : False or "compat", default "compat"
+            Only has an effect when ``func`` is a listlike or dictlike of funcs
+            and the func isn't a string.
+            If "compat", will if possible first translate the func into pandas
+            methods (e.g. ``Series().apply(np.sum)`` will be translated to
+            ``Series().sum()``). If that doesn't work, will try call to apply again with
+            ``by_row=True`` and if that fails, will call apply again with
+            ``by_row=False`` (backward compatible).
+            If False, the funcs will be passed the whole Series at once.

-    # error: Signature of "sem" incompatible with supertype "NDFrame"
-    @overload  # type: ignore[override]
-    def sem(
-        self,
-        *,
-        axis: Axis = ...,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series: ...
+            .. versionadded:: 2.1.0

-    @overload
-    def sem(
-        self,
-        *,
-        axis: None,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Any: ...
+        engine : {'python', 'numba'}, default 'python'
+            Choose between the python (default) engine or the numba engine in apply.

-    @overload
-    def sem(
-        self,
-        *,
-        axis: Axis | None,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series | Any: ...
+            The numba engine will attempt to JIT compile the passed function,
+            which may result in speedups for large DataFrames.
+            It also supports the following engine_kwargs :

-    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="sem")
-    def sem(
-        self,
-        axis: Axis | None = 0,
-        skipna: bool = True,
-        ddof: int = 1,
-        numeric_only: bool = False,
-        **kwargs,
-    ) -> Series | Any:
-        """
-        Return unbiased standard error of the mean over requested axis.
+            - nopython (compile the function in nopython mode)
+            - nogil (release the GIL inside the JIT compiled function)
+            - parallel (try to apply the function in parallel over the DataFrame)

-        Normalized by N-1 by default. This can be changed using the ddof argument
+            Note: Due to limitations within numba/how pandas interfaces with numba,
+            you should only use this if raw=True

-        Parameters
-        ----------
-        axis : {index (0), columns (1)}
-            For `Series` this parameter is unused and defaults to 0.
+            Note: The numba compiler only supports a subset of
+            valid Python/numpy operations.

-            .. warning::
+            Please read more about the `supported python features
+            <https://numba.readthedocs.io/en/stable/reference/pysupported.html>`_
+            and `supported numpy features
+            <https://numba.readthedocs.io/en/stable/reference/numpysupported.html>`_
+            in numba to learn what you can or cannot use in the passed function.

-                The behavior of DataFrame.sem with ``axis=None`` is deprecated,
-                in a future version this will reduce over both axes and return a scalar
-                To retain the old behavior, pass axis=0 (or do not pass axis).
+            .. versionadded:: 2.2.0

-        skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
-        ddof : int, default 1
-            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
-            where N represents the number of elements.
-        numeric_only : bool, default False
-            Include only float, int, boolean columns. Not implemented for Series.
-        **kwargs :
-            Additional keywords passed.
+        engine_kwargs : dict
+            Pass keyword arguments to the engine.
+            This is currently only used by the numba engine,
+            see the documentation for the engine argument for more information.
+        **kwargs
+            Additional keyword arguments to pass as keywords arguments to
+            `func`.

         Returns
         -------
-        Series or DataFrame (if level specified)
-            Unbiased standard error of the mean over requested axis.
+        Series or DataFrame
+            Result of applying ``func`` along the given axis of the
+            DataFrame.

         See Also
         --------
-        DataFrame.var : Return unbiased variance over requested axis.
-        DataFrame.std : Returns sample standard deviation over requested axis.
+        DataFrame.map: For elementwise operations.
+        DataFrame.aggregate: Only perform aggregating type operations.
+        DataFrame.transform: Only perform transforming type operations.
+
+        Notes
+        -----
+        Functions that mutate the passed object can produce unexpected
+        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
+        for more details.

         Examples
         --------
-        >>> s = pd.Series([1, 2, 3])
-        >>> s.sem().round(6)
-        0.57735
+        >>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
+        >>> df
+           A  B
+        0  4  9
+        1  4  9
+        2  4  9

-        With a DataFrame
+        Using a numpy universal function (in this case the same as
+        ``np.sqrt(df)``):

-        >>> df = pd.DataFrame({"a": [1, 2], "b": [2, 3]}, index=["tiger", "zebra"])
-        >>> df
-               a  b
-        tiger  1  2
-        zebra  2  3
-        >>> df.sem()
-        a    0.5
-        b    0.5
-        dtype: float64
+        >>> df.apply(np.sqrt)
+             A    B
+        0  2.0  3.0
+        1  2.0  3.0
+        2  2.0  3.0

-        Using axis=1
+        Using a reducing function on either axis

-        >>> df.sem(axis=1)
-        tiger    0.5
-        zebra    0.5
-        dtype: float64
+        >>> df.apply(np.sum, axis=0)
+        A    12
+        B    27
+        dtype: int64

-        In this case, `numeric_only` should be set to `True`
-        to avoid getting an error.
+        >>> df.apply(np.sum, axis=1)
+        0    13
+        1    13
+        2    13
+        dtype: int64

-        >>> df = pd.DataFrame({"a": [1, 2], "b": ["T", "Z"]}, index=["tiger", "zebra"])
-        >>> df.sem(numeric_only=True)
-        a    0.5
-        dtype: float64
-        """
-        result = super().sem(
-            axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs
-        )
-        if isinstance(result, Series):
-            result = result.__finalize__(self, method="sem")
-        return result
+        Returning a list-like will result in a Series

-    # error: Signature of "var" incompatible with supertype "NDFrame"
-    @overload  # type: ignore[override]
-    def var(
-        self,
-        *,
-        axis: Axis = ...,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series: ...
+        >>> df.apply(lambda x: [1, 2], axis=1)
+        0    [1, 2]
+        1    [1, 2]
+        2    [1, 2]
+        dtype: object
+        Passing ``result_type='expand'`` will expand list-like results
+        to columns of a Dataframe

-    @overload
-    def var(
-        self,
-        *,
-        axis: None,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Any: ...
+        >>> df.apply(lambda x: [1, 2], axis=1, result_type="expand")
+           0  1
+        0  1  2
+        1  1  2
+        2  1  2

-    @overload
-    def var(
-        self,
-        *,
-        axis: Axis | None,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series | Any: ...
+        Returning a Series inside the function is similar to passing
+        ``result_type='expand'``. The resulting column names
+        will be the Series index.

-    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="var")
-    def var(
-        self,
-        axis: Axis | None = 0,
-        skipna: bool = True,
-        ddof: int = 1,
-        numeric_only: bool = False,
-        **kwargs,
-    ) -> Series | Any:
-        """
-        Return unbiased variance over requested axis.
+        >>> df.apply(lambda x: pd.Series([1, 2], index=["foo", "bar"]), axis=1)
+           foo  bar
+        0    1    2
+        1    1    2
+        2    1    2

-        Normalized by N-1 by default. This can be changed using the ddof argument.

-        Parameters
-        ----------
-        axis : {index (0), columns (1)}
-            For `Series` this parameter is unused and defaults to 0.

-            .. warning::

-                The behavior of DataFrame.var with ``axis=None`` is deprecated,
-                in a future version this will reduce over both axes and return a scalar
-                To retain the old behavior, pass axis=0 (or do not pass axis).
+        Passing ``result_type='broadcast'`` will ensure the same shape
+        result, whether list-like or scalar is returned by the function,
+        and broadcast it along the axis. The resulting column names will
+        be the originals.

-        skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
-        ddof : int, default 1
-            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
-            where N represents the number of elements.
-        numeric_only : bool, default False
-            Include only float, int, boolean columns. Not implemented for Series.
-        **kwargs :
-            Additional keywords passed.
+        >>> df.apply(lambda x: [1, 2], axis=1, result_type="broadcast")
+           A  B
+        0  1  2
+        1  1  2
+        2  1  2
+        """

-        Returns
-        -------
-        Series or scalar
-            Unbiased variance over requested axis.
+        from pandas.core.apply import frame_apply

-        See Also
-        --------
-        numpy.var : Equivalent function in NumPy.
-        Series.var : Return unbiased variance over Series values.
-        Series.std : Return standard deviation over Series values.
-        DataFrame.std : Return standard deviation of the values over
-            the requested axis.

-        Examples
-        --------
-        >>> df = pd.DataFrame(
-        ...     {
-        ...         "person_id": [0, 1, 2, 3],
-        ...         "age": [21, 25, 62, 43],
-        ...         "height": [1.61, 1.87, 1.49, 2.01],
-        ...     }
-        ... ).set_index("person_id")
-        >>> df
-                   age  height
-        person_id
-        0           21    1.61
-        1           25    1.87
-        2           62    1.49
-        3           43    2.01

-        >>> df.var()
-        age       352.916667
-        height      0.056367
-        dtype: float64
+        op = frame_apply(
+            self,
+            func=func,
+            axis=axis,
+            raw=raw,
+            result_type=result_type,
+            by_row=by_row,
+            engine=engine,
+            engine_kwargs=engine_kwargs,
+            args=args,
+            kwargs=kwargs,

-        Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:

-        >>> df.var(ddof=0)
-        age       264.687500
-        height      0.042275
-        dtype: float64
-        """
-        result = super().var(
-            axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs
         )
-        if isinstance(result, Series):
-            result = result.__finalize__(self, method="var")
-        return result
+        return op.apply().__finalize__(self, method="apply")
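Aside (not part of the patch): the three `result_type` modes from the docstring above, in one runnable sketch:

import pandas as pd

df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
# One function, three shapes depending on result_type:
print(df.apply(lambda x: [1, 2], axis=1))                           # Series of lists
print(df.apply(lambda x: [1, 2], axis=1, result_type="expand"))     # columns 0, 1
print(df.apply(lambda x: [1, 2], axis=1, result_type="broadcast"))  # keeps A, B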
-    # error: Signature of "std" incompatible with supertype "NDFrame"
-    @overload  # type: ignore[override]
-    def std(
-        self,
-        *,
-        axis: Axis = ...,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series: ...
+    def map(
+        self, func: PythonFuncType, na_action: str | None = None, **kwargs
+    ) -> DataFrame:
+        """
+        Apply a function to a Dataframe elementwise.

-    @overload
-    def std(
-        self,
-        *,
-        axis: None,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Any: ...
+        .. versionadded:: 2.1.0

-    @overload
-    def std(
-        self,
-        *,
-        axis: Axis | None,
-        skipna: bool = ...,
-        ddof: int = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series | Any: ...
+           DataFrame.applymap was deprecated and renamed to DataFrame.map.

-    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="std")
-    def std(
-        self,
-        axis: Axis | None = 0,
-        skipna: bool = True,
-        ddof: int = 1,
-        numeric_only: bool = False,
-        **kwargs,
-    ) -> Series | Any:
-        """
-        Return sample standard deviation over requested axis.

-        Normalized by N-1 by default. This can be changed using the ddof argument.

-        Parameters
-        ----------
-        axis : {index (0), columns (1)}
-            For `Series` this parameter is unused and defaults to 0.

-            .. warning::

-                The behavior of DataFrame.std with ``axis=None`` is deprecated,
-                in a future version this will reduce over both axes and return a scalar
-                To retain the old behavior, pass axis=0 (or do not pass axis).

-        skipna : bool, default True
-            Exclude NA/null values. If an entire row/column is NA, the result
-            will be NA.
-        ddof : int, default 1
-            Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
-            where N represents the number of elements.
-        numeric_only : bool, default False
-            Include only float, int, boolean columns. Not implemented for Series.
-        **kwargs : dict
-            Additional keyword arguments to be passed to the function.
+        Parameters
+        ----------
+        func : callable
+            Python function, returns a single value from a single value.
+        na_action : {None, 'ignore'}, default None
+            If 'ignore', propagate NaN values, without passing them to func.
+        **kwargs
+            Additional keyword arguments to pass as keywords arguments to
+            `func`.

         Returns
         -------
-        Series or scalar
-            Standard deviation over requested axis.
+        DataFrame
+            Transformed DataFrame.

         See Also
         --------
-        Series.std : Return standard deviation over Series values.
-        DataFrame.mean : Return the mean of the values over the requested axis.
-        DataFrame.median : Return the median of the values over the requested axis.
-        DataFrame.mode : Get the mode(s) of each element along the requested axis.
-        DataFrame.sum : Return the sum of the values over the requested axis.
+        DataFrame.apply : Apply a function along input axis of DataFrame.
+        DataFrame.replace: Replace values given in `to_replace` with `value`.
+        Series.map : Apply a function elementwise on a Series.

-        Notes
-        -----
-        To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
-        default `ddof=1`)
+        Examples
+        --------
+        >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
+        >>> df
+               0      1
+        0  1.000  2.120
+        1  3.356  4.567

-        Examples
-        --------
-        >>> df = pd.DataFrame(
-        ...     {
-        ...         "person_id": [0, 1, 2, 3],
-        ...         "age": [21, 25, 62, 43],
-        ...         "height": [1.61, 1.87, 1.49, 2.01],
-        ...     }
-        ... ).set_index("person_id")
-        >>> df
-                   age  height
-        person_id
-        0           21    1.61
-        1           25    1.87
-        2           62    1.49
-        3           43    2.01
+        >>> df.map(lambda x: len(str(x)))
+           0  1
+        0  3  4
+        1  5  5

-        The standard deviation of the columns can be found as follows:
+        Like Series.map, NA values can be ignored:

-        >>> df.std()
-        age       18.786076
-        height     0.237417
-        dtype: float64
+        >>> df_copy = df.copy()
+        >>> df_copy.iloc[0, 0] = pd.NA
+        >>> df_copy.map(lambda x: len(str(x)), na_action="ignore")
+             0  1
+        0  NaN  4
+        1  5.0  5

-        Alternatively, `ddof=0` can be set to normalize by N instead of N-1:
+        It is also possible to use `map` with functions that are not
+        `lambda` functions:

-        >>> df.std(ddof=0)
-        age       16.269219
-        height     0.205609
-        dtype: float64
-        """
-        result = super().std(
-            axis=axis, skipna=skipna, ddof=ddof, numeric_only=numeric_only, **kwargs
-        )
-        if isinstance(result, Series):
-            result = result.__finalize__(self, method="std")
-        return result

-    # error: Signature of "skew" incompatible with supertype "NDFrame"
-    @overload  # type: ignore[override]
-    def skew(
-        self,
-        *,
-        axis: Axis = ...,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series: ...

-    @overload
-    def skew(
-        self,
-        *,
-        axis: None,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Any: ...

-    @overload
-    def skew(
-        self,
-        *,
-        axis: Axis | None,
-        skipna: bool = ...,
-        numeric_only: bool = ...,
-        **kwargs,
-    ) -> Series | Any: ...
+        >>> df.map(round, ndigits=1)
+             0    1
+        0  1.0  2.1
+        1  3.4  4.6

-    @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="skew")
-    def skew(
-        self,
-        axis: Axis | None = 0,
-        skipna: bool = True,
-        numeric_only: bool = False,
-        **kwargs,
-    ) -> Series | Any:
-        """
-        Return unbiased skew over requested axis.
+        Note that a vectorized version of `func` often exists, which will
+        be much faster. You could square each number elementwise.

-        Normalized by N-1.

-        Parameters
-        ----------
-        axis : {index (0), columns (1)}
-            Axis for the function to be applied on.
-            For `Series` this parameter is unused and defaults to 0.

-            For DataFrames, specifying ``axis=None`` will apply the aggregation
-            across both axes.

-            .. versionadded:: 2.0.0
+        >>> df.map(lambda x: x**2)
+                   0          1
+        0   1.000000   4.494400
+        1  11.262736  20.857489

-        skipna : bool, default True
-            Exclude NA/null values when computing the result.
-        numeric_only : bool, default False
-            Include only float, int, boolean columns.
+        But it's better to avoid map in that case.

-        **kwargs
-            Additional keyword arguments to be passed to the function.
+        >>> df**2
+                   0          1
+        0   1.000000   4.494400
+        1  11.262736  20.857489
+        """
+        if na_action not in {"ignore", None}:
+            raise ValueError(f"na_action must be 'ignore' or None. Got {na_action!r}")

-        Returns
-        -------
-        Series or scalar
-            Unbiased skew over requested axis.

-        See Also
-        --------
-        Dataframe.kurt : Returns unbiased kurtosis over requested axis.

-        Examples
-        --------
-        >>> s = pd.Series([1, 2, 3])
-        >>> s.skew()
-        0.0

-        With a DataFrame

-        >>> df = pd.DataFrame(
-        ...     {"a": [1, 2, 3], "b": [2, 3, 4], "c": [1, 3, 5]},
-        ...     index=["tiger", "zebra", "cow"],
-        ... )
-        >>> df
-               a  b  c
-        tiger  1  2  1
-        zebra  2  3  3
-        cow    3  4  5
-        >>> df.skew()
-        a    0.0
-        b    0.0
-        c    0.0
-        dtype: float64
+        func = functools.partial(func, **kwargs)

-        Using axis=1

-        >>> df.skew(axis=1)
-        tiger     1.732051
-        zebra    -1.732051
-        cow       0.000000
-        dtype: float64
+        def infer(x):
+            return x._map_values(func, na_action=na_action)

-        In this case, `numeric_only` should be set to `True` to avoid
-        getting an error.
+        return self.apply(infer).__finalize__(self, "map")
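Aside (not part of the patch): `na_action` and the "prefer a vectorized equivalent" advice from the docstring above, as a runnable sketch with invented data:

import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0, np.nan, 9.0]})  # invented toy frame
# na_action="ignore" propagates NaN instead of passing it to func
print(df.map(lambda v: v ** 0.5, na_action="ignore"))
# Prefer the vectorized equivalent when one exists:
print(np.sqrt(df))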
- >>> df = pd.DataFrame(
- ... {"a": [1, 2, 3], "b": ["T", "Z", "X"]}, index=["tiger", "zebra", "cow"]
- ... )
- >>> df.skew(numeric_only=True)
- a 0.0
- dtype: float64
- """
- result = super().skew(
- axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
- )
- if isinstance(result, Series):
- result = result.__finalize__(self, method="skew")
- return result
+ # ----------------------------------------------------------------------
+ # Merging / joining methods
- # error: Signature of "kurt" incompatible with supertype "NDFrame"
- @overload # type: ignore[override]
- def kurt(
- self,
- *,
- axis: Axis = ...,
- skipna: bool = ...,
- numeric_only: bool = ...,
- **kwargs,
- ) -> Series: ...
- @overload
- def kurt(
- self,
- *,
- axis: None,
- skipna: bool = ...,
- numeric_only: bool = ...,
- **kwargs,
- ) -> Any: ...
- @overload
- def kurt(
- self,
- *,
- axis: Axis | None,
- skipna: bool = ...,
- numeric_only: bool = ...,
- **kwargs,
- ) -> Series | Any: ...
- @deprecate_nonkeyword_arguments(version="4.0", allowed_args=["self"], name="kurt")
- def kurt(
- self,
- axis: Axis | None = 0,
- skipna: bool = True,
- numeric_only: bool = False,
- **kwargs,
- ) -> Series | Any:
- """
- Return unbiased kurtosis over requested axis.
-
- Kurtosis obtained using Fisher's definition of
- kurtosis (kurtosis of normal == 0.0). Normalized by N-1.
- Parameters
- ----------
- axis : {index (0), columns (1)}
- Axis for the function to be applied on.
- For `Series` this parameter is unused and defaults to 0.
+ def _append(
+ self,
+ other,
+ ignore_index: bool = False,
+ verify_integrity: bool = False,
+ sort: bool = False,
+ ) -> DataFrame:
+ if isinstance(other, (Series, dict)):
+ if isinstance(other, dict):
+ if not ignore_index:
+ raise TypeError("Can only append a dict if ignore_index=True")
+ other = Series(other)
+ if other.name is None and not ignore_index:
+ raise TypeError(
+ "Can only append a Series if ignore_index=True "
+ "or if the Series has a name"
+ )
+
+ index = Index(
+ [other.name],
+ name=self.index.names
+ if isinstance(self.index, MultiIndex)
+ else self.index.name,
+ )
+ row_df = other.to_frame().T
+ # infer_objects is needed for
+ # test_append_empty_frame_to_series_with_dateutil_tz
+ other = row_df.infer_objects().rename_axis(index.names)
+ elif isinstance(other, list):
+ if not other:
+ pass
+ elif not isinstance(other[0], DataFrame):
+ other = DataFrame(other)
+ if self.index.name is not None and not ignore_index:
+ other.index.name = self.index.name
- For DataFrames, specifying ``axis=None`` will apply the aggregation
- across both axes.
+ from pandas.core.reshape.concat import concat
- .. versionadded:: 2.0.0
+ if isinstance(other, (list, tuple)):
+ to_concat = [self, *other]
+ else:
+ to_concat = [self, other]
- skipna : bool, default True
- Exclude NA/null values when computing the result.
- numeric_only : bool, default False
- Include only float, int, boolean columns.
- **kwargs
- Additional keyword arguments to be passed to the function.
- Returns
- -------
- Series or scalar
- Unbiased kurtosis over requested axis.
- See Also
- --------
- Dataframe.kurtosis : Returns unbiased kurtosis over requested axis.
- Examples
- --------
- >>> s = pd.Series([1, 2, 2, 3], index=["cat", "dog", "dog", "mouse"])
- >>> s
- cat 1
- dog 2
- dog 2
- mouse 3
- dtype: int64
- >>> s.kurt()
- 1.5
+ result = concat(
+ to_concat,
+ ignore_index=ignore_index,
+ verify_integrity=verify_integrity,
+ sort=sort,
+ )
+ return result.__finalize__(self, method="append")
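The `_append` path above reduces to a plain concatenation; a minimal user-level sketch of the equivalent behavior (hypothetical frames, not part of the diff):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    row = pd.Series({"a": 3}, name=2)

    # Appending a named Series is equivalent to concatenating it as a row
    out = pd.concat([df, row.to_frame().T])
    print(out)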
- With a DataFrame
+ def join(
+ self,
+ other: DataFrame | Series | Iterable[DataFrame | Series],
+ on: IndexLabel | None = None,
+ how: MergeHow = "left",
+ lsuffix: str = "",
+ rsuffix: str = "",
+ sort: bool = False,
+ validate: JoinValidate | None = None,
+ ) -> DataFrame:
+ """
+ Join columns of another DataFrame.
- >>> df = pd.DataFrame(
- ... {"a": [1, 2, 2, 3], "b": [3, 4, 4, 4]},
- ... index=["cat", "dog", "dog", "mouse"],
- ... )
- >>> df
- a b
- cat 1 3
- dog 2 4
- dog 2 4
- mouse 3 4
- >>> df.kurt()
- a 1.5
- b 4.0
- dtype: float64
+ Join columns with `other` DataFrame either on index or on a key
+ column. Efficiently join multiple DataFrame objects by index at once by
+ passing a list.
- With axis=None
-
- >>> df.kurt(axis=None).round(6)
- -0.988693
-
- Using axis=1
-
- >>> df = pd.DataFrame(
- ... {"a": [1, 2], "b": [3, 4], "c": [3, 4], "d": [1, 2]},
- ... index=["cat", "dog"],
- ... )
- >>> df.kurt(axis=1)
- cat -6.0
- dog -6.0
- dtype: float64
- """
- result = super().kurt(
- axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
- )
- if isinstance(result, Series):
- result = result.__finalize__(self, method="kurt")
- return result
- # error: Incompatible types in assignment
- kurtosis = kurt # type: ignore[assignment]
- product = prod
- @doc(make_doc("cummin", ndim=2))
- def cummin(
- self,
- axis: Axis = 0,
- skipna: bool = True,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ) -> Self:
- data = self._get_numeric_data() if numeric_only else self
- return NDFrame.cummin(data, axis, skipna, *args, **kwargs)
+ Parameters
+ ----------
+ other : DataFrame, Series, or a list containing any combination of them
+ Index should be similar to one of the columns in this one.
+ on : str, list of str, or array-like, optional
+ Column or index level name(s) in the caller to join on the index
+ in `other`, otherwise joins index-on-index.
+ how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
+ How to handle the operation of the two objects.
+
+ * left: use calling frame's index (or column if on is specified)
+ * right: use `other`'s index.
+ * outer: form union of calling frame's index (or column if on is
+ specified) with `other`'s index, and sort it lexicographically.
+ * inner: form intersection of calling frame's index (or column if
+ on is specified) with `other`'s index, preserving the order
+ of the calling's one.
+ * cross: creates the cartesian product from both frames, preserves the order
+ of the left keys.
+ lsuffix : str, default ''
+ Suffix to use from left frame's overlapping columns.
+ rsuffix : str, default ''
+ Suffix to use from right frame's overlapping columns.
+ sort : bool, default False
+ Order result DataFrame lexicographically by the join key. If False,
+ the order of the join key depends on the join type (how keyword).
+ validate : str, optional
+ If specified, checks if join is of specified type.
- @doc(make_doc("cummax", ndim=2))
- def cummax(
- self,
- axis: Axis = 0,
- skipna: bool = True,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ) -> Self:
- data = self._get_numeric_data() if numeric_only else self
- return NDFrame.cummax(data, axis, skipna, *args, **kwargs)
+ * "one_to_one" or "1:1": check if join keys are unique in both left
+ and right datasets.
+ * "one_to_many" or "1:m": check if join keys are unique in left dataset.
+ * "many_to_one" or "m:1": check if join keys are unique in right dataset.
+ * "many_to_many" or "m:m": allowed, but does not result in checks.
- @doc(make_doc("cumsum", ndim=2))
- def cumsum(
- self,
- axis: Axis = 0,
- skipna: bool = True,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ) -> Self:
- data = self._get_numeric_data() if numeric_only else self
- return NDFrame.cumsum(data, axis, skipna, *args, **kwargs)
+ .. versionadded:: 1.5.0
- @doc(make_doc("cumprod", 2))
- def cumprod(
- self,
- axis: Axis = 0,
- skipna: bool = True,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ) -> Self:
- data = self._get_numeric_data() if numeric_only else self
- return NDFrame.cumprod(data, axis, skipna, *args, **kwargs)
- def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
- """
- Count number of distinct elements in specified axis.
-
- Return Series with number of distinct elements. Can ignore NaN
- values.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
- column-wise.
- dropna : bool, default True
- Don't include NaN in the counts.
- Returns
- -------
- Series
- Series with counts of unique values per row or column, depending on `axis`.
+ Returns
+ -------
+ DataFrame
+ A dataframe containing columns from both the caller and `other`.
 See Also
 --------
- Series.nunique: Method nunique for Series.
- DataFrame.count: Count non-NA cells for each column or row.
+ DataFrame.merge : For column(s)-on-column(s) operations.
+
+ Notes
+ -----
+ Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
+ passing a list of `DataFrame` objects.
 Examples
 --------
- >>> df = pd.DataFrame({"A": [4, 5, 6], "B": [4, 1, 1]})
- >>> df.nunique()
- A 3
- B 2
- dtype: int64
+ >>> df = pd.DataFrame(
+ ... {
+ ... "key": ["K0", "K1", "K2", "K3", "K4", "K5"],
+ ... "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
+ ... }
 ... )
- >>> df.nunique(axis=1)
- 0 1
- 1 2
- 2 2
- dtype: int64
- """
- return self.apply(Series.nunique, axis=axis, dropna=dropna)
- def idxmin(
- self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
- ) -> Series:
- """
- Return index of first occurrence of minimum over requested axis.
-
- NA/null values are excluded.
- Parameters
- ----------
- axis : {{0 or 'index', 1 or 'columns'}}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- skipna : bool, default True
- Exclude NA/null values. If the entire DataFrame is NA,
- or if ``skipna=False`` and there is an NA value, this method
- will raise a ``ValueError``.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
- Returns
- -------
- Series
- Indexes of minima along the specified axis.
+ >>> other = pd.DataFrame({"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]})
- Raises
- ------
- ValueError
- * If the row/column is empty
- See Also
- --------
- Series.idxmin : Return index of the minimum element.
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmin``.
- Examples
- --------
- Consider a dataset containing food consumption in Argentina.
+ >>> df.set_index("key").join(other.set_index("key"))
+ A B
+ key
+ K0 A0 B0
+ K1 A1 B1
+ K2 A2 B2
+ K3 A3 NaN
+ K4 A4 NaN
+ K5 A5 NaN
+
+ Another option to join using the key columns is to use the `on`
+ parameter. DataFrame.join always uses `other`'s index but we can
+ use any column in `df`. This method preserves the original DataFrame's
+ index in the result.
+
+ >>> df.join(other.set_index("key"), on="key")
+ key A B
+ 0 K0 A0 B0
+ 1 K1 A1 B1
+ 2 K2 A2 B2
+ 3 K3 A3 NaN
+ 4 K4 A4 NaN
+ 5 K5 A5 NaN
+
+ Using non-unique key values shows how they are matched.
 >>> df = pd.DataFrame(
 ... {
- ... "consumption": [10.51, 103.11, 55.48],
- ... "co2_emissions": [37.2, 19.66, 1712],
- ... },
- ... index=["Pork", "Wheat Products", "Beef"],
+ ... "key": ["K0", "K1", "K1", "K3", "K0", "K1"],
+ ... "A": ["A0", "A1", "A2", "A3", "A4", "A5"],
+ ... }
 ... )
 >>> df
- consumption co2_emissions
- Pork 10.51 37.20
- Wheat Products 103.11 19.66
- Beef 55.48 1712.00
+ key A
+ 0 K0 A0
+ 1 K1 A1
+ 2 K1 A2
+ 3 K3 A3
+ 4 K0 A4
+ 5 K1 A5
- By default, it returns the index for the minimum value in each column.
+ >>> df.join(other.set_index("key"), on="key", validate="m:1")
+ key A B
+ 0 K0 A0 B0
+ 1 K1 A1 B1
+ 2 K1 A2 B1
+ 3 K3 A3 NaN
+ 4 K0 A4 B0
+ 5 K1 A5 B1
+ """
- >>> df.idxmin()
- consumption Pork
- co2_emissions Wheat Products
- dtype: object
-
- To return the index for the minimum value in each row, use ``axis="columns"``.
-
- >>> df.idxmin(axis="columns")
- Pork consumption
- Wheat Products co2_emissions
- Beef consumption
- dtype: object
- """
- axis = self._get_axis_number(axis)
- if self.empty and len(self.axes[axis]):
- axis_dtype = self.axes[axis].dtype
- return self._constructor_sliced(dtype=axis_dtype)
+ if isinstance(other, Series):
+ if other.name is None:
+ raise ValueError("Other Series must have a name")
+ other = DataFrame({other.name: other})
- if numeric_only:
- data = self._get_numeric_data()
+ if isinstance(other, DataFrame):
+ if how == "cross":
+ return merge(
+ self,
+ other,
+ how=how,
+ on=on,
+ suffixes=(lsuffix, rsuffix),
+ sort=sort,
+ validate=validate,
+ )
+ return merge(
+ self,
+ other,
+ left_on=on,
+ how=how,
+ left_index=on is None,
+ right_index=True,
+ suffixes=(lsuffix, rsuffix),
+ sort=sort,
+ validate=validate,
+ )
 else:
- data = self
+ if on is not None:
+ raise ValueError(
+ "Joining multiple DataFrames only supported for joining on index"
+ )
+
+ if rsuffix or lsuffix:
+ raise ValueError(
+ "Suffixes not supported when joining multiple DataFrames"
+ )
+
+ # Mypy thinks the RHS is a
+ # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
+ # the LHS is an "Iterable[DataFrame]", but in reality both types are
+ # "Iterable[Union[DataFrame, Series]]" due to the if statements
+ frames = [cast("DataFrame | Series", self)] + list(other)
+
+ can_concat = all(df.index.is_unique for df in frames)
+
+ # join indexes only using concat
+ if can_concat:
+ if how == "left":
+ res = concat(
+ frames, axis=1, join="outer", verify_integrity=True, sort=sort
+ )
+ return res.reindex(self.index)
+ else:
+ return concat(
+ frames, axis=1, join=how, verify_integrity=True, sort=sort
+ )
+
+ joined = frames[0]
+
+ for frame in frames[1:]:
+ joined = merge(
+ joined,
+ frame,
+ how=how,
+ left_index=True,
+ right_index=True,
+ validate=validate,
+ )
- res = data._reduce(
- nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
- )
- indices = res._values
- # indices will always be np.ndarray since axis is not None
+
+ return joined
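A small sketch exercising the `join` branches added above (index-on-index and `on=` column joins; the data mirrors the docstring examples):

    import pandas as pd

    df = pd.DataFrame({"key": ["K0", "K1", "K2"], "A": ["A0", "A1", "A2"]})
    other = pd.DataFrame({"key": ["K0", "K1"], "B": ["B0", "B1"]})

    # Index-on-index join (left join by default)
    print(df.set_index("key").join(other.set_index("key")))

    # Column-on-index join via the `on` parameter keeps df's original index
    print(df.join(other.set_index("key"), on="key"))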
- if (indices == -1).any():
- warnings.warn(
- f"The behavior of {type(self).__name__}.idxmin with all-NA "
- "values, or any-NA and skipna=False, is deprecated. In a future "
- "version this will raise ValueError",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
+ @Substitution("")
+ @Appender(_merge_doc, indents=2)
+ def merge(
+ self,
+ right: DataFrame | Series,
+ how: MergeHow = "inner",
+ on: IndexLabel | AnyArrayLike | None = None,
+ left_on: IndexLabel | AnyArrayLike | None = None,
+ right_on: IndexLabel | AnyArrayLike | None = None,
+ left_index: bool = False,
+ right_index: bool = False,
+ sort: bool = False,
+ suffixes: Suffixes = ("_x", "_y"),
+ copy: bool | None = None,
+ indicator: str | bool = False,
+ validate: MergeValidate | None = None,
+ ) -> DataFrame:
+ from pandas.core.reshape.merge import merge
- index = data._get_axis(axis)
- result = algorithms.take(
- index._values, indices, allow_fill=True, fill_value=index._na_value
+ return merge(
+ self,
+ right,
+ how=how,
+ on=on,
+ left_on=left_on,
+ right_on=right_on,
+ left_index=left_index,
+ right_index=right_index,
+ sort=sort,
+ suffixes=suffixes,
+ indicator=indicator,
+ validate=validate,
 )
- final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
- return final_result.__finalize__(self, method="idxmin")
- def idxmax(
- self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
- ) -> Series:
+ def round(
+ self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
+ ) -> DataFrame:
 """
- Return index of first occurrence of maximum over requested axis.
-
- NA/null values are excluded.
+ Round a DataFrame to a variable number of decimal places.
 Parameters
 ----------
- axis : {{0 or 'index', 1 or 'columns'}}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- skipna : bool, default True
- Exclude NA/null values. If the entire DataFrame is NA,
- or if ``skipna=False`` and there is an NA value, this method
- will raise a ``ValueError``.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
-
- .. versionadded:: 1.5.0
+ decimals : int, dict, Series
+ Number of decimal places to round each column to. If an int is
+ given, round each column to the same number of places.
+ Otherwise dict and Series round to variable numbers of places.
+ Column names should be in the keys if `decimals` is a
+ dict-like, or in the index if `decimals` is a Series. Any
+ columns not included in `decimals` will be left as is. Elements
+ of `decimals` which are not columns of the input will be
+ ignored.
+ *args
+ Additional keywords have no effect but might be accepted for
+ compatibility with numpy.
+ **kwargs
+ Additional keywords have no effect but might be accepted for
+ compatibility with numpy.
 Returns
 -------
- Series
- Indexes of maxima along the specified axis.
- Raises
- ------
- ValueError
- * If the row/column is empty
+ DataFrame
+ A DataFrame with the affected columns rounded to the specified
+ number of decimal places.
 See Also
 --------
- Series.idxmax : Return index of the maximum element.
+ numpy.around : Round a numpy array to the given number of decimals.
+ Series.round : Round a Series to the given number of decimals.
 Notes
 -----
- This method is the DataFrame version of ``ndarray.argmax``.
+ For values exactly halfway between rounded decimal values, pandas rounds
+ to the nearest even value (e.g. -0.5 and 0.5 round to 0.0, 1.5 and 2.5
+ round to 2.0, etc.).
 Examples
 --------
- Consider a dataset containing food consumption in Argentina.
 >>> df = pd.DataFrame(
- ... {
- ... "consumption": [10.51, 103.11, 55.48],
- ... "co2_emissions": [37.2, 19.66, 1712],
- ... },
- ... index=["Pork", "Wheat Products", "Beef"],
+ ... [(0.21, 0.32), (0.01, 0.67), (0.66, 0.03), (0.21, 0.18)],
+ ... columns=["dogs", "cats"],
 ... )
 >>> df
- consumption co2_emissions
- Pork 10.51 37.20
- Wheat Products 103.11 19.66
- Beef 55.48 1712.00
+ dogs cats
+ 0 0.21 0.32
+ 1 0.01 0.67
+ 2 0.66 0.03
+ 3 0.21 0.18
- By default, it returns the index for the maximum value in each column.
+ By providing an integer each column is rounded to the same number
+ of decimal places
- >>> df.idxmax()
- consumption Wheat Products
- co2_emissions Beef
- dtype: object
+ >>> df.round(1)
+ dogs cats
+ 0 0.2 0.3
+ 1 0.0 0.7
+ 2 0.7 0.0
+ 3 0.2 0.2
- To return the index for the maximum value in each row, use ``axis="columns"``.
+ With a dict, the number of places for specific columns can be
+ specified with the column names as key and the number of decimal
+ places as value
- >>> df.idxmax(axis="columns")
- Pork co2_emissions
- Wheat Products consumption
- Beef co2_emissions
- dtype: object
+ >>> df.round({"dogs": 1, "cats": 0})
+ dogs cats
+ 0 0.2 0.0
+ 1 0.0 1.0
+ 2 0.7 0.0
+ 3 0.2 0.0
+
+ Using a Series, the number of places for specific columns can be
+ specified with the column names as index and the number of
+ decimal places as value
+
+ >>> decimals = pd.Series([0, 1], index=["cats", "dogs"])
+ >>> df.round(decimals)
+ dogs cats
+ 0 0.2 0.0
+ 1 0.0 1.0
+ 2 0.7 0.0
+ 3 0.2 0.0
 """
- axis = self._get_axis_number(axis)
+ from pandas.core.reshape.concat import concat
- if self.empty and len(self.axes[axis]):
- axis_dtype = self.axes[axis].dtype
- return self._constructor_sliced(dtype=axis_dtype)
+ def _dict_round(df: DataFrame, decimals) -> Iterator[Series]:
+ for col, vals in df.items():
+ try:
+ yield _series_round(vals, decimals[col])
+ except KeyError:
+ yield vals
- if numeric_only:
- data = self._get_numeric_data()
- else:
- data = self
+ def _series_round(ser: Series, decimals: int) -> Series:
+ if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
+ return ser.round(decimals)
+ return ser
- res = data._reduce(
- nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
- )
- indices = res._values
- # indices will always be 1d array since axis is not None
+ nv.validate_round(args, kwargs)
In a future " - "version this will raise ValueError", - FutureWarning, - stacklevel=find_stack_level(), + if isinstance(decimals, (dict, Series)): + if isinstance(decimals, Series) and not decimals.index.is_unique: + raise ValueError("Index of decimals must be unique") + if is_dict_like(decimals) and not all( + is_integer(value) for _, value in decimals.items() + ): + raise TypeError("Values in decimals must be integers") + new_cols = list(_dict_round(self, decimals)) + elif is_integer(decimals): + # Dispatch to Block.round + # Argument "decimals" to "round" of "BaseBlockManager" has incompatible + # type "Union[int, integer[Any]]"; expected "int" + new_mgr = self._mgr.round( + decimals=decimals, # type: ignore[arg-type] ) + return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( + self, method="round" + ) + else: + raise TypeError("decimals must be an integer, a dict-like or a Series") - index = data._get_axis(axis) - result = algorithms.take( - index._values, indices, allow_fill=True, fill_value=index._na_value - ) - final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) - return final_result.__finalize__(self, method="idxmax") - - def _get_agg_axis(self, axis_num: int) -> Index: - """ - Let's be explicit about this. - """ - if axis_num == 0: - return self.columns - elif axis_num == 1: - return self.index + if new_cols is not None and len(new_cols) > 0: + return self._constructor( + concat(new_cols, axis=1), index=self.index, columns=self.columns + ).__finalize__(self, method="round") else: - raise ValueError(f"Axis must be 0 or 1 (got {axis_num!r})") + return self.copy(deep=False) + + # ---------------------------------------------------------------------- + # Statistical methods, etc. - def mode( - self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True + def corr( + self, + method: CorrelationMethod = "pearson", + min_periods: int = 1, + numeric_only: bool = False, ) -> DataFrame: """ - Get the mode(s) of each element along the selected axis. - - The mode of a set of values is the value that appears most often. - It can be multiple values. + Compute pairwise correlation of columns, excluding NA/null values. Parameters ---------- - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to iterate over while searching for the mode: - - * 0 or 'index' : get mode of each column - * 1 or 'columns' : get mode of each row. + method : {'pearson', 'kendall', 'spearman'} or callable + Method of correlation: + * pearson : standard correlation coefficient + * kendall : Kendall Tau correlation coefficient + * spearman : Spearman rank correlation + * callable: callable with input two 1d ndarrays + and returning a float. Note that the returned matrix from corr + will have 1 along the diagonals and will be symmetric + regardless of the callable's behavior. + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. Currently only available for Pearson + and Spearman correlation. numeric_only : bool, default False - If True, only apply to numeric columns. - dropna : bool, default True - Don't consider counts of NaN/NaT. + Include only `float`, `int` or `boolean` data. + + .. versionadded:: 1.5.0 + + .. versionchanged:: 2.0.0 + The default value of ``numeric_only`` is now ``False``. Returns ------- DataFrame - The modes of each column or row. + Correlation matrix. See Also -------- - Series.mode : Return the highest frequency value in a Series. 
- def mode(
- self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
+ def corr(
+ self,
+ method: CorrelationMethod = "pearson",
+ min_periods: int = 1,
+ numeric_only: bool = False,
 ) -> DataFrame:
 """
- Get the mode(s) of each element along the selected axis.
-
- The mode of a set of values is the value that appears most often.
- It can be multiple values.
+ Compute pairwise correlation of columns, excluding NA/null values.
 Parameters
 ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to iterate over while searching for the mode:
-
- * 0 or 'index' : get mode of each column
- * 1 or 'columns' : get mode of each row.
+ method : {'pearson', 'kendall', 'spearman'} or callable
+ Method of correlation:
+ * pearson : standard correlation coefficient
+ * kendall : Kendall Tau correlation coefficient
+ * spearman : Spearman rank correlation
+ * callable: callable with input two 1d ndarrays
+ and returning a float. Note that the returned matrix from corr
+ will have 1 along the diagonals and will be symmetric
+ regardless of the callable's behavior.
+ min_periods : int, optional
+ Minimum number of observations required per pair of columns
+ to have a valid result. Currently only available for Pearson
+ and Spearman correlation.
 numeric_only : bool, default False
- If True, only apply to numeric columns.
- dropna : bool, default True
- Don't consider counts of NaN/NaT.
+ Include only `float`, `int` or `boolean` data.
+
+ .. versionadded:: 1.5.0
+
+ .. versionchanged:: 2.0.0
+ The default value of ``numeric_only`` is now ``False``.
 Returns
 -------
 DataFrame
- The modes of each column or row.
+ Correlation matrix.
 See Also
 --------
- Series.mode : Return the highest frequency value in a Series.
- Series.value_counts : Return the counts of values in a Series.
+ DataFrame.corrwith : Compute pairwise correlation with another
+ DataFrame or Series.
+ Series.corr : Compute the correlation between two Series.
+
+ Notes
+ -----
+ Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
+
+ * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
+ * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
+ * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
 Examples
 --------
+ >>> def histogram_intersection(a, b):
+ ... v = np.minimum(a, b).sum().round(decimals=1)
+ ... return v
 >>> df = pd.DataFrame(
- ... [
- ... ("bird", 2, 2),
- ... ("mammal", 4, np.nan),
- ... ("arthropod", 8, 0),
- ... ("bird", 2, np.nan),
- ... ],
- ... index=("falcon", "horse", "spider", "ostrich"),
- ... columns=("species", "legs", "wings"),
+ ... [(0.2, 0.3), (0.0, 0.6), (0.6, 0.0), (0.2, 0.1)],
+ ... columns=["dogs", "cats"],
 ... )
- >>> df
- species legs wings
- falcon bird 2 2.0
- horse mammal 4 NaN
- spider arthropod 8 0.0
- ostrich bird 2 NaN
-
- By default, missing values are not considered, and the mode of wings
- are both 0 and 2. Because the resulting DataFrame has two rows,
- the second row of ``species`` and ``legs`` contains ``NaN``.
+ >>> df.corr(method=histogram_intersection)
+ dogs cats
+ dogs 1.0 0.3
+ cats 0.3 1.0
- >>> df.mode()
- species legs wings
- 0 bird 2.0 0.0
- 1 NaN NaN 2.0
+ >>> df = pd.DataFrame(
+ ... [(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], columns=["dogs", "cats"]
+ ... )
+ >>> df.corr(min_periods=3)
+ dogs cats
+ dogs 1.0 NaN
+ cats NaN 1.0
+ """ # noqa: E501
+ data = self._get_numeric_data() if numeric_only else self
+ cols = data.columns
+ idx = cols.copy()
+ mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
- Setting ``dropna=False``, ``NaN`` values are considered and they can be
- the mode (like for wings).
-
- >>> df.mode(dropna=False)
- species legs wings
- 0 bird 2 NaN
-
- Setting ``numeric_only=True``, only the mode of numeric columns is
- computed, and columns of other types are ignored.
- >>> df.mode(numeric_only=True)
- legs wings
- 0 2.0 0.0
- 1 NaN 2.0
-
- To compute the mode over columns and not rows, use the axis parameter:
-
- >>> df.mode(axis="columns", numeric_only=True)
- 0 1
- falcon 2.0 NaN
- horse 4.0 NaN
- spider 0.0 8.0
- ostrich 2.0 NaN
- """
- data = self if not numeric_only else self._get_numeric_data()
+ if method == "pearson":
+ correl = libalgos.nancorr(mat, minp=min_periods)
+ elif method == "spearman":
+ correl = libalgos.nancorr_spearman(mat, minp=min_periods)
+ elif method == "kendall" or callable(method):
+ if min_periods is None:
+ min_periods = 1
+ mat = mat.T
+ corrf = nanops.get_corr_func(method)
+ K = len(cols)
+ correl = np.empty((K, K), dtype=float)
+ mask = np.isfinite(mat)
+ for i, ac in enumerate(mat):
+ for j, bc in enumerate(mat):
+ if i > j:
+ continue
+ valid = mask[i] & mask[j]
+ if valid.sum() < min_periods:
+ c = np.nan
+ elif i == j:
+ c = 1.0
+ elif not valid.all():
+ c = corrf(ac[valid], bc[valid])
+ else:
+ c = corrf(ac, bc)
+ correl[i, j] = c
+ correl[j, i] = c
+ else:
+ raise ValueError(
+ "method must be either 'pearson', "
+ "'spearman', 'kendall', or a callable, "
+ f"'{method}' was supplied"
+ )
- def f(s):
- return s.mode(dropna=dropna)
+ result = self._constructor(correl, index=idx, columns=cols, copy=False)
+ return result.__finalize__(self, method="corr")
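A sketch of the `corr` entry points above, including a callable method (the histogram intersection mirrors the docstring example; the data is assumed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"dogs": [0.2, 0.0, 0.6, 0.2], "cats": [0.3, 0.6, 0.0, 0.1]})

    print(df.corr())                   # pearson (default)
    print(df.corr(method="spearman"))  # rank correlation

    def histogram_intersection(a, b):
        return np.minimum(a, b).sum().round(decimals=1)

    print(df.corr(method=histogram_intersection))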
- data = data.apply(f, axis=axis)
- # Ensure index is type stable (should always use int index)
- if data.empty:
- data.index = default_index(0)
-
- return data
+ def cov(
+ self,
+ min_periods: int | None = None,
+ ddof: int | None = 1,
+ numeric_only: bool = False,
+ ) -> DataFrame:
+ """
+ Compute pairwise covariance of columns, excluding NA/null values.
- @overload
- def quantile(
- self,
- q: float = ...,
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- method: Literal["single", "table"] = ...,
- ) -> Series: ...
- @overload
- def quantile(
- self,
- q: AnyArrayLike | Sequence[float],
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- method: Literal["single", "table"] = ...,
- ) -> Series | DataFrame: ...
- @overload
- def quantile(
- self,
- q: float | AnyArrayLike | Sequence[float] = ...,
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- method: Literal["single", "table"] = ...,
- ) -> Series | DataFrame: ...
- def quantile(
- self,
- q: float | AnyArrayLike | Sequence[float] = 0.5,
- axis: Axis = 0,
- numeric_only: bool = False,
- interpolation: QuantileInterpolation = "linear",
- method: Literal["single", "table"] = "single",
- ) -> Series | DataFrame:
- """
- Return values at the given quantile over requested axis.
+ Parameters
+ ----------
+ min_periods : int, optional
+ Minimum number of observations required per pair of columns
+ to have a valid result.
+ ddof : int, default 1
+ Delta degrees of freedom. The divisor used in calculations
+ is ``N - ddof``, where ``N`` represents the number of elements.
+ This argument is applicable only when no ``nan`` is in the dataframe.
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
- Value between 0 <= q <= 1, the quantile(s) to compute.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
 numeric_only : bool, default False
 Include only `float`, `int` or `boolean` data.
+ .. versionadded:: 1.5.0
+
 .. versionchanged:: 2.0.0
 The default value of ``numeric_only`` is now ``False``.
- interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- This optional parameter specifies the interpolation method to use,
- when the desired quantile lies between two data points `i` and `j`:
-
- * linear: `i + (j - i) * fraction`, where `fraction` is the
- fractional part of the index surrounded by `i` and `j`.
- * lower: `i`.
- * higher: `j`.
- * nearest: `i` or `j` whichever is nearest.
- * midpoint: (`i` + `j`) / 2.
- method : {'single', 'table'}, default 'single'
- Whether to compute quantiles per-column ('single') or over all columns
- ('table'). When 'table', the only allowed interpolation methods are
- 'nearest', 'lower', and 'higher'.
+ .. deprecated:: 3.0.0
 Returns
 -------
+ DataFrame
+ The covariance matrix of the series of the DataFrame.
- Series or DataFrame
- If ``q`` is an array, a DataFrame will be returned where the
- index is ``q``, the columns are the columns of self, and the
- values are the quantiles.
- If ``q`` is a float, a Series will be returned where the
- index is the columns of self and the values are the quantiles.
- See Also
- --------
- core.window.rolling.Rolling.quantile: Rolling quantile.
- numpy.percentile: Numpy function to compute the percentile.
 Examples
 --------
 >>> df = pd.DataFrame(
- ... np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), columns=["a", "b"]
+ ... [(1, 2), (0, 3), (2, 0), (1, 1)], columns=["dogs", "cats"]
 ... )
- >>> df.quantile(0.1)
- a 1.3
- b 3.7
- Name: 0.1, dtype: float64
- >>> df.quantile([0.1, 0.5])
- a b
- 0.1 1.3 3.7
- 0.5 2.5 55.0
-
- Specifying `method='table'` will compute the quantile over all columns.
-
- >>> df.quantile(0.1, method="table", interpolation="nearest")
- a 1
- b 1
- Name: 0.1, dtype: int64
- >>> df.quantile([0.1, 0.5], method="table", interpolation="nearest")
- a b
- 0.1 1 1
- 0.5 3 100
-
- Specifying `numeric_only=False` will also compute the quantile of
- datetime and timedelta data.
+ >>> df.cov()
+ dogs cats
+ dogs 0.666667 -1.000000
+ cats -1.000000 1.666667
+ >>> np.random.seed(42)
 >>> df = pd.DataFrame(
- ... {
- ... "A": [1, 2],
- ... "B": [pd.Timestamp("2010"), pd.Timestamp("2011")],
- ... "C": [pd.Timedelta("1 days"), pd.Timedelta("2 days")],
- ... }
+ ... np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]
 ... )
- >>> df.quantile(0.5, numeric_only=False)
- A 1.5
- B 2010-07-02 12:00:00
- C 1 days 12:00:00
- Name: 0.5, dtype: object
- """
- validate_percentile(q)
- axis = self._get_axis_number(axis)
-
- if not is_list_like(q):
- # BlockManager.quantile expects listlike, so we wrap and unwrap here
- # error: List item 0 has incompatible type "float | ExtensionArray |
- # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float"
- res_df = self.quantile(
- [q], # type: ignore[list-item]
- axis=axis,
- numeric_only=numeric_only,
- interpolation=interpolation,
- method=method,
- )
- if method == "single":
- res = res_df.iloc[0]
- else:
- # cannot directly iloc over sparse arrays
- res = res_df.T.iloc[:, 0]
- if axis == 1 and len(self) == 0:
- # GH#41544 try to get an appropriate dtype
- dtype = find_common_type(list(self.dtypes))
- if needs_i8_conversion(dtype):
- return res.astype(dtype)
- return res
-
- q = Index(q, dtype=np.float64)
- data = self._get_numeric_data() if numeric_only else self
-
- if axis == 1:
- data = data.T
-
- if len(data.columns) == 0:
- # GH#23925 _get_numeric_data may have dropped all columns
- cols = self.columns[:0]
-
- dtype = np.float64
- if axis == 1:
- # GH#41544 try to get an appropriate dtype
- cdtype = find_common_type(list(self.dtypes))
- if needs_i8_conversion(cdtype):
- dtype = cdtype
-
- res = self._constructor([], index=q, columns=cols, dtype=dtype)
- return res.__finalize__(self, method="quantile")
-
- valid_method = {"single", "table"}
- if method not in valid_method:
- raise ValueError(
- f"Invalid method: {method}. Method must be in {valid_method}."
- )
- if method == "single":
- res = data._mgr.quantile(qs=q, interpolation=interpolation)
- elif method == "table":
- valid_interpolation = {"nearest", "lower", "higher"}
- if interpolation not in valid_interpolation:
- raise ValueError(
- f"Invalid interpolation: {interpolation}. "
" - f"Interpolation must be in {valid_interpolation}" - ) - # handle degenerate case - if len(data) == 0: - if data.ndim == 2: - dtype = find_common_type(list(self.dtypes)) - else: - dtype = self.dtype - return self._constructor([], index=q, columns=data.columns, dtype=dtype) - - q_idx = np.quantile(np.arange(len(data)), q, method=interpolation) - - by = data.columns - if len(by) > 1: - keys = [data._get_label_or_level_values(x) for x in by] - indexer = lexsort_indexer(keys) - else: - k = data._get_label_or_level_values(by[0]) - indexer = nargsort(k) + >>> df.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 - res = data._mgr.take(indexer[q_idx], verify=False) - res.axes[1] = q + **Minimum number of periods** - result = self._constructor_from_mgr(res, axes=res.axes) - return result.__finalize__(self, method="quantile") + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: - def to_timestamp( - self, - freq: Frequency | None = None, - how: ToTimestampHow = "start", - axis: Axis = 0, - copy: bool | lib.NoDefault = lib.no_default, - ) -> DataFrame: + >>> np.random.seed(42) + >>> df = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + >>> df.loc[df.index[:5], "a"] = np.nan + >>> df.loc[df.index[5:10], "b"] = np.nan + >>> df.cov(min_periods=12) + a b c + a 0.316741 NaN -0.150812 + b NaN 1.248003 0.191417 + c -0.150812 0.191417 0.895202 """ - Cast PeriodIndex to DatetimeIndex of timestamps, at *beginning* of period. - - This can be changed to the *end* of the period, by specifying `how="e"`. - - Parameters - ---------- - freq : str, default frequency of PeriodIndex - Desired frequency. - how : {'s', 'e', 'start', 'end'} - Convention for converting period to timestamp; start of period - vs. end. - axis : {0 or 'index', 1 or 'columns'}, default 0 - The axis to convert (the index by default). - copy : bool, default False - If False then underlying input data is not copied. - - .. note:: - The `copy` keyword will change behavior in pandas 3.0. - `Copy-on-Write - `__ - will be enabled by default, which means that all methods with a - `copy` keyword will use a lazy copy mechanism to defer the copy and - ignore the `copy` keyword. The `copy` keyword will be removed in a - future version of pandas. - - You can already get the future behavior and improvements through - enabling copy on write ``pd.options.mode.copy_on_write = True`` - - .. deprecated:: 3.0.0 + idx_diff = result_index.difference(correl.index) - Returns - ------- - DataFrame with DatetimeIndex - DataFrame with the PeriodIndex cast to DatetimeIndex. + if len(idx_diff) > 0: + correl = correl._append( + Series([np.nan] * len(idx_diff), index=idx_diff) + ) - See Also - -------- - DataFrame.to_period: Inverse method to cast DatetimeIndex to PeriodIndex. - Series.to_timestamp: Equivalent method for Series. 
- Examples
- --------
- >>> idx = pd.PeriodIndex(["2023", "2024"], freq="Y")
- >>> d = {"col1": [1, 2], "col2": [3, 4]}
- >>> df1 = pd.DataFrame(data=d, index=idx)
- >>> df1
- col1 col2
- 2023 1 3
- 2024 2 4
-
- The resulting timestamps will be at the beginning of the year in this case
-
- >>> df1 = df1.to_timestamp()
- >>> df1
- col1 col2
- 2023-01-01 1 3
- 2024-01-01 2 4
- >>> df1.index
- DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
+ if notna(mat).all():
+ if min_periods is not None and min_periods > len(mat):
+ base_cov = np.empty((mat.shape[1], mat.shape[1]))
+ base_cov.fill(np.nan)
+ else:
+ base_cov = np.cov(mat.T, ddof=ddof)
+ base_cov = base_cov.reshape((len(cols), len(cols)))
+ else:
+ base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
- Using `freq` which is the offset that the Timestamps will have
+ result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
+ return result.__finalize__(self, method="cov")
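A sketch of `cov`, including the `min_periods` threshold served by the `nancorr` fallback above (values assumed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"dogs": [1.0, 2.0, np.nan, 4.0], "cats": [0.5, np.nan, 2.5, 3.0]})

    print(df.cov())               # pairwise-complete observations
    print(df.cov(min_periods=3))  # pairs below the threshold become NaN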
- >>> df2 = pd.DataFrame(data=d, index=idx)
- >>> df2 = df2.to_timestamp(freq="M")
- >>> df2
- col1 col2
- 2023-01-31 1 3
- 2024-01-31 2 4
- >>> df2.index
- DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
- """
- self._check_copy_deprecation(copy)
- new_obj = self.copy(deep=False)
-
- axis_name = self._get_axis_name(axis)
- old_ax = getattr(self, axis_name)
- if not isinstance(old_ax, PeriodIndex):
- raise TypeError(f"unsupported Type {type(old_ax).__name__}")
-
- new_ax = old_ax.to_timestamp(freq=freq, how=how)
-
- setattr(new_obj, axis_name, new_ax)
- return new_obj
- def to_period(
- self,
- freq: Frequency | None = None,
- axis: Axis = 0,
- copy: bool | lib.NoDefault = lib.no_default,
- ) -> DataFrame:
 """
- Convert DataFrame from DatetimeIndex to PeriodIndex.
-
- Convert DataFrame from DatetimeIndex to PeriodIndex with desired
- frequency (inferred from index if not passed). Either index of columns can be
- converted, depending on `axis` argument.
- Parameters
- ----------
- freq : str, default
- Frequency of the PeriodIndex.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to convert (the index by default).
- copy : bool, default False
- If False then underlying input data is not copied.
-
- .. note::
- The `copy` keyword will change behavior in pandas 3.0.
- `Copy-on-Write
- <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
- will be enabled by default, which means that all methods with a
- `copy` keyword will use a lazy copy mechanism to defer the copy and
- ignore the `copy` keyword. The `copy` keyword will be removed in a
- future version of pandas.
-
- You can already get the future behavior and improvements through
- enabling copy on write ``pd.options.mode.copy_on_write = True``
+ Compute pairwise correlation.
+
+ Pairwise correlation is computed between rows or columns of
+ DataFrame with rows or columns of Series or DataFrame. DataFrames
+ are first aligned along both axes before computing the
+ correlations.
- .. deprecated:: 3.0.0
- Returns
- -------
- DataFrame
- The DataFrame with the converted PeriodIndex.
- See Also
- --------
- Series.to_period: Equivalent method for Series.
- Series.dt.to_period: Convert DateTime column values.
- Examples
- --------
- >>> idx = pd.to_datetime(
- ... [
- ... "2001-03-31 00:00:00",
- ... "2002-05-31 00:00:00",
- ... "2003-08-31 00:00:00",
- ... ]
- ... )
- >>> idx
- DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
- dtype='datetime64[s]', freq=None)
+ Parameters
+ ----------
+ other : DataFrame, Series
+ Object with which to compute correlations.
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
+ column-wise.
+ drop : bool, default False
+ Drop missing indices from result.
+ method : {'pearson', 'kendall', 'spearman'} or callable
+ Method of correlation:
- >>> idx.to_period("M")
- PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
-
- For the yearly frequency
-
- >>> idx.to_period("Y")
- PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]')
- """
- self._check_copy_deprecation(copy)
- new_obj = self.copy(deep=False)
-
- axis_name = self._get_axis_name(axis)
- old_ax = getattr(self, axis_name)
- if not isinstance(old_ax, DatetimeIndex):
- raise TypeError(f"unsupported Type {type(old_ax).__name__}")
-
- new_ax = old_ax.to_period(freq=freq)
+ * pearson : standard correlation coefficient
+ * kendall : Kendall Tau correlation coefficient
+ * spearman : Spearman rank correlation
+ * callable: callable with input two 1d ndarrays
+ and returning a float.
- setattr(new_obj, axis_name, new_ax)
- return new_obj
+ numeric_only : bool, default False
+ Include only `float`, `int` or `boolean` data.
- def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
- """
- Whether each element in the DataFrame is contained in values.
+ .. versionadded:: 1.5.0
- Parameters
- ----------
- values : iterable, Series, DataFrame or dict
- The result will only be true at a location if all the
- labels match. If `values` is a Series, that's the index. If
- `values` is a dict, the keys must be the column names,
- which must match. If `values` is a DataFrame,
- then both the index and column labels must match.
+ .. versionchanged:: 2.0.0
+ The default value of ``numeric_only`` is now ``False``.
 Returns
 -------
- DataFrame
- DataFrame of booleans showing whether each element in the DataFrame
- is contained in values.
+ Series
+ Pairwise correlations.
 See Also
 --------
- DataFrame.eq: Equality test for DataFrame.
- Series.isin: Equivalent method on Series.
- Series.str.contains: Test if pattern or regex is contained within a
- string of a Series or Index.
+ DataFrame.corr : Compute pairwise correlation of columns.
 Notes
 -----
@@ -13745,169 +21075,84 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
 Examples
 --------
- >>> df = pd.DataFrame(
- ... {"num_legs": [2, 4], "num_wings": [2, 0]}, index=["falcon", "dog"]
+ >>> index = ["a", "b", "c", "d", "e"]
+ >>> columns = ["one", "two", "three", "four"]
+ >>> df1 = pd.DataFrame(
+ ... np.arange(20).reshape(5, 4), index=index, columns=columns
 ... )
- >>> df
- num_legs num_wings
- falcon 2 2
- dog 4 0
-
- When ``values`` is a list check whether every value in the DataFrame
- is present in the list (which animals have 0 or 2 legs or wings)
-
- >>> df.isin([0, 2])
- num_legs num_wings
- falcon True True
- dog False True
-
- To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
-
- >>> ~df.isin([0, 2])
- num_legs num_wings
- falcon False False
- dog True False
-
- When ``values`` is a dict, we can pass values to check for each
- column separately:
-
- >>> df.isin({"num_wings": [0, 3]})
- num_legs num_wings
- falcon False False
- dog False True
-
- When ``values`` is a Series or DataFrame the index and column must
- match. Note that 'falcon' does not match based on the number of legs
- in other.
-
- >>> other = pd.DataFrame(
- ... {"num_legs": [8, 3], "num_wings": [0, 2]}, index=["spider", "falcon"]
+ >>> df2 = pd.DataFrame(
+ ... np.arange(16).reshape(4, 4), index=index[:4], columns=columns
 ... )
- >>> df.isin(other)
- num_legs num_wings
- falcon False True
- dog False False
+ >>> df1.corrwith(df2)
+ one 1.0
+ two 1.0
+ three 1.0
+ four 1.0
+ dtype: float64
+
+ >>> df2.corrwith(df1, axis=1)
+ a 1.0
+ b 1.0
+ c 1.0
+ d 1.0
+ e NaN
+ dtype: float64
 """
- if isinstance(values, dict):
- from pandas.core.reshape.concat import concat
+ axis = self._get_axis_number(axis)
+ this = self._get_numeric_data() if numeric_only else self
- values = collections.defaultdict(list, values)
- result = concat(
- (
- self.iloc[:, [i]].isin(values[col])
- for i, col in enumerate(self.columns)
- ),
- axis=1,
- )
- elif isinstance(values, Series):
- if not values.index.is_unique:
- raise ValueError("cannot compute isin with a duplicate axis.")
- result = self.eq(values.reindex_like(self), axis="index")
- elif isinstance(values, DataFrame):
- if not (values.columns.is_unique and values.index.is_unique):
- raise ValueError("cannot compute isin with a duplicate axis.")
- result = self.eq(values.reindex_like(self))
+ if isinstance(other, Series):
+ return this.apply(lambda x: other.corr(x, method=method), axis=axis)
- else:
- if not is_list_like(values):
- raise TypeError(
- "only list-like or dict-like objects are allowed "
- "to be passed to DataFrame.isin(), "
- f"you passed a '{type(values).__name__}'"
- )
+ if numeric_only:
+ other = other._get_numeric_data()
+ left, right = this.align(other, join="inner")
- def isin_(x):
- # error: Argument 2 to "isin" has incompatible type "Union[Series,
- # DataFrame, Sequence[Any], Mapping[Any, Any]]"; expected
- # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], Index,
- # Series], List[Any], range]"
- result = algorithms.isin(
- x.ravel(),
- values, # type: ignore[arg-type]
- )
- return result.reshape(x.shape)
+ if axis == 1:
+ left = left.T
+ right = right.T
- res_mgr = self._mgr.apply(isin_)
- result = self._constructor_from_mgr(
- res_mgr,
- axes=res_mgr.axes,
- )
- return result.__finalize__(self, method="isin")
+ if method == "pearson":
+ # mask missing values
+ left = left + right * 0
+ right = right + left * 0
- # ----------------------------------------------------------------------
- # Add index and columns
- _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
- _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
- **NDFrame._AXIS_TO_AXIS_NUMBER,
- 1: 1,
- "columns": 1,
- }
- _AXIS_LEN = len(_AXIS_ORDERS)
- _info_axis_number: Literal[1] = 1
- _info_axis_name: Literal["columns"] = "columns"
+ # demeaned data
+ ldem = left - left.mean(numeric_only=numeric_only)
+ rdem = right - right.mean(numeric_only=numeric_only)
- index = properties.AxisProperty(
- axis=1,
- doc="""
- The index (row labels) of the DataFrame.
+ num = (ldem * rdem).sum()
+ dom = (
+ (left.count() - 1)
+ * left.std(numeric_only=numeric_only)
+ * right.std(numeric_only=numeric_only)
+ )
- The index of a DataFrame is a series of labels that identify each row.
- The labels can be integers, strings, or any other hashable type. The index
- is used for label-based access and alignment, and can be accessed or
- modified using this attribute.
+ correl = num / dom
- Returns
- -------
- pandas.Index
- The index labels of the DataFrame.
+ elif method in ["kendall", "spearman"] or callable(method):
- See Also
- --------
- DataFrame.columns : The column labels of the DataFrame.
- DataFrame.to_numpy : Convert the DataFrame to a NumPy array.
+ def c(x):
+ return nanops.nancorr(x[0], x[1], method=method)
- Examples
- --------
- >>> df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'],
- ... 'Age': [25, 30, 35],
- ... 'Location': ['Seattle', 'New York', 'Kona']},
- ... index=([10, 20, 30]))
- >>> df.index
- Index([10, 20, 30], dtype='int64')
+ correl = self._constructor_sliced(
+ map(c, zip(left.values.T, right.values.T)),
+ index=left.columns,
+ copy=False,
+ )
- In this example, we create a DataFrame with 3 rows and 3 columns,
- including Name, Age, and Location information. We set the index labels to
- be the integers 10, 20, and 30. We then access the `index` attribute of the
- DataFrame, which returns an `Index` object containing the index labels.
+ else:
+ raise ValueError(
+ f"Invalid method {method} was passed, "
+ "valid methods are: 'pearson', 'kendall', "
+ "'spearman', or callable"
+ )
- >>> df.index = [100, 200, 300]
- >>> df
- Name Age Location
- 100 Alice 25 Seattle
- 200 Bob 30 New York
- 300 Aritra 35 Kona
-
- In this example, we modify the index labels of the DataFrame by assigning
- a new list of labels to the `index` attribute. The DataFrame is then
- updated with the new labels, and the output shows the modified DataFrame.
- """,
- )
- columns = properties.AxisProperty(
- axis=0,
- doc="""
- The column labels of the DataFrame.
-
- This property holds the column names as a pandas ``Index`` object.
- It provides an immutable sequence of column labels that can be
- used for data selection, renaming, and alignment in DataFrame operations.
-
- Returns
- -------
- pandas.Index
- The column labels of the DataFrame.
+ if not drop:
+ # Find non-matching labels along the given axis
+ # and append missing correlations (GH 22375)
+ raxis: AxisInt = 1 if axis == 0 else 0
+ result_index = this._get_axis(raxis).union(other._get_axis(raxis))
+ idx_diff = result_index.difference(correl.index)
+
+ if len(idx_diff) > 0:
+ correl = correl._append(
+ Series([np.nan] * len(idx_diff), index=idx_diff)
+ )
+
+ return correl
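A sketch of `corrwith`, including the `drop=False` tail restored above, which appends NaN for non-matching labels (frames assumed; they mirror the docstring examples):

    import numpy as np
    import pandas as pd

    df1 = pd.DataFrame(np.arange(20).reshape(5, 4), columns=["one", "two", "three", "four"])
    df2 = pd.DataFrame(np.arange(16).reshape(4, 4), columns=["one", "two", "three", "five"])

    # Columns missing on either side come back as NaN unless drop=True
    print(df1.corrwith(df2))
    print(df1.corrwith(df2, drop=True))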
- See Also
- --------
- DataFrame.index: The index (row labels) of the DataFrame.
 DataFrame.axes: Return a list representing the axes of the DataFrame.
 Examples
@@ -13945,119 +21190,97 @@ def _to_dict_of_blocks(self):
 for k, v in mgr.to_iter_dict()
 }
- @property
- def values(self) -> np.ndarray:
+ def count(self, axis: Axis = 0, numeric_only: bool = False) -> Series:
 """
- Return a Numpy representation of the DataFrame.
-
- .. warning::
-
- We recommend using :meth:`DataFrame.to_numpy` instead.
+ Count non-NA cells for each column or row.
- Only the values in the DataFrame will be returned, the axes labels
- will be removed.
+ The values `None`, `NaN`, `NaT`, ``pandas.NA`` are considered NA.
+
+ Parameters
+ ----------
+ axis : {0 or 'index', 1 or 'columns'}, default 0
+ If 0 or 'index' counts are generated for each column.
+ If 1 or 'columns' counts are generated for each row.
+ numeric_only : bool, default False
+ Include only `float`, `int` or `boolean` data.
 Returns
 -------
- numpy.ndarray
- The values of the DataFrame.
+ Series
+ For each column/row the number of non-NA/null entries.
 See Also
 --------
- DataFrame.to_numpy : Recommended alternative to this method.
- DataFrame.index : Retrieve the index labels.
- DataFrame.columns : Retrieving the column names.
-
- Notes
- -----
- The dtype will be a lower-common-denominator dtype (implicit
- upcasting); that is to say if the dtypes (even of numeric types)
- are mixed, the one that accommodates all will be chosen. Use this
- with care if you are not dealing with the blocks.
-
- e.g. If the dtypes are float16 and float32, dtype will be upcast to
- float32. If dtypes are int32 and uint8, dtype will be upcast to
- int32. By :func:`numpy.find_common_type` convention, mixing int64
- and uint64 will result in a float64 dtype.
+ Series.count: Number of non-NA elements in a Series.
+ DataFrame.value_counts: Count unique combinations of columns.
+ DataFrame.shape: Number of DataFrame rows and columns (including NA
+ elements).
+ DataFrame.isna: Boolean same-sized DataFrame showing places of NA
+ elements.
 Examples
 --------
- A DataFrame where all columns are the same type (e.g., int64) results
- in an array of the same type.
+ Constructing DataFrame from a dictionary:
 >>> df = pd.DataFrame(
- ... {"age": [3, 29], "height": [94, 170], "weight": [31, 115]}
+ ... {
+ ... "Person": ["John", "Myla", "Lewis", "John", "Myla"],
+ ... "Age": [24.0, np.nan, 21.0, 33, 26],
+ ... "Single": [False, True, True, True, False],
+ ... }
 ... )
 >>> df
- age height weight
- 0 3 94 31
- 1 29 170 115
- >>> df.dtypes
- age int64
- height int64
- weight int64
- dtype: object
- >>> df.values
- array([[ 3, 94, 31],
- [ 29, 170, 115]])
+ Person Age Single
+ 0 John 24.0 False
+ 1 Myla NaN True
+ 2 Lewis 21.0 True
+ 3 John 33.0 True
+ 4 Myla 26.0 False
+
+ Notice the uncounted NA values:
- A DataFrame with mixed type columns(e.g., str/object, int64, float32)
- results in an ndarray of the broadest type that accommodates these
- mixed types (e.g., object).
+ >>> df.count()
+ Person 5
+ Age 4
+ Single 5
+ dtype: int64
- >>> df2 = pd.DataFrame(
- ... [
- ... ("parrot", 24.0, "second"),
- ... ("lion", 80.5, 1),
- ... ("monkey", np.nan, None),
- ... ],
- ... columns=("name", "max_speed", "rank"),
- ... )
- >>> df2.dtypes
- name object
- max_speed float64
- rank object
- dtype: object
- >>> df2.values
- array([['parrot', 24.0, 'second'],
- ['lion', 80.5, 1],
- ['monkey', nan, None]], dtype=object)
+ Counts for each **row**:
+
+ >>> df.count(axis="columns")
+ 0 3
+ 1 2
+ 2 3
+ 3 3
+ 4 3
+ dtype: int64
 """
- return self._mgr.as_array()
+ axis = self._get_axis_number(axis)
+ if numeric_only:
+ frame = self._get_numeric_data()
+ else:
+ frame = self
-def _from_nested_dict(
- data: Mapping[HashableT, Mapping[HashableT2, T]],
-) -> collections.defaultdict[HashableT2, dict[HashableT, T]]:
- new_data: collections.defaultdict[HashableT2, dict[HashableT, T]] = (
- collections.defaultdict(dict)
- )
- for index, s in data.items():
- for col, v in s.items():
- new_data[col][index] = v
- return new_data
-
-def _reindex_for_setitem(
- value: DataFrame | Series, index: Index
-) -> tuple[ArrayLike, BlockValuesRefs | None]:
- # reindex if necessary
-
- if value.index.equals(index) or not len(index):
- if isinstance(value, Series):
- return value._values, value._references
- return value._values.copy(), None
-
- # GH#4107
- try:
- reindexed_value = value.reindex(index)._values
- except ValueError as err:
- # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
- if not value.index.is_unique:
- # duplicate axis
- raise err
-
- raise TypeError(
- "incompatible index of inserted column with frame index"
- ) from err
- return reindexed_value, None
+ # GH #423
+ if len(frame._get_axis(axis)) == 0:
+ result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
+ else:
+ result = notna(frame).sum(axis=axis)
+
+ return result.astype("int64").__finalize__(self, method="count")
+
+ def kurt(
+ self,
+ *,
+ axis: Axis | None = 0,
+ skipna: bool = True,
+ numeric_only: bool = False,
+ **kwargs,
+ ) -> Series | float:
+ result = super().kurt(
+ axis=axis, skipna=skipna, numeric_only=numeric_only, **kwargs
+ )
+ if isinstance(result, Series):
+ result = result.__finalize__(self, method="kurt")
+ return result
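Finally, a sketch of the `count` method and the thin `kurt` wrapper kept at the end of this section (the data mirrors the count docstring example):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"Person": ["John", "Myla", "Lewis", "John", "Myla"],
         "Age": [24.0, np.nan, 21.0, 33.0, 26.0]}
    )

    print(df.count())                  # per-column non-NA counts (Person 5, Age 4)
    print(df.count(axis="columns"))    # per-row non-NA counts
    print(df.kurt(numeric_only=True))  # Fisher kurtosis; delegates to NDFrame.kurt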