|
| 1 | +from __future__ import annotations |
| 2 | + |
| 3 | +from typing import TYPE_CHECKING, Any |
| 4 | + |
| 5 | +import pandas as pd |
| 6 | +from pandas import eval as _eval |
| 7 | + |
| 8 | +if TYPE_CHECKING: |
| 9 | + from collections.abc import Hashable, Iterator, Mapping, Sequence |
| 10 | + |
| 11 | + from pandas._typing import ArrayLike |
| 12 | + |
| 13 | + |
def _get_cleaned_column_resolvers(
    df: pd.DataFrame | pd.Series, raw: bool = True
) -> dict[Hashable, ArrayLike | pd.Series]:
    """
    Return the special character free column resolvers of a dataframe.

    Column names with special characters are 'cleaned up' so that they can
    be referred to by backtick quoting.
    Used in :func:`fast_eval`, mirroring the helper behind
    :meth:`DataFrame.eval`.

    Parameters
    ----------
    df : DataFrame or Series
        Object whose columns become resolvers. A Series is returned as a
        single resolver keyed by its cleaned name.
    raw : bool, default True
        If True, map each cleaned column name to the object yielded by
        ``df.items()`` without rebuilding it. If False, rebuild one Series
        per column from the column arrays, as pandas itself does.

    Returns
    -------
    dict
        Mapping of cleaned column name to column data. Columns whose label
        is an ``int`` are skipped. When several columns clean to the same
        name, the last one wins.
    """
    from pandas.core.computation.parsing import clean_column_name

    if isinstance(df, pd.Series):
        return {clean_column_name(df.name): df}

    # CHANGED FROM PANDAS: do not rebuild a pd.Series for every column, just
    # hand the compute engine the columns exactly as `df.items()` yields them.
    # This is potentially a breaking change if any of the operations in the
    # eval string depend on how pandas constructs its resolvers.
    if raw:
        # Performance tradeoff: `df.items()` is marginally slower than zipping
        # `df.columns` with `df._iter_column_arrays()`, but the latter is not
        # in pandas' public API and may be removed in the future.
        return {
            clean_column_name(k): v for k, v in df.items() if not isinstance(k, int)
        }

    # CHANGED FROM PANDAS: do not call df.dtypes inside the comprehension loop.
    # This update has been made in https://github.com/pandas-dev/pandas/pull/59573,
    # but appears not to have been released yet as of pandas 2.2.3.
    #
    # The dtypes are walked positionally, in lockstep with the columns, rather
    # than looked up by label: with duplicate column labels `dtypes[k]` is a
    # Series of dtypes, not a single dtype, and the Series constructor fails.
    dtypes = df.dtypes

    return {
        clean_column_name(k): pd.Series(
            v, copy=False, index=df.index, name=k, dtype=dtype
        ).__finalize__(df)
        for k, v, dtype in zip(df.columns, df._iter_column_arrays(), dtypes)
        if not isinstance(k, int)
    }
| 54 | + |
| 55 | + |
def fast_eval(df: pd.DataFrame, expr: str, **kwargs) -> Any | None:
    """
    Evaluate a string describing operations on DataFrame columns.

    Operates on columns only, not specific rows or elements. This allows
    `eval` to run arbitrary code, which can make you vulnerable to code
    injection if you pass user input to this function.

    This function is a wrapper that replaces :meth:`~pandas.DataFrame.eval`
    with a more efficient version than in the default pandas library (as
    of pandas 2.2.3). It is recommended to use this function instead of
    :meth:`~pandas.DataFrame.eval` for better performance. However, if you
    encounter issues with this function, you can switch back to the default
    pandas eval by changing the function call from `fast_eval(df, ...)` to
    `df.eval(...)`.

    Parameters
    ----------
    df : DataFrame
        The frame whose columns and index are made available to `expr`.
        It is also used as the evaluation ``target`` unless one is passed
        explicitly.
    expr : str
        The expression string to evaluate.
    **kwargs
        See the documentation for :meth:`~pandas.DataFrame.eval` for complete
        details on the keyword arguments accepted. ``inplace`` is not
        supported: evaluation always runs with ``inplace=False``.

    Returns
    -------
    Series, or whatever :func:`pandas.eval` returns
        On the first attempt the result is wrapped in a Series that uses
        ``df.index`` as index and `expr` as name. If that attempt raises
        (during evaluation or while wrapping), the expression is
        re-evaluated with pandas-style column resolvers and the result of
        :func:`pandas.eval` is returned unwrapped.

    Raises
    ------
    TypeError
        If ``inplace`` is passed as a keyword argument.
    """
    if "inplace" in kwargs:
        # Forwarding it would collide with the explicit ``inplace=False``
        # below and fail with an opaque "multiple values" TypeError, after
        # the resolvers had already been built twice. Fail early instead.
        raise TypeError(
            "fast_eval() does not support the 'inplace' keyword argument; "
            "use DataFrame.eval for in-place evaluation"
        )

    # Same bookkeeping as DataFrame.eval: `level` is bumped by one to account
    # for this wrapper's own stack frame.
    kwargs["level"] = kwargs.pop("level", 0) + 1
    kwargs.setdefault("target", df)

    # Caller-supplied resolvers come first; ours are appended after them.
    user_resolvers = tuple(kwargs.get("resolvers", ()))
    index_resolvers = df._get_index_resolvers()

    kwargs["resolvers"] = user_resolvers + (
        _get_cleaned_column_resolvers(df),
        index_resolvers,
    )
    try:
        return pd.Series(
            _eval(expr, inplace=False, **kwargs), index=df.index, name=expr
        ).__finalize__(df)
    except Exception:
        # Initially assume that the exception is caused by the potentially
        # breaking change in _get_cleaned_column_resolvers, and try again
        # with the resolvers built the way pandas builds them.
        # TODO: what kind of exception should be caught here so it is less broad
        kwargs["resolvers"] = user_resolvers + (
            _get_cleaned_column_resolvers(df, raw=False),
            index_resolvers,
        )
        return _eval(expr, inplace=False, **kwargs)
0 commit comments