Skip to content

Commit 5ed69b5

Browse files
PERF-#7657: Fork pandas eval and query implementation to improve performance. (#7658)
Currently we use the pandas eval() and query() implementations almost entirely as is. That's not good practice in general, and #7657 shows a performance issue that applies to Modin but not pandas in the current implementation. In this commit, fork the query() and eval() implementation and eliminate the `.values` call that causes numpy materialization. The code here is mostly copied from pandas/pandas/core/computation, except: - Replace the `.values` call that causes the performance issue in #7657. - Delete nearly all the numexpr code, since we default to pandas for the numexpr engine. - Clean up code in a few places to get through linter and CodeQL. - Delete the pytables code. I don't think that pandas uses this code currently. Resolves #7657 --------- Signed-off-by: sfc-gh-mvashishtha <mahesh.vashishtha@snowflake.com> Co-authored-by: Devin Petersohn <devin.petersohn@snowflake.com>
1 parent 8d468dc commit 5ed69b5

File tree

12 files changed

+2951
-7
lines changed

12 files changed

+2951
-7
lines changed

LICENSE

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,3 +199,40 @@
199199
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200200
See the License for the specific language governing permissions and
201201
limitations under the License.
202+
203+
204+
# Certain code used and distributed in this package is forked from pandas
205+
# (https://github.com/pandas-dev/pandas). The pandas LICENSE
206+
# below applies to those certain forked components in this project:
207+
208+
BSD 3-Clause License
209+
210+
Copyright (c) 2008-2011, AQR Capital Management, LLC, Lambda Foundry, Inc. and PyData Development Team
211+
All rights reserved.
212+
213+
Copyright (c) 2011-2025, Open source contributors.
214+
215+
Redistribution and use in source and binary forms, with or without
216+
modification, are permitted provided that the following conditions are met:
217+
218+
* Redistributions of source code must retain the above copyright notice, this
219+
list of conditions and the following disclaimer.
220+
221+
* Redistributions in binary form must reproduce the above copyright notice,
222+
this list of conditions and the following disclaimer in the documentation
223+
and/or other materials provided with the distribution.
224+
225+
* Neither the name of the copyright holder nor the names of its
226+
contributors may be used to endorse or promote products derived from
227+
this software without specific prior written permission.
228+
229+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
230+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
231+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
232+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
233+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
234+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
235+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
236+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
237+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
238+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

modin/core/computation/__init__.py

Whitespace-only changes.

modin/core/computation/align.py

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
# Licensed to Modin Development Team under one or more contributor license agreements.
2+
# See the NOTICE file distributed with this work for additional information regarding
3+
# copyright ownership. The Modin Development Team licenses this file to you under the
4+
# Apache License, Version 2.0 (the "License"); you may not use this file except in
5+
# compliance with the License. You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software distributed under
10+
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific language
12+
# governing permissions and limitations under the License.
13+
14+
"""
15+
Core eval alignment algorithms. Forked from pandas.core.computation.align
16+
"""
17+
18+
from __future__ import annotations
19+
20+
import warnings
21+
from collections.abc import Sequence
22+
from functools import (
23+
partial,
24+
wraps,
25+
)
26+
from typing import (
27+
Callable,
28+
)
29+
30+
import numpy as np
31+
import pandas
32+
import pandas.core.common as com
33+
from pandas._typing import F
34+
from pandas.core.base import PandasObject
35+
from pandas.errors import PerformanceWarning
36+
37+
from modin.core.computation.common import result_type_many
38+
from modin.pandas import DataFrame, Series
39+
from modin.pandas.base import BasePandasDataset
40+
41+
42+
def _align_core_single_unary_op(
43+
term,
44+
) -> tuple[partial | type[BasePandasDataset], dict[str, pandas.Index] | None]:
45+
typ: partial | type[BasePandasDataset]
46+
axes: dict[str, pandas.Index] | None = None
47+
48+
if isinstance(term.value, np.ndarray):
49+
typ = partial(np.asanyarray, dtype=term.value.dtype)
50+
else:
51+
typ = type(term.value)
52+
if hasattr(term.value, "axes"):
53+
axes = _zip_axes_from_type(typ, term.value.axes)
54+
55+
return typ, axes
56+
57+
58+
def _zip_axes_from_type(
59+
typ: type[BasePandasDataset], new_axes: Sequence[pandas.Index]
60+
) -> dict[str, pandas.Index]:
61+
return {name: new_axes[i] for i, name in enumerate(typ._AXIS_ORDERS)}
62+
63+
64+
def _any_pandas_objects(terms) -> bool:
65+
"""
66+
Check a sequence of terms for instances of PandasObject.
67+
"""
68+
return any(isinstance(term.value, PandasObject) for term in terms)
69+
70+
71+
def _filter_special_cases(f) -> Callable[[F], F]:
72+
@wraps(f)
73+
def wrapper(terms):
74+
# single unary operand
75+
if len(terms) == 1:
76+
return _align_core_single_unary_op(terms[0])
77+
78+
term_values = (term.value for term in terms)
79+
80+
# we don't have any pandas objects
81+
if not _any_pandas_objects(terms):
82+
return result_type_many(*term_values), None
83+
84+
return f(terms)
85+
86+
return wrapper
87+
88+
89+
@_filter_special_cases
90+
def _align_core(terms):
91+
term_index = [i for i, term in enumerate(terms) if hasattr(term.value, "axes")]
92+
term_dims = [terms[i].value.ndim for i in term_index]
93+
94+
ndims = pandas.Series(dict(zip(term_index, term_dims)))
95+
96+
# initial axes are the axes of the largest-axis'd term
97+
biggest = terms[ndims.idxmax()].value
98+
typ = biggest._constructor
99+
axes = biggest.axes
100+
naxes = len(axes)
101+
gt_than_one_axis = naxes > 1
102+
103+
for value in (terms[i].value for i in term_index):
104+
is_series = isinstance(value, Series)
105+
is_series_and_gt_one_axis = is_series and gt_than_one_axis
106+
107+
for axis, items in enumerate(value.axes):
108+
if is_series_and_gt_one_axis:
109+
ax, itm = naxes - 1, value.index
110+
else:
111+
ax, itm = axis, items
112+
113+
if not axes[ax].is_(itm):
114+
axes[ax] = axes[ax].union(itm)
115+
116+
for i, ndim in ndims.items():
117+
for axis, items in zip(range(ndim), axes):
118+
ti = terms[i].value
119+
120+
if hasattr(ti, "reindex"):
121+
transpose = isinstance(ti, Series) and naxes > 1
122+
reindexer = axes[naxes - 1] if transpose else items
123+
124+
term_axis_size = len(ti.axes[axis])
125+
reindexer_size = len(reindexer)
126+
127+
ordm = np.log10(max(1, abs(reindexer_size - term_axis_size)))
128+
if ordm >= 1 and reindexer_size >= 10000:
129+
w = (
130+
f"Alignment difference on axis {axis} is larger "
131+
+ f"than an order of magnitude on term {repr(terms[i].name)}, "
132+
+ f"by more than {ordm:.4g}; performance may suffer."
133+
)
134+
warnings.warn(w, category=PerformanceWarning)
135+
136+
obj = ti.reindex(reindexer, axis=axis, copy=False)
137+
terms[i].update(obj)
138+
139+
terms[i].update(terms[i].value.values)
140+
141+
return typ, _zip_axes_from_type(typ, axes)
142+
143+
144+
def align_terms(terms):
145+
"""
146+
Align a set of terms.
147+
"""
148+
try:
149+
# flatten the parse tree (a nested list, really)
150+
terms = list(com.flatten(terms))
151+
except TypeError:
152+
# can't iterate so it must just be a constant or single variable
153+
if isinstance(terms.value, (Series, DataFrame)):
154+
typ = type(terms.value)
155+
return typ, _zip_axes_from_type(typ, terms.value.axes)
156+
return np.result_type(terms.type), None
157+
158+
# if all resolved variables are numeric scalars
159+
if all(term.is_scalar for term in terms):
160+
return result_type_many(*(term.value for term in terms)).type, None
161+
162+
# perform the main alignment
163+
typ, axes = _align_core(terms)
164+
return typ, axes
165+
166+
167+
def reconstruct_object(typ, obj, axes, dtype):
168+
"""
169+
Reconstruct an object given its type, raw value, and possibly empty
170+
(None) axes.
171+
172+
Parameters
173+
----------
174+
typ : object
175+
A type
176+
obj : object
177+
The value to use in the type constructor
178+
axes : dict
179+
The axes to use to construct the resulting pandas object
180+
181+
Returns
182+
-------
183+
ret : typ
184+
An object of type ``typ`` with the value `obj` and possible axes
185+
`axes`.
186+
"""
187+
try:
188+
typ = typ.type
189+
except AttributeError:
190+
pass
191+
192+
res_t = np.result_type(obj.dtype, dtype)
193+
194+
if not isinstance(typ, partial) and issubclass(typ, PandasObject):
195+
return typ(obj, dtype=res_t, **axes)
196+
197+
# special case for pathological things like ~True/~False
198+
if hasattr(res_t, "type") and typ == np.bool_ and res_t != np.bool_:
199+
ret_value = res_t.type(obj)
200+
else:
201+
ret_value = typ(obj).astype(res_t)
202+
# The condition is to distinguish 0-dim array (returned in case of
203+
# scalar) and 1 element array
204+
# e.g. np.array(0) and np.array([0])
205+
if (
206+
len(obj.shape) == 1
207+
and len(obj) == 1
208+
and not isinstance(ret_value, np.ndarray)
209+
):
210+
ret_value = np.array([ret_value]).astype(res_t)
211+
212+
return ret_value

modin/core/computation/check.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Licensed to Modin Development Team under one or more contributor license agreements.
2+
# See the NOTICE file distributed with this work for additional information regarding
3+
# copyright ownership. The Modin Development Team licenses this file to you under the
4+
# Apache License, Version 2.0 (the "License"); you may not use this file except in
5+
# compliance with the License. You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software distributed under
10+
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific language
12+
# governing permissions and limitations under the License.
13+
14+
"""
15+
Forked from pandas.core.computation.check
16+
"""
17+
18+
from __future__ import annotations
19+
20+
from pandas.compat._optional import import_optional_dependency
21+
22+
ne = import_optional_dependency("numexpr", errors="warn")
23+
NUMEXPR_INSTALLED = ne is not None
24+
25+
__all__ = ["NUMEXPR_INSTALLED"]

modin/core/computation/common.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Licensed to Modin Development Team under one or more contributor license agreements.
2+
# See the NOTICE file distributed with this work for additional information regarding
3+
# copyright ownership. The Modin Development Team licenses this file to you under the
4+
# Apache License, Version 2.0 (the "License"); you may not use this file except in
5+
# compliance with the License. You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software distributed under
10+
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific language
12+
# governing permissions and limitations under the License.
13+
14+
"""
15+
Forked from pandas.core.computation.common
16+
"""
17+
18+
from __future__ import annotations
19+
20+
from functools import reduce
21+
22+
import numpy as np
23+
from pandas._config import get_option
24+
from pandas.core.dtypes.cast import find_common_type
25+
from pandas.core.dtypes.common import is_extension_array_dtype
26+
27+
28+
def ensure_decoded(s) -> str:
29+
"""
30+
If we have bytes, decode them to unicode.
31+
"""
32+
if isinstance(s, (np.bytes_, bytes)):
33+
s = s.decode(get_option("display.encoding"))
34+
return s
35+
36+
37+
def result_type_many(*arrays_and_dtypes):
38+
"""
39+
Wrapper around numpy.result_type which overcomes the NPY_MAXARGS (32)
40+
argument limit.
41+
"""
42+
try:
43+
return np.result_type(*arrays_and_dtypes)
44+
except ValueError:
45+
# we have > NPY_MAXARGS terms in our expression
46+
return reduce(np.result_type, arrays_and_dtypes)
47+
except TypeError:
48+
arr_and_dtypes = list(arrays_and_dtypes)
49+
ea_dtypes, non_ea_dtypes = [], []
50+
for arr_or_dtype in arr_and_dtypes:
51+
if is_extension_array_dtype(arr_or_dtype):
52+
ea_dtypes.append(arr_or_dtype)
53+
else:
54+
non_ea_dtypes.append(arr_or_dtype)
55+
56+
if non_ea_dtypes:
57+
try:
58+
np_dtype = np.result_type(*non_ea_dtypes)
59+
except ValueError:
60+
np_dtype = reduce(np.result_type, arrays_and_dtypes)
61+
return find_common_type(ea_dtypes + [np_dtype])
62+
63+
return find_common_type(ea_dtypes)

0 commit comments

Comments
 (0)