Skip to content

Commit cc48c65

Browse files
authored
Merge branch 'main' into jkew/emit-hybrid-metrics
2 parents cb308f7 + 8266834 commit cc48c65

File tree

10 files changed

+103
-76
lines changed

10 files changed

+103
-76
lines changed

modin/config/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
MinPartitionSize,
4545
MinRowPartitionSize,
4646
ModinNumpy,
47+
NativePandasMaxRows,
48+
NativePandasTransferThreshold,
4749
NPartitions,
4850
PersistentPickle,
4951
ProgressBar,
@@ -85,6 +87,9 @@
8587
"LazyExecution",
8688
# Dask specific
8789
"DaskThreadsPerWorker",
90+
# Native Pandas Specific
91+
"NativePandasMaxRows",
92+
"NativePandasTransferThreshold",
8893
# Partitioning
8994
"NPartitions",
9095
"MinPartitionSize",

modin/config/envvars.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1305,6 +1305,25 @@ class DaskThreadsPerWorker(EnvironmentVariable, type=int):
13051305
default = 1
13061306

13071307

1308+
class NativePandasMaxRows(EnvironmentVariable, type=int):
1309+
"""Maximum number of rows which can be processed using local, native, pandas."""
1310+
1311+
varname = "MODIN_NATIVE_MAX_ROWS"
1312+
default = 10_000_000
1313+
1314+
1315+
class NativePandasTransferThreshold(EnvironmentVariable, type=int):
1316+
"""
1317+
Targeted max number of dataframe rows which should be transferred between engines.
1318+
1319+
This is often the same value as MODIN_NATIVE_MAX_ROWS but it can be independently
1320+
set to change how transfer costs are considered.
1321+
"""
1322+
1323+
varname = "MODIN_NATIVE_MAX_XFER_ROWS"
1324+
default = 10_000_000
1325+
1326+
13081327
class DynamicPartitioning(EnvironmentVariable, type=bool):
13091328
"""
13101329
Set to true to use Modin's dynamic-partitioning implementation where possible.

modin/core/storage_formats/base/query_compiler.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -356,13 +356,13 @@ def move_to_cost(
356356
"""
357357
if isinstance(self, other_qc_type):
358358
return QCCoercionCost.COST_ZERO
359-
if self._TRANSFER_THRESHOLD <= 0:
359+
if self.__class__._transfer_threshold() <= 0:
360360
return QCCoercionCost.COST_ZERO
361361
cost = int(
362362
(
363363
QCCoercionCost.COST_IMPOSSIBLE
364364
* self._max_shape()[0]
365-
/ self._TRANSFER_THRESHOLD
365+
/ self.__class__._transfer_threshold()
366366
)
367367
)
368368
if cost > QCCoercionCost.COST_IMPOSSIBLE:
@@ -447,7 +447,7 @@ def stay_cost(
447447
return self._stay_cost_rows(
448448
self._max_shape()[0],
449449
self._OPERATION_PER_ROW_OVERHEAD,
450-
self._MAX_SIZE_THIS_ENGINE_CAN_HANDLE,
450+
self.__class__._engine_max_size(),
451451
self._OPERATION_INITIALIZATION_OVERHEAD,
452452
)
453453

@@ -498,10 +498,20 @@ def move_to_me_cost(
498498
return cls._stay_cost_rows(
499499
row_count,
500500
cls._OPERATION_PER_ROW_OVERHEAD,
501-
cls._MAX_SIZE_THIS_ENGINE_CAN_HANDLE,
501+
cls._engine_max_size(),
502502
cls._OPERATION_INITIALIZATION_OVERHEAD,
503503
)
504504

505+
@classmethod
506+
def _engine_max_size(cls) -> int:
507+
"""Maximum number of rows this engine can handle."""
508+
return cls._MAX_SIZE_THIS_ENGINE_CAN_HANDLE
509+
510+
@classmethod
511+
def _transfer_threshold(cls) -> int:
512+
"""Maximum number of rows this backend can handle before transferring data to another backend."""
513+
return cls._TRANSFER_THRESHOLD
514+
505515
@disable_logging
506516
def max_cost(self) -> int:
507517
"""

modin/core/storage_formats/pandas/native_query_compiler.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,11 @@
2323
import pandas
2424
from pandas.core.dtypes.common import is_scalar
2525

26+
from modin.config.envvars import NativePandasMaxRows, NativePandasTransferThreshold
2627
from modin.core.dataframe.base.interchange.dataframe_protocol.dataframe import (
2728
ProtocolDataframe,
2829
)
29-
from modin.core.storage_formats.base.query_compiler import (
30-
BaseQueryCompiler,
31-
)
30+
from modin.core.storage_formats.base.query_compiler import BaseQueryCompiler
3231
from modin.utils import _inherit_docstrings
3332

3433
_NO_REPARTITION_ON_NATIVE_EXECUTION_EXCEPTION_MESSAGE = (
@@ -92,10 +91,8 @@ class NativeQueryCompiler(BaseQueryCompiler):
9291
The pandas frame to query with the compiled queries.
9392
"""
9493

95-
_MAX_SIZE_THIS_ENGINE_CAN_HANDLE = 10_000_000
9694
_OPERATION_INITIALIZATION_OVERHEAD = 0
9795
_OPERATION_PER_ROW_OVERHEAD = 0
98-
_TRANSFER_THRESHOLD = 10_000_000
9996

10097
_modin_frame: pandas.DataFrame
10198
_should_warn_on_default_to_pandas: bool = False
@@ -212,8 +209,21 @@ def free(self):
212209
def finalize(self):
213210
return
214211

215-
# Dataframe interchange protocol
212+
@classmethod
213+
def _engine_max_size(cls):
214+
# do not return the custom configuration for sub-classes
215+
if cls == NativeQueryCompiler:
216+
return NativePandasMaxRows.get()
217+
return cls._MAX_SIZE_THIS_ENGINE_CAN_HANDLE
216218

219+
@classmethod
220+
def _transfer_threshold(cls):
221+
# do not return the custom configuration for sub-classes
222+
if cls == NativeQueryCompiler:
223+
return NativePandasTransferThreshold.get()
224+
return cls._TRANSFER_THRESHOLD
225+
226+
# Dataframe interchange protocol
217227
def to_interchange_dataframe(
218228
self, nan_as_null: bool = False, allow_copy: bool = True
219229
):

modin/core/storage_formats/pandas/query_compiler_caster.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@
6161
"__delattr__",
6262
"__getattr__",
6363
"_getattribute__from_extension_impl",
64-
"_getattr__from_extension_impl",
6564
"get_backend",
6665
"move_to",
6766
"_update_inplace",

modin/pandas/base.py

Lines changed: 0 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@
110110
"_query_compiler",
111111
"get_backend",
112112
"_getattribute__from_extension_impl",
113-
"_getattr__from_extension_impl",
114113
"_get_query_compiler",
115114
"set_backend",
116115
"_pinned",
@@ -4376,29 +4375,6 @@ def default_handler(*args, **kwargs):
43764375
return default_handler
43774376
return attr
43784377

4379-
@disable_logging
4380-
def __getattr__(self, item) -> Any:
4381-
"""
4382-
Return attribute from this `BasePandasDataset`.
4383-
4384-
Parameters
4385-
----------
4386-
item : str
4387-
Item to get.
4388-
4389-
Returns
4390-
-------
4391-
Any
4392-
The attribute from this `BasePandasDataset`.
4393-
"""
4394-
# NOTE that to get an attribute, python calls __getattribute__() first and
4395-
# then falls back to __getattr__() if the former raises an AttributeError.
4396-
if item not in _EXTENSION_NO_LOOKUP:
4397-
extension = self._getattr__from_extension_impl(item, __class__._extensions)
4398-
if extension is not sentinel:
4399-
return extension
4400-
return object.__getattribute__(self, item)
4401-
44024378
def __array_ufunc__(
44034379
self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
44044380
) -> DataFrame | Series | Any:
@@ -4664,36 +4640,6 @@ def _getattribute__from_extension_impl(
46644640
)
46654641
return sentinel
46664642

4667-
@disable_logging
4668-
def _getattr__from_extension_impl(self, item, extensions: EXTENSION_DICT_TYPE):
4669-
"""
4670-
__getattr__() an extension with the given name from the given set of extensions.
4671-
4672-
Implement __getattr__() for extensions. python falls back to
4673-
__getattr__() if __getattribute__() raises an AttributeError.
4674-
4675-
Parameters
4676-
----------
4677-
item : str
4678-
The name of the attribute to get.
4679-
extensions : EXTENSION_DICT_TYPE
4680-
The set of extensions.
4681-
4682-
Returns
4683-
-------
4684-
Any
4685-
The attribute from the extension, or `sentinel` if the attribute is
4686-
not found.
4687-
"""
4688-
extension = self._get_extension(item, extensions)
4689-
if extension is not sentinel:
4690-
# We need to implement callable extensions before we fall back
4691-
# to __getattr__(), because they need to dispatch to the
4692-
# appropriate backend.
4693-
assert not callable(extension)
4694-
return extension
4695-
return sentinel
4696-
46974643
@disable_logging
46984644
@_inherit_docstrings(QueryCompilerCaster._get_query_compiler)
46994645
def _get_query_compiler(self):

modin/pandas/dataframe.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2644,11 +2644,6 @@ def __getattr__(self, key) -> Any:
26442644
"""
26452645
# NOTE that to get an attribute, python calls __getattribute__() first and
26462646
# then falls back to __getattr__() if the former raises an AttributeError.
2647-
2648-
if key not in _EXTENSION_NO_LOOKUP:
2649-
extension = self._getattr__from_extension_impl(key, __class__._extensions)
2650-
if extension is not sentinel:
2651-
return extension
26522647
try:
26532648
return super().__getattr__(key)
26542649
except AttributeError as err:

modin/pandas/series.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -387,10 +387,6 @@ def __getattr__(self, key: Hashable) -> Any:
387387
"""
388388
# NOTE that to get an attribute, python calls __getattribute__() first and
389389
# then falls back to __getattr__() if the former raises an AttributeError.
390-
if key not in _EXTENSION_NO_LOOKUP:
391-
extension = self._getattr__from_extension_impl(key, __class__._extensions)
392-
if extension is not sentinel:
393-
return extension
394390
try:
395391
return super().__getattr__(key)
396392
except AttributeError as err:

modin/tests/pandas/extensions/test_base_extensions.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,13 +217,23 @@ def del_property(self):
217217
)
218218

219219
modin_object = data_class({"a": [1, 2, 3], "b": [4, 5, 6]}).set_backend(backend)
220-
assert hasattr(modin_object, public_property_name)
221220
setattr(modin_object, public_property_name, "value")
222221
assert getattr(modin_object, public_property_name) == "value"
223222
delattr(modin_object, public_property_name)
224223
# check that the deletion works.
225224
assert not hasattr(modin_object, private_property_name)
226225

226+
def test_get_property_that_raises_attribute_error_on_get_modin_issue_7562(
227+
self, data_class
228+
):
229+
def get_property(self):
230+
raise AttributeError
231+
232+
register_base_accessor(name="extension_property")(property(fget=get_property))
233+
modin_object = data_class()
234+
with pytest.raises(AttributeError):
235+
getattr(modin_object, "extension_property")
236+
227237
def test_non_settable_extension_property(self, Backend1, data_class):
228238
modin_object = data_class([0])
229239
property_name = "property_name"

modin/tests/pandas/native_df_interoperability/test_compiler_caster.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,13 @@
2424

2525
import modin.pandas as pd
2626
from modin.config import context as config_context
27-
from modin.config.envvars import Backend, Engine, Execution
27+
from modin.config.envvars import (
28+
Backend,
29+
Engine,
30+
Execution,
31+
NativePandasMaxRows,
32+
NativePandasTransferThreshold,
33+
)
2834
from modin.core.execution.dispatching.factories import factories
2935
from modin.core.execution.dispatching.factories.factories import BaseFactory
3036
from modin.core.io.io import BaseIO
@@ -1317,3 +1323,34 @@ def test_second_init_only_calls_from_pandas_once_github_issue_7559():
13171323
) as mock_from_pandas:
13181324
pd.DataFrame([1])
13191325
mock_from_pandas.assert_called_once()
1326+
1327+
1328+
def test_native_config():
1329+
qc = NativeQueryCompiler(pandas.DataFrame([0, 1, 2]))
1330+
1331+
# Native Query Compiler gets a special configuration
1332+
assert qc._TRANSFER_THRESHOLD == 0
1333+
assert qc._transfer_threshold() == NativePandasTransferThreshold.get()
1334+
assert qc._MAX_SIZE_THIS_ENGINE_CAN_HANDLE == 1
1335+
assert qc._engine_max_size() == NativePandasMaxRows.get()
1336+
1337+
oldmax = qc._engine_max_size()
1338+
oldthresh = qc._transfer_threshold()
1339+
1340+
with config_context(NativePandasMaxRows=123, NativePandasTransferThreshold=321):
1341+
qc2 = NativeQueryCompiler(pandas.DataFrame([0, 1, 2]))
1342+
assert qc2._transfer_threshold() == 321
1343+
assert qc2._engine_max_size() == 123
1344+
assert qc._engine_max_size() == 123
1345+
assert qc._transfer_threshold() == 321
1346+
1347+
# sub class configuration is unchanged
1348+
class AQC(NativeQueryCompiler):
1349+
pass
1350+
1351+
subqc = AQC(pandas.DataFrame([0, 1, 2]))
1352+
assert subqc._TRANSFER_THRESHOLD == 0
1353+
assert subqc._MAX_SIZE_THIS_ENGINE_CAN_HANDLE == 1
1354+
1355+
assert qc._engine_max_size() == oldmax
1356+
assert qc._transfer_threshold() == oldthresh

0 commit comments

Comments
 (0)