Skip to content

Commit 2689541

Browse files
committed
NativeQueryCompiler performance improvements
WIP for discussion which does two things: - allows for fewer deep copies - overrides default_to_pandas with a version which reduces copies - comment out various metrics ( probably not a huge factor here )
1 parent 60d34ac commit 2689541

File tree

3 files changed

+69
-63
lines changed

3 files changed

+69
-63
lines changed

modin/core/storage_formats/base/query_compiler_calculator.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -162,22 +162,22 @@ def calculate(self) -> str:
162162
f"BackendCostCalculator Results: {self._calc_result_log(self._result_backend)}"
163163
)
164164
# Does not need to be secure, should not use system entropy
165-
metrics_group = "%04x" % random.randrange(16**4)
166-
for qc in self._qc_list:
167-
max_shape = qc._max_shape()
168-
backend = qc.get_backend()
169-
emit_metric(
170-
f"hybrid.merge.candidate.{backend}.group.{metrics_group}.rows",
171-
max_shape[0],
172-
)
173-
emit_metric(
174-
f"hybrid.merge.candidate.{backend}.group.{metrics_group}.cols",
175-
max_shape[1],
176-
)
177-
for k, v in self._backend_data.items():
178-
emit_metric(
179-
f"hybrid.merge.candidate.{k}.group.{metrics_group}.cost", v.cost
180-
)
165+
metrics_group = 1 #"%04x" % random.randrange(16**4)
166+
#for qc in self._qc_list:
167+
# max_shape = qc._max_shape()
168+
# backend = qc.get_backend()
169+
# emit_metric(
170+
# f"hybrid.merge.candidate.{backend}.group.{metrics_group}.rows",
171+
# max_shape[0],
172+
# )
173+
# emit_metric(
174+
# f"hybrid.merge.candidate.{backend}.group.{metrics_group}.cols",
175+
# max_shape[1],
176+
# )
177+
#for k, v in self._backend_data.items():
178+
# emit_metric(
179+
# f"hybrid.merge.candidate.{k}.group.{metrics_group}.cost", v.cost
180+
# )
181181
emit_metric(
182182
f"hybrid.merge.decision.{self._result_backend}.group.{metrics_group}",
183183
1,

modin/core/storage_formats/pandas/native_query_compiler.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ class NativeQueryCompiler(BaseQueryCompiler):
102102
_modin_frame: pandas.DataFrame
103103
_should_warn_on_default_to_pandas: bool = False
104104

105-
def __init__(self, pandas_frame):
105+
def __init__(self, pandas_frame, in_place=False):
106106
if hasattr(pandas_frame, "_to_pandas"):
107107
pandas_frame = pandas_frame._to_pandas()
108108
if is_scalar(pandas_frame):
@@ -112,10 +112,10 @@ def __init__(self, pandas_frame):
112112
# so that we don't modify it.
113113
# TODO(https://github.com/modin-project/modin/issues/7435): Look
114114
# into avoiding this copy.
115-
pandas_frame = pandas_frame.copy()
115+
if not in_place:
116+
pandas_frame = pandas_frame.copy()
116117
else:
117118
pandas_frame = pandas.DataFrame(pandas_frame)
118-
119119
self._modin_frame = pandas_frame
120120

121121
storage_format = property(
@@ -126,6 +126,11 @@ def __init__(self, pandas_frame):
126126
def execute(self):
127127
pass
128128

129+
130+
def default_to_pandas(self, pandas_op, *args, **kwargs):
131+
#print("new default to pandas")
132+
return type(self)(pandas_op(self._modin_frame, *args, **kwargs), in_place=True)
133+
129134
@property
130135
def frame_has_materialized_dtypes(self) -> bool:
131136
"""
@@ -202,7 +207,7 @@ def to_pandas(self):
202207

203208
@classmethod
204209
def from_pandas(cls, df, data_cls):
205-
return cls(df)
210+
return cls(df, in_place=True)
206211

207212
@classmethod
208213
def from_arrow(cls, at, data_cls):

modin/core/storage_formats/pandas/query_compiler_caster.py

Lines changed: 44 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -768,7 +768,7 @@ def _get_backend_for_auto_switch(
768768
from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
769769

770770
# Does not need to be secure, should not use system entropy
771-
metrics_group = "%04x" % random.randrange(16**4)
771+
metrics_group = 1 #"%04x" % random.randrange(16**4)
772772
starting_backend = input_qc.get_backend()
773773

774774
min_move_stay_delta = None
@@ -779,23 +779,23 @@ def _get_backend_for_auto_switch(
779779
operation=function_name,
780780
arguments=arguments,
781781
)
782-
data_max_shape = input_qc._max_shape()
783-
emit_metric(
784-
f"hybrid.auto.api.{class_of_wrapped_fn}.{function_name}.group.{metrics_group}",
785-
1,
786-
)
787-
emit_metric(
788-
f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.stay_cost",
789-
stay_cost,
790-
)
791-
emit_metric(
792-
f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.rows",
793-
data_max_shape[0],
794-
)
795-
emit_metric(
796-
f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.cols",
797-
data_max_shape[1],
798-
)
782+
#data_max_shape = input_qc._max_shape()
783+
#emit_metric(
784+
# f"hybrid.auto.api.{class_of_wrapped_fn}.{function_name}.group.{metrics_group}",
785+
# 1,
786+
#)
787+
#emit_metric(
788+
# f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.stay_cost",
789+
# stay_cost,
790+
#)
791+
#emit_metric(
792+
# f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.rows",
793+
# data_max_shape[0],
794+
#)
795+
#emit_metric(
796+
# f"hybrid.auto.current.{starting_backend}.group.{metrics_group}.cols",
797+
# data_max_shape[1],
798+
#)
799799
for backend in Backend.get_active_backends():
800800
if backend in ("Ray", "Unidist", "Dask"):
801801
# Disable automatically switching to these engines for now, because
@@ -840,35 +840,36 @@ def _get_backend_for_auto_switch(
840840
):
841841
min_move_stay_delta = move_stay_delta
842842
best_backend = backend
843-
emit_metric(
844-
f"hybrid.auto.candidate.{backend}.group.{metrics_group}.move_to_cost",
845-
move_to_cost,
846-
)
847-
emit_metric(
848-
f"hybrid.auto.candidate.{backend}.group.{metrics_group}.other_execute_cost",
849-
other_execute_cost,
850-
)
851-
emit_metric(
852-
f"hybrid.auto.candidate.{backend}.group.{metrics_group}.delta",
853-
move_stay_delta,
854-
)
855-
856-
get_logger().info(
857-
f"After {_normalize_class_name(class_of_wrapped_fn)} function {function_name}, "
858-
+ f"considered moving to backend {backend} with "
859-
+ f"(transfer_cost {move_to_cost} + other_execution_cost {other_execute_cost}) "
860-
+ f", stay_cost {stay_cost}, and move-stay delta "
861-
+ f"{move_stay_delta}"
862-
)
843+
#emit_metric(
844+
# f"hybrid.auto.candidate.{backend}.group.{metrics_group}.move_to_cost",
845+
# move_to_cost,
846+
#)
847+
#emit_metric(
848+
# f"hybrid.auto.candidate.{backend}.group.{metrics_group}.other_execute_cost",
849+
# other_execute_cost,
850+
#)
851+
#emit_metric(
852+
# f"hybrid.auto.candidate.{backend}.group.{metrics_group}.delta",
853+
# move_stay_delta,
854+
#)
855+
856+
#get_logger().info(
857+
# f"After {_normalize_class_name(class_of_wrapped_fn)} function {function_name}, "
858+
# + f"considered moving to backend {backend} with "
859+
# + f"(transfer_cost {move_to_cost} + other_execution_cost {other_execute_cost}) "
860+
# + f", stay_cost {stay_cost}, and move-stay delta "
861+
# + f"{move_stay_delta}"
862+
#)
863863

864864
if best_backend == starting_backend:
865-
emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 0)
866-
get_logger().info(
867-
f"Chose not to switch backends after operation {function_name}"
868-
)
865+
pass
866+
#emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 0)
867+
#get_logger().info(
868+
# f"Chose not to switch backends after operation {function_name}"
869+
#)
869870
else:
870871
emit_metric(f"hybrid.auto.decision.{best_backend}.group.{metrics_group}", 1)
871-
get_logger().info(f"Chose to move to backend {best_backend}")
872+
#get_logger().info(f"Chose to move to backend {best_backend}")
872873
return best_backend
873874

874875

0 commit comments

Comments
 (0)