2020
2121import numpy as np
2222import pandas as pd
23+ from scipy .stats import variation
2324
2425from ... import opcodes as OperandDef
2526from ...config import options
5960
class SizeRecorder:
    """Collect pairs of raw and aggregated sizes, one pair per ``record`` call.

    The sizes are kept as two parallel lists rather than running totals, so
    the individual recorded values stay available to the reader of ``get``.
    """

    def __init__(self):
        # Parallel lists: index i of both lists comes from the same
        # ``record`` call, in call order.
        self._raw_records = []
        self._agg_records = []

    def record(self, raw_record: int, agg_record: int) -> None:
        """Append one raw size and its matching aggregated size.

        Parameters
        ----------
        raw_record : int
            Size before aggregation.
        agg_record : int
            Size after aggregation.
        """
        self._raw_records.append(raw_record)
        self._agg_records.append(agg_record)

    def get(self):
        """Return ``(raw_records, agg_records)`` as two parallel lists."""
        return self._raw_records, self._agg_records
@@ -659,15 +660,27 @@ def _tile_auto(
659660 # yield to trigger execution
660661 yield chunks
661662
662- raw_size , agg_size = size_recorder .get ()
663+ raw_sizes , agg_sizes = size_recorder .get ()
663664 # destroy size recorder
664665 ctx .destroy_remote_object (size_recorder_name )
665666
666667 left_chunks = in_df .chunks [combine_size :]
667668 left_chunks = cls ._gen_map_chunks (op , left_chunks , out_df , func_infos )
668- if raw_size >= agg_size * len (chunks ):
669- # aggregated size is less than 1 chunk
670- # use tree aggregation
669+ # calculate the coefficient of variation (CV) of the aggregated sizes;
670+ # if the CV is at most 0.2 and the mean of agg_size / raw_size
671+ # is at most 2/3, we assume a single chunk's aggregated size
672+ # is almost equal to the whole tileable's, so use the tree method,
673+ # since combining aggregation results won't lead to a rapid expansion.
674+ ratios = [
675+ agg_size / raw_size for agg_size , raw_size in zip (agg_sizes , raw_sizes )
676+ ]
677+ cv = variation (agg_sizes )
678+ mean_ratio = np .mean (ratios )
679+ if mean_ratio <= 1 / len (chunks ):
680+ # if the mean ratio is at most 1 / len(chunks), use tree
681+ return cls ._combine_tree (op , chunks + left_chunks , out_df , func_infos )
682+ elif cv <= 0.2 and mean_ratio <= 2 / 3 :
683+ # check CV and mean of ratio
671684 return cls ._combine_tree (op , chunks + left_chunks , out_df , func_infos )
672685 else :
673686 # otherwise, use shuffle
@@ -685,7 +698,7 @@ def tile(cls, op: "DataFrameGroupByAgg"):
685698 func_infos = cls ._compile_funcs (op , in_df )
686699
687700 if op .method == "auto" :
688- if len (in_df .chunks ) < op .combine_size :
701+ if len (in_df .chunks ) <= op .combine_size :
689702 return cls ._tile_with_tree (op , in_df , out_df , func_infos )
690703 else :
691704 return (yield from cls ._tile_auto (op , in_df , out_df , func_infos ))
0 commit comments