2828 recursive_tile ,
2929)
3030from ...core .operand import OperandStage
31- from ...utils import tokenize
31+ from ...lib . version import parse as parse_version
3232from ...serialization .serializables import (
3333 BoolField ,
3434 AnyField ,
3535 DataTypeField ,
3636 Int32Field ,
3737 StringField ,
3838)
39+ from ...utils import tokenize
3940from ..core import SERIES_TYPE
4041from ..utils import (
4142 parse_index ,
4748)
4849from ..operands import DataFrameOperandMixin , DataFrameOperand , DATAFRAME_TYPE
4950
51+ _pd_release = parse_version (pd .__version__ ).release [:2 ]
52+ # in pandas<1.3, when aggregating with multiple levels and numeric_only is True,
53+ # object columns are not ignored by the min / max functions
54+ _level_reduction_keep_object = _pd_release < (1 , 3 )
55+ # in pandas>=1.3, when dataframes are reduced into series, a mixture of float
56+ # and bool results in object dtype.
57+ _reduce_bool_as_object = _pd_release >= (1 , 3 )
58+
5059
5160class DataFrameReductionOperand (DataFrameOperand ):
5261 _axis = AnyField ("axis" )
@@ -211,22 +220,22 @@ def _get_series_reduction_dtype(
211220 func_name ,
212221 axis = None ,
213222 bool_only = False ,
214- skipna = False ,
223+ skipna = True ,
215224 numeric_only = False ,
216225):
217- empty_series = build_series (dtype = dtype , ensure_string = True )
226+ test_series = build_series (dtype = dtype , ensure_string = True )
218227 if func_name == "count" :
219- reduced = empty_series .count ()
228+ reduced = test_series .count ()
220229 elif func_name == "nunique" :
221- reduced = empty_series .nunique ()
230+ reduced = test_series .nunique ()
222231 elif func_name in ("all" , "any" ):
223- reduced = getattr (empty_series , func_name )(axis = axis , bool_only = bool_only )
232+ reduced = getattr (test_series , func_name )(axis = axis , bool_only = bool_only )
224233 elif func_name == "size" :
225- reduced = empty_series .size
234+ reduced = test_series .size
226235 elif func_name == "str_concat" :
227- reduced = pd .Series ([empty_series .str .cat ()])
236+ reduced = pd .Series ([test_series .str .cat ()])
228237 else :
229- reduced = getattr (empty_series , func_name )(
238+ reduced = getattr (test_series , func_name )(
230239 axis = axis , skipna = skipna , numeric_only = numeric_only
231240 )
232241 return pd .Series (reduced ).dtype
@@ -236,17 +245,17 @@ def _get_series_reduction_dtype(
236245def _get_df_reduction_dtype (
237246 dtype , func_name , axis = None , bool_only = False , skipna = False , numeric_only = False
238247):
239- empty_df = build_series (dtype = dtype , ensure_string = True ).to_frame ()
248+ test_df = build_series (dtype = dtype , ensure_string = True ).to_frame ()
240249 if func_name == "count" :
241- reduced = getattr (empty_df , func_name )(axis = axis , numeric_only = numeric_only )
250+ reduced = getattr (test_df , func_name )(axis = axis , numeric_only = numeric_only )
242251 elif func_name == "nunique" :
243- reduced = getattr (empty_df , func_name )(axis = axis )
252+ reduced = getattr (test_df , func_name )(axis = axis )
244253 elif func_name in ("all" , "any" ):
245- reduced = getattr (empty_df , func_name )(axis = axis , bool_only = bool_only )
254+ reduced = getattr (test_df , func_name )(axis = axis , bool_only = bool_only )
246255 elif func_name == "str_concat" :
247- reduced = empty_df .apply (lambda s : s .str .cat (), axis = axis )
256+ reduced = test_df .apply (lambda s : s .str .cat (), axis = axis )
248257 else :
249- reduced = getattr (empty_df , func_name )(
258+ reduced = getattr (test_df , func_name )(
250259 axis = axis , skipna = skipna , numeric_only = numeric_only
251260 )
252261 if len (reduced ) == 0 :
@@ -304,7 +313,7 @@ def _call_groupby_level(self, df, level):
304313 def _call_dataframe (self , df ):
305314 axis = getattr (self , "axis" , None ) or 0
306315 level = getattr (self , "level" , None )
307- skipna = getattr (self , "skipna" , None )
316+ skipna = getattr (self , "skipna" , True )
308317 numeric_only = getattr (self , "numeric_only" , None )
309318 bool_only = getattr (self , "bool_only" , None )
310319 self ._axis = axis = validate_axis (axis , df )
@@ -327,9 +336,9 @@ def _call_dataframe(self, df):
327336 reduced_dtype = reduced .dtype
328337 else :
329338 reduced_cols , dtypes = [], []
330- for col , dt in df .dtypes .items ():
339+ for col , src_dt in df .dtypes .items ():
331340 dt = _get_df_reduction_dtype (
332- dt ,
341+ src_dt ,
333342 func_name ,
334343 axis = axis ,
335344 bool_only = bool_only ,
@@ -339,16 +348,29 @@ def _call_dataframe(self, df):
339348 if dt is not None :
340349 reduced_cols .append (col )
341350 dtypes .append (dt )
351+ elif (
352+ _level_reduction_keep_object
353+ and numeric_only
354+ and level is not None
355+ and func_name in ("min" , "max" )
356+ and src_dt == np .dtype (object )
357+ ): # pragma: no cover
358+ reduced_cols .append (col )
359+ dtypes .append (np .dtype (object ))
342360 if len (dtypes ) == 0 :
343361 reduced_dtype = np .dtype ("O" )
344362 elif all (dt == dtypes [0 ] for dt in dtypes ):
345363 reduced_dtype = dtypes [0 ]
346- elif not all (isinstance (dt , np .dtype ) and dt != bool for dt in dtypes ):
347- # todo currently we return mixed dtypes as np.dtype('O').
348- # handle pandas Dtypes in the future more carefully.
349- reduced_dtype = np .dtype ("O" )
350364 else :
351- reduced_dtype = np .find_common_type (dtypes , [])
365+ has_bool = any (dt == bool for dt in dtypes )
366+ if _reduce_bool_as_object and has_bool :
367+ reduced_dtype = np .dtype ("O" )
368+ elif not all (isinstance (dt , np .dtype ) for dt in dtypes ):
369+ # todo currently we return mixed dtypes as np.dtype('O').
370+ # handle pandas Dtypes in the future more carefully.
371+ reduced_dtype = np .dtype ("O" )
372+ else :
373+ reduced_dtype = np .find_common_type (dtypes , [])
352374
353375 if level is not None :
354376 return self ._call_groupby_level (df [reduced_cols ], level )
@@ -370,7 +392,7 @@ def _call_dataframe(self, df):
370392 def _call_series (self , series ):
371393 level = getattr (self , "level" , None )
372394 axis = getattr (self , "axis" , None )
373- skipna = getattr (self , "skipna" , None )
395+ skipna = getattr (self , "skipna" , True )
374396 numeric_only = getattr (self , "numeric_only" , None )
375397 bool_only = getattr (self , "bool_only" , None )
376398 self ._axis = axis = validate_axis (axis or 0 , series )
@@ -442,8 +464,8 @@ def _tile_dataframe(cls, op):
442464 n_rows , n_cols = in_df .chunk_shape
443465
444466 # map to get individual results and summaries
445- src_chunks = np .empty (in_df .chunk_shape , dtype = np . object )
446- summary_chunks = np .empty (in_df .chunk_shape , dtype = np . object )
467+ src_chunks = np .empty (in_df .chunk_shape , dtype = object )
468+ summary_chunks = np .empty (in_df .chunk_shape , dtype = object )
447469 for c in in_df .chunks :
448470 new_chunk_op = op .copy ().reset_key ()
449471 new_chunk_op .stage = OperandStage .map
@@ -457,7 +479,7 @@ def _tile_dataframe(cls, op):
457479 )
458480
459481 # combine summaries into results
460- output_chunk_array = np .empty (in_df .chunk_shape , dtype = np . object )
482+ output_chunk_array = np .empty (in_df .chunk_shape , dtype = object )
461483 if op .axis == 1 :
462484 for row in range (n_rows ):
463485 row_src = src_chunks [row , :]
@@ -493,7 +515,7 @@ def _tile_series(cls, op):
493515 series = op .outputs [0 ]
494516
495517 # map to get individual results and summaries
496- summary_chunks = np .empty (in_series .chunk_shape , dtype = np . object )
518+ summary_chunks = np .empty (in_series .chunk_shape , dtype = object )
497519 for c in in_series .chunks :
498520 new_chunk_op = op .copy ().reset_key ()
499521 new_chunk_op .stage = OperandStage .map
0 commit comments