@@ -424,10 +424,13 @@ def calc_data_size(dt: Any, shape: Tuple[int] = None) -> int:
424424
425425
426426def estimate_pandas_size (
427- df_obj , max_samples : int = 10 , min_sample_rows : int = 100
427+ pd_obj , max_samples : int = 10 , min_sample_rows : int = 100
428428) -> int :
429- if len (df_obj ) <= min_sample_rows or isinstance (df_obj , pd .RangeIndex ):
430- return sys .getsizeof (df_obj )
429+ if len (pd_obj ) <= min_sample_rows or isinstance (pd_obj , pd .RangeIndex ):
430+ return sys .getsizeof (pd_obj )
431+ if isinstance (pd_obj , pd .MultiIndex ):
432+ # a MultiIndex's sampled size can't be extrapolated to estimate its full size
433+ return sys .getsizeof (pd_obj )
431434
432435 from .dataframe .arrays import ArrowDtype
433436
@@ -438,14 +441,16 @@ def _is_fast_dtype(dtype):
438441 return isinstance (dtype , ArrowDtype )
439442
440443 dtypes = []
441- if isinstance (df_obj , pd .DataFrame ):
442- dtypes .extend (df_obj .dtypes )
443- index_obj = df_obj .index
444- elif isinstance (df_obj , pd .Series ):
445- dtypes .append (df_obj .dtype )
446- index_obj = df_obj .index
444+ is_series = False
445+ if isinstance (pd_obj , pd .DataFrame ):
446+ dtypes .extend (pd_obj .dtypes )
447+ index_obj = pd_obj .index
448+ elif isinstance (pd_obj , pd .Series ):
449+ dtypes .append (pd_obj .dtype )
450+ index_obj = pd_obj .index
451+ is_series = True
447452 else :
448- index_obj = df_obj
453+ index_obj = pd_obj
449454
450455 # handling possible MultiIndex
451456 if hasattr (index_obj , "dtypes" ):
@@ -454,12 +459,22 @@ def _is_fast_dtype(dtype):
454459 dtypes .append (index_obj .dtype )
455460
456461 if all (_is_fast_dtype (dtype ) for dtype in dtypes ):
457- return sys .getsizeof (df_obj )
458-
459- indices = np .sort (np .random .choice (len (df_obj ), size = max_samples , replace = False ))
460- iloc = df_obj if isinstance (df_obj , pd .Index ) else df_obj .iloc
461- sample_size = sys .getsizeof (iloc [indices ])
462- return sample_size * len (df_obj ) // max_samples
462+ return sys .getsizeof (pd_obj )
463+
464+ indices = np .sort (np .random .choice (len (pd_obj ), size = max_samples , replace = False ))
465+ iloc = pd_obj if isinstance (pd_obj , pd .Index ) else pd_obj .iloc
466+ if isinstance (index_obj , pd .MultiIndex ):
467+ # A sampled MultiIndex's size is much greater than expected, so we measure the
468+ # full index on its own and extrapolate only the non-index data from the sample.
469+ index_size = sys .getsizeof (pd_obj .index )
470+ if is_series :
471+ sample_frame_size = iloc [indices ].memory_usage (deep = True , index = False )
472+ else :
473+ sample_frame_size = iloc [indices ].memory_usage (deep = True , index = False ).sum ()
474+ return index_size + sample_frame_size * len (pd_obj ) // max_samples
475+ else :
476+ sample_size = sys .getsizeof (iloc [indices ])
477+ return sample_size * len (pd_obj ) // max_samples
463478
464479
465480def build_fetch_chunk (
0 commit comments