Fix estimate_pandas_size on pd.MultiIndex (#2707)

hekaisheng · web-flow · commit c3e8bdcd3762 · 2022-02-11T23:45:19.000+08:00
diff --git a/mars/_version.py b/mars/_version.py
@@ -16,7 +16,7 @@
 import os
 from typing import NamedTuple, Optional
 
-version_info = (0, 9, 0, "a2")
+version_info = (0, 9, 0, "b1")
 _num_index = max(idx if isinstance(v, int) else 0 for idx, v in enumerate(version_info))
 __version__ = ".".join(map(str, version_info[: _num_index + 1])) + "".join(
     version_info[_num_index + 1 :]
diff --git a/mars/tests/test_utils.py b/mars/tests/test_utils.py
@@ -508,11 +508,52 @@ def test_estimate_pandas_size():
 
     s3 = pd.Series(np.random.choice(["abcd", "def", "gh"], size=(1000,)))
     assert utils.estimate_pandas_size(s3) != sys.getsizeof(s3)
+    assert (
+        pytest.approx(utils.estimate_pandas_size(s3) / sys.getsizeof(s3), abs=0.5) == 1
+    )
 
     idx1 = pd.MultiIndex.from_arrays(
         [np.arange(0, 1000), np.random.choice(["abcd", "def", "gh"], size=(1000,))]
     )
-    assert utils.estimate_pandas_size(idx1) != sys.getsizeof(idx1)
+    assert utils.estimate_pandas_size(idx1) == sys.getsizeof(idx1)
+
+    string_idx = pd.Index(np.random.choice(["a", "bb", "cc"], size=(1000,)))
+    assert utils.estimate_pandas_size(string_idx) != sys.getsizeof(string_idx)
+    assert (
+        pytest.approx(
+            utils.estimate_pandas_size(string_idx) / sys.getsizeof(string_idx), abs=0.5
+        )
+        == 1
+    )
+
+    # dataframe with multi index
+    idx2 = pd.MultiIndex.from_arrays(
+        [np.arange(0, 1000), np.random.choice(["abcd", "def", "gh"], size=(1000,))]
+    )
+    df4 = pd.DataFrame(
+        {
+            "A": np.random.choice(["abcd", "def", "gh"], size=(1000,)),
+            "B": np.random.rand(1000),
+            "C": np.random.rand(1000),
+        },
+        index=idx2,
+    )
+    assert utils.estimate_pandas_size(df4) != sys.getsizeof(df4)
+    assert (
+        pytest.approx(utils.estimate_pandas_size(df4) / sys.getsizeof(df4), abs=0.5)
+        == 1
+    )
+
+    # series with multi index
+    idx3 = pd.MultiIndex.from_arrays(
+        [
+            np.random.choice(["a1", "a2", "a3"], size=(1000,)),
+            np.random.choice(["abcd", "def", "gh"], size=(1000,)),
+        ]
+    )
+    s4 = pd.Series(np.arange(1000), index=idx3)
+
+    assert utils.estimate_pandas_size(s4) == sys.getsizeof(s4)
 
 
 @require_ray
diff --git a/mars/utils.py b/mars/utils.py
@@ -424,10 +424,13 @@ def calc_data_size(dt: Any, shape: Tuple[int] = None) -> int:
 
 
 def estimate_pandas_size(
-    df_obj, max_samples: int = 10, min_sample_rows: int = 100
+    pd_obj, max_samples: int = 10, min_sample_rows: int = 100
 ) -> int:
-    if len(df_obj) <= min_sample_rows or isinstance(df_obj, pd.RangeIndex):
-        return sys.getsizeof(df_obj)
+    if len(pd_obj) <= min_sample_rows or isinstance(pd_obj, pd.RangeIndex):
+        return sys.getsizeof(pd_obj)
+    if isinstance(pd_obj, pd.MultiIndex):
+        # a sampled MultiIndex's size can't be scaled up to estimate the full size
+        return sys.getsizeof(pd_obj)
 
     from .dataframe.arrays import ArrowDtype
 
@@ -438,14 +441,16 @@ def _is_fast_dtype(dtype):
             return isinstance(dtype, ArrowDtype)
 
     dtypes = []
-    if isinstance(df_obj, pd.DataFrame):
-        dtypes.extend(df_obj.dtypes)
-        index_obj = df_obj.index
-    elif isinstance(df_obj, pd.Series):
-        dtypes.append(df_obj.dtype)
-        index_obj = df_obj.index
+    is_series = False
+    if isinstance(pd_obj, pd.DataFrame):
+        dtypes.extend(pd_obj.dtypes)
+        index_obj = pd_obj.index
+    elif isinstance(pd_obj, pd.Series):
+        dtypes.append(pd_obj.dtype)
+        index_obj = pd_obj.index
+        is_series = True
     else:
-        index_obj = df_obj
+        index_obj = pd_obj
 
     # handling possible MultiIndex
     if hasattr(index_obj, "dtypes"):
@@ -454,12 +459,22 @@ def _is_fast_dtype(dtype):
         dtypes.append(index_obj.dtype)
 
     if all(_is_fast_dtype(dtype) for dtype in dtypes):
-        return sys.getsizeof(df_obj)
-
-    indices = np.sort(np.random.choice(len(df_obj), size=max_samples, replace=False))
-    iloc = df_obj if isinstance(df_obj, pd.Index) else df_obj.iloc
-    sample_size = sys.getsizeof(iloc[indices])
-    return sample_size * len(df_obj) // max_samples
+        return sys.getsizeof(pd_obj)
+
+    indices = np.sort(np.random.choice(len(pd_obj), size=max_samples, replace=False))
+    iloc = pd_obj if isinstance(pd_obj, pd.Index) else pd_obj.iloc
+    if isinstance(index_obj, pd.MultiIndex):
+        # A sampled MultiIndex is much larger than its share of rows suggests, so
+        # measure the index exactly and extrapolate only the sampled data.
+        index_size = sys.getsizeof(pd_obj.index)
+        if is_series:
+            sample_frame_size = iloc[indices].memory_usage(deep=True, index=False)
+        else:
+            sample_frame_size = iloc[indices].memory_usage(deep=True, index=False).sum()
+        return index_size + sample_frame_size * len(pd_obj) // max_samples
+    else:
+        sample_size = sys.getsizeof(iloc[indices])
+        return sample_size * len(pd_obj) // max_samples
 
 
 def build_fetch_chunk(