Skip to content

Commit 4a0ab76

Browse files
author
Xuye (Chris) Qin
authored
Fix potential empty chunks when creating DataFrame from pandas (#2987)
1 parent 115ee0b commit 4a0ab76

File tree

3 files changed

+25
-3
lines changed

3 files changed

+25
-3
lines changed

mars/dataframe/datasource/dataframe.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,11 +14,14 @@
1414

1515
import itertools
1616

17+
import pandas as pd
18+
1719
from ... import opcodes as OperandDef
1820
from ...config import options
1921
from ...core import OutputType
2022
from ...serialization.serializables import DataFrameField, SeriesField
2123
from ...tensor.utils import get_chunk_slices
24+
from ...utils import estimate_pandas_size
2225
from ..utils import decide_dataframe_chunk_sizes, parse_index, is_cudf
2326
from ..operands import DataFrameOperand, DataFrameOperandMixin
2427

@@ -61,7 +64,10 @@ def tile(cls, op: "DataFrameDataSource"):
6164
df = op.outputs[0]
6265
raw_df = op.data
6366

64-
memory_usage = raw_df.memory_usage(index=False, deep=True)
67+
# estimate column memory usage instead of calling df.memory_usage(deep=True)
68+
memory_usage = pd.Series(
69+
{c: estimate_pandas_size(s) for c, s in raw_df.iteritems()}
70+
)
6571
chunk_size = df.extra_params.raw_chunk_size or options.chunk_size
6672
chunk_size = decide_dataframe_chunk_sizes(df.shape, chunk_size, memory_usage)
6773
chunk_size_idxes = (range(len(size)) for size in chunk_size)

mars/dataframe/datasource/tests/test_datasource.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,13 +15,15 @@
1515
import os
1616
import tempfile
1717
import shutil
18+
import string
1819
from collections import OrderedDict
1920

2021
import numpy as np
2122
import pandas as pd
2223
import pytest
2324

2425
from .... import tensor as mt
26+
from ....config import option_context
2527
from ....core import tile
2628
from ....tests.core import require_ray
2729
from ....utils import lazy_import
@@ -138,6 +140,20 @@ def test_from_pandas_dataframe():
138140
pd.testing.assert_frame_equal(df2.chunks[5].op.data, df2.op.data.iloc[4:, 8:])
139141
assert df2.chunks[3].index_value._index_value._slice == slice(8, 10, 2)
140142

143+
raw = pd.DataFrame(
144+
{
145+
"a": [
146+
string.printable[i : i + 15]
147+
for i in np.random.randint(len(string.printable), size=100)
148+
],
149+
"b": np.random.rand(100),
150+
}
151+
)
152+
with option_context({"chunk_store_limit": raw["a"].memory_usage(deep=True) / 10}):
153+
df = tile(from_pandas_df(raw))
154+
# see GH#2985, empty chunks are wrongly generated
155+
assert len([ns for ns in df.nsplits[1] if ns == 0]) == 0
156+
141157

142158
def test_from_pandas_series():
143159
data = pd.Series(np.random.rand(10), name="a")

mars/dataframe/utils.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -201,10 +201,10 @@ def decide_dataframe_chunk_sizes(shape, chunk_size, memory_usage):
201201
int(np.power(max_chunk_size / nbytes_occupied, 1 / float(nleft))), 1
202202
)
203203

204-
if col_left_size == 0:
204+
if col_left_size == 0 and not col_chunk_size:
205205
col_chunk_size.append(0)
206206

207-
if row_left_size == 0:
207+
if row_left_size == 0 and not row_chunk_size:
208208
row_chunk_size.append(0)
209209

210210
# check col first

0 commit comments

Comments
 (0)