Skip to content

Commit 4fbf841

Browse files
authored
Fix sort_values for empty DataFrame or Series (#2681)
1 parent 56efd30 commit 4fbf841

File tree

9 files changed

+47
-7
lines changed

9 files changed

+47
-7
lines changed

mars/dataframe/datasource/core.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,9 @@ def merge_small_files(
210210
if merged_file_size is not None:
211211
merged_file_size = parse_readable_size(merged_file_size)[0]
212212
else:
213-
merged_file_size = options.chunk_store_limit
213+
# Estimated size is relatively larger than the real one,
214+
# so we double the merged size
215+
merged_file_size = options.chunk_store_limit * 2
214216
# sample files whose size equals `n_sample_file`
215217
sampled_chunks = np.random.choice(df.chunks, n_sample_file)
216218
max_chunk_size = 0

mars/dataframe/sort/sort_values.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,11 @@ def _tile_series(cls, op):
9494

9595
@classmethod
9696
def _tile(cls, op):
97-
if op.inputs[0].ndim == 2:
97+
inp = op.inputs[0]
98+
if inp.shape[op.axis] == 0:
99+
# if the length is zero, return input directly
100+
return inp
101+
if inp.ndim == 2:
98102
return (yield from cls._tile_dataframe(op))
99103
else:
100104
return (yield from cls._tile_series(op))

mars/dataframe/sort/tests/test_sort_execution.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,21 @@ def test_sort_values_execution(setup, distinct_opt):
238238
result.reset_index(drop=True), expected.reset_index(drop=True)
239239
)
240240

241+
# test for empty input(#GH 2649)
242+
pd_df = pd.DataFrame(np.random.rand(10, 3), columns=["col1", "col2", "col3"])
243+
df = DataFrame(pd_df, chunk_size=4)
244+
df = df[df["col2"] > 1].execute()
245+
result = df.sort_values(by="col1").execute().fetch()
246+
expected = pd_df[pd_df["col2"] > 1].sort_values(by="col1")
247+
pd.testing.assert_frame_equal(result, expected)
248+
249+
pd_s = pd.Series(np.random.rand(10))
250+
s = Series(pd_s, chunk_size=4)
251+
s = s[s > 1].execute()
252+
result = s.sort_values().execute().fetch()
253+
expected = pd_s[pd_s > 1].sort_values()
254+
pd.testing.assert_series_equal(result, expected)
255+
241256

242257
def test_sort_index_execution(setup):
243258
raw = pd.DataFrame(np.random.rand(100, 20), index=np.random.rand(100))

mars/learn/metrics/pairwise/tests/test_pariwise_distances.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def test_pairwise_distances_execution(setup):
4747
weight = np.random.rand(5)
4848
d = pairwise_distances(x, y, metric="wminkowski", p=3, w=weight)
4949
result = d.execute().fetch()
50-
expected = sk_pairwise_distances(raw_x, raw_y, metric="wminkowski", p=3, w=weight)
50+
expected = sk_pairwise_distances(raw_x, raw_y, metric="minkowski", p=3, w=weight)
5151
np.testing.assert_almost_equal(result, expected)
5252

5353
# test pdist

mars/tensor/datasource/from_zarr.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,20 @@ def execute(cls, ctx, op):
4747
def fromzarr(path, group=None, dataset=None, chunk_size=None):
4848
import zarr
4949

50+
try:
51+
# since v2.11.0, zarr converts mutable mappings to KVStore
52+
from zarr.storage import KVStore as zarr_kvstore
53+
except ImportError: # pragma: no cover
54+
zarr_kvstore = None
55+
5056
if isinstance(path, zarr.Array):
5157
arr = path
52-
if isinstance(arr.store, FSMap):
58+
if zarr_kvstore is None and isinstance(arr.store, FSMap): # pragma: no cover
5359
root = arr.store.root
5460
path, dataset = root.rsplit("/", 1)
61+
elif zarr_kvstore and isinstance(arr.store, zarr_kvstore):
62+
root = arr.store._mutable_mapping.root
63+
path, dataset = root.rsplit("/", 1)
5564
else:
5665
path = arr.store.path
5766
if "/" in arr.path and group is None:

mars/tensor/random/tests/test_random_execution.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def test_sparse_randint_execution(setup):
149149
assert res.shape == (30, 50)
150150
np.testing.assert_array_less(res.data, 2)
151151
np.testing.assert_array_less(0, res.data)
152-
assert pytest.approx((res >= 1).toarray().sum(), 30 * 50 * 0.1, abs=20)
152+
assert (res >= 1).toarray().sum() == pytest.approx(30 * 50 * 0.1, abs=20)
153153

154154

155155
random_test_options = namedtuple("random_test_options", ["func_name", "args", "kwargs"])

mars/tensor/spatial/distance/cdist.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,11 @@ def cdist(XA, XB, metric="euclidean", **kwargs):
532532
"3rd argument metric must be a string identifier " "or a function."
533533
)
534534

535+
# scipy removed "wminkowski" in v1.8.0, use "minkowski" with `w=`
536+
# keyword-argument for the given weight.
537+
if metric == "wminkowski":
538+
metric = "minkowski"
539+
535540
p = kwargs.pop("p", None)
536541
w = kwargs.pop("w", None)
537542
if w is not None:

mars/tensor/spatial/distance/pdist.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -697,6 +697,11 @@ def pdist(X, metric="euclidean", **kwargs):
697697
"2nd argument metric must be a string identifier " "or a function."
698698
)
699699

700+
# scipy removed "wminkowski" in v1.8.0, use "minkowski" with `w=`
701+
# keyword-argument for the given weight.
702+
if metric == "wminkowski":
703+
metric = "minkowski"
704+
700705
p = kwargs.pop("p", None)
701706
w = kwargs.pop("w", None)
702707
if w is not None:

mars/tensor/spatial/distance/tests/test_distance_execution.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def test_pdist_execution(setup):
8181
w = tensor(weight, chunk_size=7)
8282
dist = distance.pdist(x, metric="wminkowski", p=3, w=w)
8383
result = dist.execute().fetch()
84-
expected = sp_pdist(raw, metric="wminkowski", p=3, w=weight)
84+
expected = sp_pdist(raw, metric="minkowski", p=3, w=weight)
8585
np.testing.assert_array_equal(result, expected)
8686

8787
# test V
@@ -157,7 +157,7 @@ def test_cdist_execution(setup):
157157
w = tensor(weight, chunk_size=7)
158158
dist = distance.cdist(xa, xb, metric="wminkowski", p=3, w=w)
159159
result = dist.execute().fetch()
160-
expected = sp_cdist(raw_a, raw_b, metric="wminkowski", p=3, w=weight)
160+
expected = sp_cdist(raw_a, raw_b, metric="minkowski", p=3, w=weight)
161161
np.testing.assert_array_equal(result, expected)
162162

163163
# test V

0 commit comments

Comments
 (0)