From 1b0440021f5e0a6d3556d5e80f9a3b004f70c624 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 28 Aug 2024 10:35:07 -0500 Subject: [PATCH] Handle dask-expr optimizations in sjoin Closes https://github.com/geopandas/dask-geopandas/issues/303 --- dask_geopandas/sjoin.py | 12 ++++++++++-- dask_geopandas/tests/test_sjoin.py | 11 +++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/dask_geopandas/sjoin.py b/dask_geopandas/sjoin.py index 2b3c35a..95283c8 100644 --- a/dask_geopandas/sjoin.py +++ b/dask_geopandas/sjoin.py @@ -62,6 +62,14 @@ def sjoin(left, right, how="inner", predicate="intersects", **kwargs): if isinstance(right, geopandas.GeoDataFrame): right = from_geopandas(right, npartitions=1) + if backends.QUERY_PLANNING_ON: + # We call optimize on the inputs to ensure that any optimizations + # done by dask-expr (which might change the expression, and thus the + # name of the DataFrame) happen *before* we build the HighLevelGraph. + # https://github.com/dask/dask-expr/issues/1129 + left = left.optimize() + right = right.optimize() + name = "sjoin-" + tokenize(left, right, how, predicate) meta = geopandas.sjoin(left._meta, right._meta, how=how, predicate=predicate) @@ -74,8 +82,8 @@ def sjoin(left, right, how="inner", predicate="intersects", **kwargs): how="inner", predicate="intersects", ) - parts_left = np.asarray(parts.index) - parts_right = np.asarray(parts["index_right"].values) + parts_left = np.asarray(parts.index).tolist() + parts_right = np.asarray(parts["index_right"].values).tolist() using_spatial_partitions = True else: # Unknown spatial partitions -> full cartesian (cross) product of all diff --git a/dask_geopandas/tests/test_sjoin.py b/dask_geopandas/tests/test_sjoin.py index d729fe1..63cd9cf 100644 --- a/dask_geopandas/tests/test_sjoin.py +++ b/dask_geopandas/tests/test_sjoin.py @@ -1,4 +1,5 @@ import geopandas +import shapely import dask_geopandas @@ -46,3 +47,13 @@ def test_sjoin_dask_geopandas(naturalearth_lowres, 
naturalearth_cities): # check warning with pytest.warns(FutureWarning, match="The `op` parameter is deprecated"): dask_geopandas.sjoin(df_points, ddf_polygons, op="within", how="inner") + + +def test_no_value_error(): + # https://github.com/geopandas/dask-geopandas/issues/303 + shape = shapely.geometry.box(-74.5, -74.0, 4.5, 5.0) + df = dask_geopandas.from_geopandas( + geopandas.GeoDataFrame(geometry=[shape]), npartitions=1 + ).spatial_shuffle() + # no TypeError + df.sjoin(df).compute()