From a54c937cf3e78472d19b02871be4b3ff287e83d2 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 16 Apr 2024 21:50:02 -0400 Subject: [PATCH 01/11] Hook spatial predicates up to FlexiJoins.jl by an extension --- Project.toml | 3 ++ .../GeometryOpsFlexiJoinsExt.jl | 34 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl diff --git a/Project.toml b/Project.toml index bf1184a99..7ea40437a 100644 --- a/Project.toml +++ b/Project.toml @@ -12,13 +12,16 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [weakdeps] +FlexiJoins = "e37f2e79-19fa-4eb7-8510-b63b51fe0a37" Proj = "c94c279d-25a6-4763-9509-64d165bea63e" [extensions] +GeometryOpsFlexiJoinsExt = "FlexiJoins" GeometryOpsProjExt = "Proj" [compat] CoordinateTransformations = "0.5, 0.6" +FlexiJoins = "0.1.30" GeoInterface = "1.2" GeometryBasics = "0.4.7" LinearAlgebra = "1" diff --git a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl new file mode 100644 index 000000000..8d06c9fd8 --- /dev/null +++ b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl @@ -0,0 +1,34 @@ +module GeometryOpsFlexiJoinsExt + +using GeometryOps +import GeometryOps as GO +using FlexiJoins + +# This module defines the FlexiJoins APIs for GeometryOps' boolean comparison functions, taken from DE-9IM. + +# For now, we only allow regular n^2 loops for the predicates, primarily because I'm not entirely sure how to do the "fast" mode for these. +# TODO: fix that and allow faster joins. + +# First, we define that FlexiJoins supports the "NestedLoopFast" mode for the predicates. + +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.contains)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.within)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.intersects)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.disjoint)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.touches)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.crosses)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.overlaps)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.covers)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.coveredby)}, datas) = true + +# Next, just in case, we define the `swap_sides` function for those predicates which are defined as inversions. + +FlexiJoins.swap_sides(::typeof(GO.contains)) = GO.within +FlexiJoins.swap_sides(::typeof(GO.within)) = GO.contains +FlexiJoins.swap_sides(::typeof(GO.coveredby)) = GO.covers +FlexiJoins.swap_sides(::typeof(GO.covers)) = GO.coveredby + +# That's a wrap, folks! + +end + From a720e96783c93f8006f6066243498384fa9d6b5c Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 16 Apr 2024 21:50:16 -0400 Subject: [PATCH 02/11] Add a tutorial on how to use spatial joins via FlexiJoin --- docs/Project.toml | 5 +++ docs/make.jl | 3 ++ docs/src/tutorials/spatial_joins.md | 61 +++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 docs/src/tutorials/spatial_joins.md diff --git a/docs/Project.toml b/docs/Project.toml index 1401255a5..d10a588eb 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -10,6 +10,9 @@ DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" DocumenterVitepress = "4710194d-e776-4893-9690-8d956a29c365" +DoubleFloats = "497a8b3b-efae-58df-a0af-a86822472b78" +FlexiJoins = "e37f2e79-19fa-4eb7-8510-b63b51fe0a37" +GADM = "a8dd9ffe-31dc-4cf5-a379-ea69100a8233" GeoDatasets = "ddc7317b-88db-5cb5-a849-8449e5df04f9" GeoInterface = "cf35fbd7-0cd7-5166-be24-54bfbe79505f" GeoInterfaceMakie = "0edc0954-3250-4c18-859d-ec71c1660c08" @@ -20,6 +23,8 @@ LibGEOS = "a90b1aa1-3769-5649-ba7e-abc5a9d163eb" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" MakieThemes = "e296ed71-da82-5faf-88ab-0034a9761098" +Measurements = "eff96d63-e80a-5855-80a2-b1b0885c5ab7" +MonteCarloMeasurements = "0987c9cc-fe09-11e8-30f0-b96dd679fdca" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Proj = "c94c279d-25a6-4763-9509-64d165bea63e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/docs/make.jl b/docs/make.jl index 7cbef9434..398bb39c4 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -91,6 +91,9 @@ makedocs(; pages=[ "Introduction" => "introduction.md", "API Reference" => "api.md", + "Tutorials" => [ + "Spatial Joins" => "tutorials/spatial_joins.md", + ], "Source code" => literate_pages, ], warnonly = true, diff --git a/docs/src/tutorials/spatial_joins.md b/docs/src/tutorials/spatial_joins.md new file mode 100644 index 000000000..9420cf169 --- /dev/null +++ b/docs/src/tutorials/spatial_joins.md @@ -0,0 +1,61 @@ +# Spatial joins + +Spatial joins are joins which are based not on equality, but on some predicate ``p(x, y)``, which takes two geometries, and returns a value of either `true` or `false`. For geometries, the `DE + +In this tutorial, we will show how to perform a spatial join on first a toy dataset and then two Natural Earth datasets, to show how this can be used in the real world. + +## Simple example + +This example demonstrates how to perform a spatial join between two datasets: a set of polygons and a set of randomly generated points. + +The polygons are represented as a DataFrame with geometries and colors, while the points are stored in a separate DataFrame. + +The spatial join is performed using the `contains` predicate from GeometryOps, which checks if each point is contained within any of the polygons. The resulting joined DataFrame is then used to plot the points, colored according to the containing polygon. + +First, we generate our data. We create two triangle polygons which, together, span the rectangle (0, 0, 1, 1), and a set of points which are randomly distributed within this rectangle. + +```@example spatialjoins +import GeoInterface as GI, GeometryOps as GO +using FlexiJoins, DataFrames + +using CairoMakie, GeoInterfaceMakie + +Makie.set_theme!(Attributes(; Axis = (; aspect = DataAspect()))) # hide + +pl = GI.Polygon([GI.LinearRing([(0, 0), (1, 0), (1, 1), (0, 0)])]) +pu = GI.Polygon([GI.LinearRing([(0, 0), (0, 1), (1, 1), (0, 0)])]) +poly_df = DataFrame(geometry = [pl, pu], color = [:red, :blue]) +f, a, p = poly(poly_df.geometry; color = tuple.(poly_df.color, 0.7)) +``` + +Here, the upper polygon is blue, and the lower polygon is red. Keep this in mind! + +Now, we generate the points. + +```@example spatialjoins +points = tuple.(rand(100), rand(100)) +points_df = DataFrame(geometry = points) +scatter!(points_df.geometry) +f +``` + +You can see that they are evenly distributed around the box. But how do we know which points are in which polygons? + +The answer here is to perform a spatial join. + +Now, we can perform the "spatial join" using FlexiJoins. We are performing an outer join here + +```@example spatialjoin +joined_df = FlexiJoins.innerjoin((poly_df, points_df), by_pred(:geometry, GO.contains, :geometry)) +``` + +```@example spatialjoin +scatter(joined_df.geometry_1; color = joined_df.color) +``` + +Here, you can see that the colors were assigned appropriately to the scattered points! + +## Real-world example + +Suppose I have a list of polygons representing administrative regions (or mining sites, or what have you), and I have a list of polygons for each country. I want to find the country each region is in. + From d37642afe4e791292efd3c26378727f87af1f8fd Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 16 Apr 2024 22:02:20 -0400 Subject: [PATCH 03/11] Add tests --- Project.toml | 3 ++- test/extensions/flexijoins.jl | 22 ++++++++++++++++++++++ test/runtests.jl | 2 ++ 3 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 test/extensions/flexijoins.jl diff --git a/Project.toml b/Project.toml index 7ea40437a..f8861d0fd 100644 --- a/Project.toml +++ b/Project.toml @@ -35,6 +35,7 @@ ArchGDAL = "c9ce4bd3-c3d5-55b8-8973-c0e20141b8c3" CoordinateTransformations = "150eb455-5306-5404-9cee-2592286d6298" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +FlexiJoins = "e37f2e79-19fa-4eb7-8510-b63b51fe0a37" GeoFormatTypes = "68eda718-8dee-11e9-39e7-89f7f65f511f" GeoJSON = "61d90e0f-e114-555e-ac52-39dfb47a3ef9" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" @@ -45,4 +46,4 @@ Shapefile = "8e980c4a-a4fe-5da2-b3a7-4b4b0353a2f4" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["ArchGDAL", "CoordinateTransformations", "DataFrames", "Distributions", "GeoFormatTypes", "GeoJSON", "Proj", "JLD2", "LibGEOS", "Random", "Shapefile", "Test"] +test = ["ArchGDAL", "CoordinateTransformations", "DataFrames", "Distributions", "FlexiJoins", "GeoFormatTypes", "GeoJSON", "Proj", "JLD2", "LibGEOS", "Random", "Shapefile", "Test"] diff --git a/test/extensions/flexijoins.jl b/test/extensions/flexijoins.jl new file mode 100644 index 000000000..80e0432ac --- /dev/null +++ b/test/extensions/flexijoins.jl @@ -0,0 +1,22 @@ +import GeometryOps as GO, GeoInterface as GI +using FlexiJoins, DataFrames + +pl = GI.Polygon([GI.LinearRing([(0, 0), (1, 0), (1, 1), (0, 0)])]) +pu = GI.Polygon([GI.LinearRing([(0, 0), (0, 1), (1, 1), (0, 0)])]) +poly_df = DataFrame(geometry = [pl, pu], color = [:red, :blue]) + +points = tuple.(rand(100), rand(100)) +points_df = DataFrame(geometry = points) + +@testset "Basic integration" begin + + @test_nowarn joined_df = FlexiJoins.innerjoin((poly_df, points_df), by_pred(:geometry, GO.contains, :geometry)) + # Test that the join happened correctly + joined_df = FlexiJoins.innerjoin((poly_df, points_df), by_pred(:geometry, GO.contains, :geometry)) + @test all(GO.contains.((pl,), joined_df.geometry_1[joined_df.color .== :red])) + @test all(GO.contains.((pu,), joined_df.geometry_1[joined_df.color .== :blue])) + # Test that within also works + @test_nowarn joined_df = FlexiJoins.innerjoin((points_df, poly_df), by_pred(:geometry, GO.within, :geometry)) + +end + diff --git a/test/runtests.jl b/test/runtests.jl index eca987fd6..f33af4a3c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -41,4 +41,6 @@ const GO = GeometryOps include("transformations/correction/closed_ring.jl") include("transformations/correction/intersecting_polygons.jl") end + # # Extensions + @testset "FlexiJoins" begin include("extensions/flexijoins.jl") end end From c69208faddff0c82e934df245255f2de37358db8 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 16 Apr 2024 22:07:43 -0400 Subject: [PATCH 04/11] Allow `GO.equals` as a predicate --- ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl index 8d06c9fd8..118bfbf6c 100644 --- a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl +++ b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl @@ -20,6 +20,7 @@ FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{t FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.overlaps)}, datas) = true FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.covers)}, datas) = true FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.coveredby)}, datas) = true +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.equals)}, datas) = true # Next, just in case, we define the `swap_sides` function for those predicates which are defined as inversions. From 9bfb7699bccb21a6eebedf2bde4e9202b49626d4 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 16 Apr 2024 22:07:50 -0400 Subject: [PATCH 05/11] Improve docs significantly --- docs/src/tutorials/spatial_joins.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/src/tutorials/spatial_joins.md b/docs/src/tutorials/spatial_joins.md index 9420cf169..fcbc7a7a7 100644 --- a/docs/src/tutorials/spatial_joins.md +++ b/docs/src/tutorials/spatial_joins.md @@ -1,9 +1,22 @@ # Spatial joins -Spatial joins are joins which are based not on equality, but on some predicate ``p(x, y)``, which takes two geometries, and returns a value of either `true` or `false`. For geometries, the `DE +Spatial joins are joins which are based not on equality, but on some predicate ``p(x, y)``, which takes two geometries, and returns a value of either `true` or `false`. For geometries, the [`DE-9IM`](https://en.wikipedia.org/wiki/DE-9IM) spatial relationship model is used to determine the spatial relationship between two geometries. In this tutorial, we will show how to perform a spatial join on first a toy dataset and then two Natural Earth datasets, to show how this can be used in the real world. +In order to perform the spatial join, we use [FlexiJoins.jl](https://github.com/JuliaAPlavin/FlexiJoins.jl) to perform the join, specifically using its `by_pred` joining method. This allows the user to specify a predicate in the following manner: +```julia +[inner/left/outer/...]join((table1, table1), + by_pred(:table1_column, predicate_function, :table2_column) +) +``` + +We have enabled the use of all of GeometryOps' boolean comparisons here. These are: + +```julia +GO.contains, GO.within, GO.intersects, GO.touches, GO.crosses, GO.disjoint, GO.overlaps, GO.covers, GO.coveredby, GO.equals +``` + ## Simple example This example demonstrates how to perform a spatial join between two datasets: a set of polygons and a set of randomly generated points. @@ -25,7 +38,7 @@ Makie.set_theme!(Attributes(; Axis = (; aspect = DataAspect()))) # hide pl = GI.Polygon([GI.LinearRing([(0, 0), (1, 0), (1, 1), (0, 0)])]) pu = GI.Polygon([GI.LinearRing([(0, 0), (0, 1), (1, 1), (0, 0)])]) poly_df = DataFrame(geometry = [pl, pu], color = [:red, :blue]) -f, a, p = poly(poly_df.geometry; color = tuple.(poly_df.color, 0.7)) +f, a, p = poly(poly_df.geometry; color = tuple.(poly_df.color, 0.3)) ``` Here, the upper polygon is blue, and the lower polygon is red. Keep this in mind! From be8b0e27255b6aefc10ae38592012e6186247cad Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 16 Apr 2024 22:36:37 -0400 Subject: [PATCH 06/11] Fix example name --- docs/src/tutorials/spatial_joins.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/src/tutorials/spatial_joins.md b/docs/src/tutorials/spatial_joins.md index fcbc7a7a7..4d222e2bf 100644 --- a/docs/src/tutorials/spatial_joins.md +++ b/docs/src/tutorials/spatial_joins.md @@ -58,11 +58,14 @@ The answer here is to perform a spatial join. Now, we can perform the "spatial join" using FlexiJoins. We are performing an outer join here -```@example spatialjoin -joined_df = FlexiJoins.innerjoin((poly_df, points_df), by_pred(:geometry, GO.contains, :geometry)) +```@example spatialjoins +joined_df = FlexiJoins.innerjoin( + (poly_df, points_df), + by_pred(:geometry, GO.contains, :geometry) +) ``` -```@example spatialjoin +```@example spatialjoins scatter(joined_df.geometry_1; color = joined_df.color) ``` From a34f9049d47a75a52da8217e0ce286fa457abd1c Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Tue, 16 Apr 2024 22:45:24 -0400 Subject: [PATCH 07/11] Hide theming without causing global effects --- docs/src/tutorials/spatial_joins.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/tutorials/spatial_joins.md b/docs/src/tutorials/spatial_joins.md index 4d222e2bf..0ae2d4657 100644 --- a/docs/src/tutorials/spatial_joins.md +++ b/docs/src/tutorials/spatial_joins.md @@ -33,12 +33,12 @@ using FlexiJoins, DataFrames using CairoMakie, GeoInterfaceMakie -Makie.set_theme!(Attributes(; Axis = (; aspect = DataAspect()))) # hide - pl = GI.Polygon([GI.LinearRing([(0, 0), (1, 0), (1, 1), (0, 0)])]) pu = GI.Polygon([GI.LinearRing([(0, 0), (0, 1), (1, 1), (0, 0)])]) poly_df = DataFrame(geometry = [pl, pu], color = [:red, :blue]) +f, a, p = Makie.with_theme(Attributes(; Axis = (; aspect = DataAspect()))) do # hide f, a, p = poly(poly_df.geometry; color = tuple.(poly_df.color, 0.3)) +end # hide ``` Here, the upper polygon is blue, and the lower polygon is red. Keep this in mind! From 15098bc6bb5efe29c9f5e4af68c0775d3f62d080 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Wed, 17 Apr 2024 15:20:54 -0400 Subject: [PATCH 08/11] Clean up the implementation --- .../GeometryOpsFlexiJoinsExt.jl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl index 118bfbf6c..d7586a999 100644 --- a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl +++ b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl @@ -1,15 +1,18 @@ module GeometryOpsFlexiJoinsExt using GeometryOps -import GeometryOps as GO using FlexiJoins -# This module defines the FlexiJoins APIs for GeometryOps' boolean comparison functions, taken from DE-9IM. +import GeometryOps as GO, GeoInterface as GI +using SortTileRecursiveTree, Tables + -# For now, we only allow regular n^2 loops for the predicates, primarily because I'm not entirely sure how to do the "fast" mode for these. -# TODO: fix that and allow faster joins. +# This module defines the FlexiJoins APIs for GeometryOps' boolean comparison functions, taken from DE-9IM. -# First, we define that FlexiJoins supports the "NestedLoopFast" mode for the predicates. +# First, we define the joining modes (Tree, NestedLoopFast) that the GO DE-9IM functions support. +const GO_DE9IM_FUNCS = Union{typeof(GO.contains), typeof(GO.within), typeof(GO.intersects), typeof(GO.disjoint), typeof(GO.touches), typeof(GO.crosses), typeof(GO.overlaps), typeof(GO.covers), typeof(GO.coveredby), typeof(GO.equals)} +# NestedLoopFast is the naive fallback method +FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{F}, datas) where F <: GO_DE9IM_FUNCS = true FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.contains)}, datas) = true FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.within)}, datas) = true From 76e068e97fae7e2c4687b0e661a40cd254f9328e Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Wed, 17 Apr 2024 15:21:15 -0400 Subject: [PATCH 09/11] Implement tree based joins for speed on larger datasets --- Project.toml | 1 + docs/src/tutorials/spatial_joins.md | 50 ++++++++++++++++--- .../GeometryOpsFlexiJoinsExt.jl | 34 ++++++++----- 3 files changed, 65 insertions(+), 20 deletions(-) diff --git a/Project.toml b/Project.toml index f8861d0fd..0558ee481 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ CoordinateTransformations = "150eb455-5306-5404-9cee-2592286d6298" GeoInterface = "cf35fbd7-0cd7-5166-be24-54bfbe79505f" GeometryBasics = "5c1252a2-5f33-56bf-86c9-59e7332b4326" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +SortTileRecursiveTree = "746ee33f-1797-42c2-866d-db2fce69d14d" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" diff --git a/docs/src/tutorials/spatial_joins.md b/docs/src/tutorials/spatial_joins.md index 0ae2d4657..ca9122823 100644 --- a/docs/src/tutorials/spatial_joins.md +++ b/docs/src/tutorials/spatial_joins.md @@ -6,8 +6,8 @@ In this tutorial, we will show how to perform a spatial join on first a toy data In order to perform the spatial join, we use [FlexiJoins.jl](https://github.com/JuliaAPlavin/FlexiJoins.jl) to perform the join, specifically using its `by_pred` joining method. This allows the user to specify a predicate in the following manner: ```julia -[inner/left/outer/...]join((table1, table1), - by_pred(:table1_column, predicate_function, :table2_column) +[inner/left/right/outer/...]join((table1, table1), + by_pred(:table1_column, predicate_function, :table2_column) # & add other conditions here ) ``` @@ -17,6 +17,9 @@ We have enabled the use of all of GeometryOps' boolean comparisons here. These GO.contains, GO.within, GO.intersects, GO.touches, GO.crosses, GO.disjoint, GO.overlaps, GO.covers, GO.coveredby, GO.equals ``` +!!! tip + Always place the dataframe with more complex geometries second, as that is the one which will be sorted into a tree. + ## Simple example This example demonstrates how to perform a spatial join between two datasets: a set of polygons and a set of randomly generated points. @@ -46,7 +49,7 @@ Here, the upper polygon is blue, and the lower polygon is red. Keep this in min Now, we generate the points. ```@example spatialjoins -points = tuple.(rand(100), rand(100)) +points = tuple.(rand(1000), rand(1000)) points_df = DataFrame(geometry = points) scatter!(points_df.geometry) f @@ -54,19 +57,20 @@ f You can see that they are evenly distributed around the box. But how do we know which points are in which polygons? -The answer here is to perform a spatial join. +We have to join the two dataframes based on which polygon (if any) each point lies within. Now, we can perform the "spatial join" using FlexiJoins. We are performing an outer join here ```@example spatialjoins -joined_df = FlexiJoins.innerjoin( - (poly_df, points_df), - by_pred(:geometry, GO.contains, :geometry) +@time joined_df = FlexiJoins.innerjoin( + (points_df, poly_df), + by_pred(:geometry, GO.within, :geometry) ) ``` ```@example spatialjoins -scatter(joined_df.geometry_1; color = joined_df.color) +scatter!(a, joined_df.geometry; color = joined_df.color) +f ``` Here, you can see that the colors were assigned appropriately to the scattered points! @@ -75,3 +79,33 @@ Here, you can see that the colors were assigned appropriately to the scattered p Suppose I have a list of polygons representing administrative regions (or mining sites, or what have you), and I have a list of polygons for each country. I want to find the country each region is in. +```julia real +import GeoInterface as GI, GeometryOps as GO +using FlexiJoins, DataFrames, GADM # GADM gives us country and sublevel geometry + +using CairoMakie, GeoInterfaceMakie + +country_df = GADM.get.(["JPN", "USA", "IND", "DEU", "FRA"]) |> DataFrame +country_df.geometry = GI.GeometryCollection.(GO.tuples.(country_df.geom)) + +state_doublets = [ + ("USA", "New York"), + ("USA", "California"), + ("IND", "Karnataka"), + ("DEU", "Berlin"), + ("FRA", "Grand Est"), + ("JPN", "Tokyo"), +] + +state_full_df = (x -> GADM.get(x...)).(state_doublets) |> DataFrame +state_full_df.geom = GO.tuples.(only.(state_full_df.geom)) +state_compact_df = state_full_df[:, [:geom, :NAME_1]] +``` + +```julia real +innerjoin((state_compact_df, country_df), by_pred(:geom, GO.within, :geometry)) +innerjoin((state_compact_df, view(country_df, 1:1, :)), by_pred(:geom, GO.within, :geometry)) +``` + +!!! warning + This is how you would do this, but it doesn't work yet, since the GeometryOps predicates are quite slow on large polygons. If you try this, the code will continue to run for a very, very long time (it took 12 hours on my laptop, but with minimal CPU usage). \ No newline at end of file diff --git a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl index d7586a999..5b08a6dcf 100644 --- a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl +++ b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl @@ -13,19 +13,29 @@ using SortTileRecursiveTree, Tables const GO_DE9IM_FUNCS = Union{typeof(GO.contains), typeof(GO.within), typeof(GO.intersects), typeof(GO.disjoint), typeof(GO.touches), typeof(GO.crosses), typeof(GO.overlaps), typeof(GO.covers), typeof(GO.coveredby), typeof(GO.equals)} # NestedLoopFast is the naive fallback method FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{F}, datas) where F <: GO_DE9IM_FUNCS = true +# This method allows you to cache a tree, which we do by using an STRtree. +# TODO: wrap GO predicate functions in a `TreeJoiner` struct or something, to indicate that we want to use trees, +# since they can be slower in some situations. +FlexiJoins.supports_mode(::FlexiJoins.Mode.Tree, ::FlexiJoins.ByPred{F}, datas) where F <: GO_DE9IM_FUNCS = true + +# Nested loop support is simple, and needs no further support. +# However, for trees, we need to define how the tree is prepared and how it is used. +# This is done by defining the `prepare_for_join` function to return an STRTree, +# and by defining the `findmatchix` function as querying that tree before checking +# intersections. + +# In theory, one could extract the tree from e.g a GeoPackage or some future GeoDataFrame. + +FlexiJoins.prepare_for_join(::FlexiJoins.Mode.Tree, X, cond::FlexiJoins.ByPred{<: GO_DE9IM_FUNCS}) = (X, SortTileRecursiveTree.STRtree(cond.Rf(X))) +function FlexiJoins.findmatchix(::FlexiJoins.Mode.Tree, cond::FlexiJoins.ByPred{F}, ix_a, a, (B, tree)::Tuple, multi::typeof(identity)) where F <: GO_DE9IM_FUNCS + idxs = SortTileRecursiveTree.query(tree, a) + intersecting_idxs = filter(idxs) do idx + cond.pred(a, cond.Rf(B)[idx]) + end + return intersecting_idxs +end -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.contains)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.within)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.intersects)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.disjoint)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.touches)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.crosses)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.overlaps)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.covers)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.coveredby)}, datas) = true -FlexiJoins.supports_mode(::FlexiJoins.Mode.NestedLoopFast, ::FlexiJoins.ByPred{typeof(GO.equals)}, datas) = true - -# Next, just in case, we define the `swap_sides` function for those predicates which are defined as inversions. +# Finally, for completeness, we define the `swap_sides` function for those predicates which are defined as inversions. FlexiJoins.swap_sides(::typeof(GO.contains)) = GO.within FlexiJoins.swap_sides(::typeof(GO.within)) = GO.contains From da6fa4c70a311b4deea054232fbddc179f5776a4 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Wed, 17 Apr 2024 16:36:54 -0400 Subject: [PATCH 10/11] Apply suggestions from code review Co-authored-by: Alexander Plavin --- ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl index 5b08a6dcf..165a52495 100644 --- a/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl +++ b/ext/GeometryOpsFlexiJoinsExt/GeometryOpsFlexiJoinsExt.jl @@ -26,11 +26,11 @@ FlexiJoins.supports_mode(::FlexiJoins.Mode.Tree, ::FlexiJoins.ByPred{F}, datas) # In theory, one could extract the tree from e.g a GeoPackage or some future GeoDataFrame. -FlexiJoins.prepare_for_join(::FlexiJoins.Mode.Tree, X, cond::FlexiJoins.ByPred{<: GO_DE9IM_FUNCS}) = (X, SortTileRecursiveTree.STRtree(cond.Rf(X))) +FlexiJoins.prepare_for_join(::FlexiJoins.Mode.Tree, X, cond::FlexiJoins.ByPred{<: GO_DE9IM_FUNCS}) = (X, SortTileRecursiveTree.STRtree(map(cond.Rf, X))) function FlexiJoins.findmatchix(::FlexiJoins.Mode.Tree, cond::FlexiJoins.ByPred{F}, ix_a, a, (B, tree)::Tuple, multi::typeof(identity)) where F <: GO_DE9IM_FUNCS - idxs = SortTileRecursiveTree.query(tree, a) - intersecting_idxs = filter(idxs) do idx - cond.pred(a, cond.Rf(B)[idx]) + idxs = SortTileRecursiveTree.query(tree, cond.Lf(a)) + intersecting_idxs = filter!(idxs) do idx + cond.pred(a, cond.Rf(B[idx])) end return intersecting_idxs end From 9639b44f82c6c57dc8c79c913d4e1afa10a57629 Mon Sep 17 00:00:00 2001 From: Anshul Singhvi Date: Fri, 19 Apr 2024 14:18:45 -0400 Subject: [PATCH 11/11] Add instructions on how to add custom predicates --- docs/src/tutorials/spatial_joins.md | 30 ++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/docs/src/tutorials/spatial_joins.md b/docs/src/tutorials/spatial_joins.md index ca9122823..61af4baf7 100644 --- a/docs/src/tutorials/spatial_joins.md +++ b/docs/src/tutorials/spatial_joins.md @@ -1,10 +1,12 @@ # Spatial joins -Spatial joins are joins which are based not on equality, but on some predicate ``p(x, y)``, which takes two geometries, and returns a value of either `true` or `false`. For geometries, the [`DE-9IM`](https://en.wikipedia.org/wiki/DE-9IM) spatial relationship model is used to determine the spatial relationship between two geometries. +Spatial joins are [table joins](https://www.geeksforgeeks.org/sql-join-set-1-inner-left-right-and-full-joins/) which are based not on equality, but on some predicate ``p(x, y)``, which takes two geometries, and returns a value of either `true` or `false`. For geometries, the [`DE-9IM`](https://en.wikipedia.org/wiki/DE-9IM) spatial relationship model is used to determine the spatial relationship between two geometries. + +Spatial joins can be done between any geometry types (from geometrycollections to points), just as geometrical predicates can be evaluated on any geometries. In this tutorial, we will show how to perform a spatial join on first a toy dataset and then two Natural Earth datasets, to show how this can be used in the real world. -In order to perform the spatial join, we use [FlexiJoins.jl](https://github.com/JuliaAPlavin/FlexiJoins.jl) to perform the join, specifically using its `by_pred` joining method. This allows the user to specify a predicate in the following manner: +In order to perform the spatial join, we use **[FlexiJoins.jl](https://github.com/JuliaAPlavin/FlexiJoins.jl)** to perform the join, specifically using its `by_pred` joining method. This allows the user to specify a predicate in the following manner: ```julia [inner/left/right/outer/...]join((table1, table1), by_pred(:table1_column, predicate_function, :table2_column) # & add other conditions here @@ -108,4 +110,26 @@ innerjoin((state_compact_df, view(country_df, 1:1, :)), by_pred(:geom, GO.withi ``` !!! warning - This is how you would do this, but it doesn't work yet, since the GeometryOps predicates are quite slow on large polygons. If you try this, the code will continue to run for a very, very long time (it took 12 hours on my laptop, but with minimal CPU usage). \ No newline at end of file + This is how you would do this, but it doesn't work yet, since the GeometryOps predicates are quite slow on large polygons. If you try this, the code will continue to run for a very, very long time (it took 12 hours on my laptop, but with minimal CPU usage). + +## Enabling custom predicates + +In case you want to use a custom predicate, you only need to define a method to tell FlexiJoins how to use it. + +For example, let's suppose you wanted to perform a spatial join on geometries which are some distance away from each other: + +```julia +my_predicate_function = <(5) ∘ abs ∘ GO.distance +``` + +You would need to define `FlexiJoins.supports_mode` on your predicate: + +```julia{3} +FlexiJoins.supports_mode( + ::FlexiJoins.Mode.NestedLoopFast, + ::FlexiJoins.ByPred{typeof(my_predicate_function)}, + datas +) = true +``` + +This will enable FlexiJoins to support your custom function, when it's passed to `by_pred(:geometry, my_predicate_function, :geometry)`.