Skip to content

Commit 6ed0d61

Browse files
authored
add a DataFrames extension for better table reconstruction (#312)
1 parent a741edc commit 6ed0d61

File tree

5 files changed

+128
-17
lines changed

5 files changed

+128
-17
lines changed

GeometryOpsCore/src/apply.jl

Lines changed: 63 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,11 @@ with the same schema, but with the new geometry column.
175175
This new table may be of the same type as the old one iff `Tables.materializer` is defined for
176176
that table. If not, then a `NamedTuple` is returned.
177177
=#
178-
function _apply_table(f::F, target, iterable::IterableType; geometrycolumn = nothing, preserve_default_metadata = false, threaded, kw...) where {F, IterableType}
179-
_get_col_pair(colname) = colname => Tables.getcolumn(iterable, colname)
180-
# We extract the geometry column and run `apply` on it.
178+
function _apply_table(f::F, target, iterable::IterableType; geometrycolumn = nothing, preserve_default_metadata = false, threaded, kw...) where {F, IterableType} # We extract the geometry column and run `apply` on it.
179+
# First, we need the table schema:
180+
input_schema = Tables.schema(iterable)
181+
input_colnames = input_schema.names
182+
# then, we find the geometry column(s)
181183
geometry_columns = if isnothing(geometrycolumn)
182184
GI.geometrycolumns(iterable)
183185
elseif geometrycolumn isa NTuple{N, <: Symbol} where N
@@ -187,31 +189,31 @@ function _apply_table(f::F, target, iterable::IterableType; geometrycolumn = not
187189
else
188190
throw(ArgumentError("geometrycolumn must be a Symbol or a tuple of Symbols, got a $(typeof(geometrycolumn))"))
189191
end
190-
if !all(Base.Fix2(in, Tables.columnnames(iterable)), geometry_columns)
192+
if !Base.issubset(geometry_columns, input_colnames)
191193
throw(ArgumentError(
192194
"""
193195
`apply`: the `geometrycolumn` kwarg must be a subset of the column names of the table,
194196
got $(geometry_columns)
195197
but the table has columns
196-
$(Tables.columnnames(iterable))
198+
$(input_colnames)
197199
"""
198200
))
199201
end
202+
# here we apply the function to the geometry column(s).
203+
apply_kw = if isempty(used_reconstruct_table_kwargs(iterable))
204+
kw
205+
else
206+
Base.structdiff(values(kw), NamedTuple{used_reconstruct_table_kwargs(iterable)})
207+
end
200208
new_geometry_vecs = map(geometry_columns) do colname
201-
_apply(f, target, Tables.getcolumn(iterable, colname); threaded, kw...)
209+
_apply(f, target, Tables.getcolumn(iterable, colname); threaded, apply_kw...)
202210
end
203-
# Then, we obtain the schema of the table,
204-
old_schema = Tables.schema(iterable)
205-
# filter the geometry column out,
206-
new_names = filter(x -> !(x in geometry_columns), old_schema.names)
211+
# Then, we filter the geometry column(s) out,
212+
new_names = filter(x -> !(x in geometry_columns), input_colnames)
207213
# and try to rebuild the same table as the best type - either the original type of `iterable`,
208214
# or a named tuple which is the default fallback.
209-
result = Tables.materializer(iterable)(
210-
merge(
211-
NamedTuple{geometry_columns, Base.Tuple{typeof.(new_geometry_vecs)...}}(new_geometry_vecs),
212-
NamedTuple(Iterators.map(_get_col_pair, new_names))
213-
)
214-
)
215+
# See `reconstruct_table` (defined further below) for the actual fallback implementation.
216+
result = reconstruct_table(iterable, geometry_columns, new_geometry_vecs, new_names; kw...)
215217
# Finally, we ensure that metadata is propagated correctly.
216218
# This can only happen if the original table supports metadata reads,
217219
# and the result supports metadata writes.
@@ -246,6 +248,51 @@ function _apply_table(f::F, target, iterable::IterableType; geometrycolumn = not
246248
return result
247249
end
248250

251+
"""
    used_reconstruct_table_kwargs(input)

Return a tuple with the names of the keyword arguments that `reconstruct_table`
consumes for the given `input`.

`_apply_table` removes these names from the keyword arguments it forwards to `_apply`,
while still handing the complete set of keyword arguments to `reconstruct_table`.

The generic fallback claims no keyword arguments and returns the empty tuple `()`.

This is "semi-public" API, and required for any input type that defines `reconstruct_table`.
"""
used_reconstruct_table_kwargs(input) = ()
262+
"""
    reconstruct_table(input, geometry_column_names, geometry_columns, other_column_names, args...; kwargs...)

Reconstruct a table from the given input, geometry column names,
geometry columns, and other column names.

Any input type that implements `reconstruct_table` must also implement
`used_reconstruct_table_kwargs` for that type.

# Arguments
- `input`: the original table. It must satisfy `Tables.istable(input)`.
- `geometry_column_names`: the names of the geometry columns, used as the names of a `NamedTuple`.
- `geometry_columns`: the new geometry columns, in the same order as `geometry_column_names`.
- `other_column_names`: the names of the columns that are read from `input` unchanged.
- `args...`, `kwargs...`: accepted and ignored by the fallback implementation.

# Returns
A best-effort attempt at a table of the same type as the input,
with the new geometry column(s) and other columns.

The fallback implementation builds a `NamedTuple` of columns (geometry columns first,
followed by the other columns) and passes it to `Tables.materializer(input)`.
But if you want to be efficient and pass e.g. arbitrary kwargs to the materializer,
or materialize in a different way, you can do so by overloading this function
for your desired input type.

# Throws
- `ArgumentError` if `input` is not a table according to `Tables.istable`.

This is "semi-public" API and while it may add optional arguments, it will not add new required
positional arguments. All implementations must allow arbitrary kwargs to pass through and harvest
what they need.
"""
function reconstruct_table(
        input, geometry_column_names, geometry_columns, other_column_names, args...;
        kwargs...
    )
    # Validate explicitly rather than with `@assert`: assertions are meant for internal
    # invariants and are not guaranteed to run, whereas this is a check on caller input.
    Tables.istable(input) || throw(ArgumentError(
        "`reconstruct_table`: `input` must be a table, got a $(typeof(input))"
    ))

    _get_col_pair(colname) = colname => Tables.getcolumn(input, colname)

    # The new geometry columns, keyed by their column names.
    geometry_part = NamedTuple{geometry_column_names, Base.Tuple{typeof.(geometry_columns)...}}(geometry_columns)
    # Every remaining column is fetched from `input` as it is.
    other_part = NamedTuple(Iterators.map(_get_col_pair, other_column_names))

    return Tables.materializer(input)(merge(geometry_part, other_part))
end
295+
249296
# Rewrap all FeatureCollectionTrait feature collections as GI.FeatureCollection
250297
# Maybe use threads to call _apply on component features
251298
@inline function _apply(f::F, target, ::GI.FeatureCollectionTrait, fc;

Project.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
2121
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
2222

2323
[weakdeps]
24+
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
2425
FlexiJoins = "e37f2e79-19fa-4eb7-8510-b63b51fe0a37"
2526
LibGEOS = "a90b1aa1-3769-5649-ba7e-abc5a9d163eb"
2627
Proj = "c94c279d-25a6-4763-9509-64d165bea63e"
2728
TGGeometry = "d7e755d2-3c95-4bcf-9b3c-79ab1a78647b"
2829

2930
[extensions]
31+
GeometryOpsDataFramesExt = "DataFrames"
3032
GeometryOpsFlexiJoinsExt = "FlexiJoins"
3133
GeometryOpsLibGEOSExt = "LibGEOS"
3234
GeometryOpsProjExt = "Proj"
@@ -37,6 +39,7 @@ AbstractTrees = "0.4"
3739
AdaptivePredicates = "1.2"
3840
CoordinateTransformations = "0.5, 0.6"
3941
DataAPI = "1"
42+
DataFrames = "1"
4043
DelaunayTriangulation = "1.0.4"
4144
ExactPredicates = "2.2.8"
4245
Extents = "0.1.5"
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#=
2+
# DataFrames extension
3+
4+
This module simply extends Core's `reconstruct_table` method to:
5+
- work with DataFrames
6+
- not copy columns unless it is necessary to do so
7+
- allow passing through known kwargs to the constructor
8+
9+
10+
In the future, if we ever end up defining `ApplyToFeatures` on a table,
11+
then we will need to add some form of method for that...
12+
which will likely entail adding an extra positional argument to the
13+
reconstruct_table method, and checking whether `other_column_names`
14+
is equal to `setdiff(Tables.columnnames(input), geometry_column_names)`.
15+
16+
If it is not then we will have to reconstruct the whole DataFrame from the
17+
GI.Feature named-tuple row table representation.
18+
=#
module GeometryOpsDataFramesExt

import GeometryOpsCore
using DataFrames

# `copycols` is the one keyword argument that the DataFrame method of
# `reconstruct_table` consumes, so it is declared here.
function GeometryOpsCore.used_reconstruct_table_kwargs(::DataFrames.DataFrame)
    return (:copycols,)
end

# DataFrame-specific table reconstruction: build a new DataFrame from `input`
# (passing `copycols` on to the constructor), then put the new geometry
# columns in place under their column names.
function GeometryOpsCore.reconstruct_table(
        input::DataFrames.DataFrame, geometry_column_names, geometry_columns,
        other_column_names, args...;
        copycols = true, kwargs...
    )
    # Start from a DataFrame holding all of the input's columns.
    rebuilt = DataFrame(input; copycols = copycols)

    # Assign each new geometry column under its name.
    foreach(geometry_column_names, geometry_columns) do name, column
        rebuilt[!, name] = column
    end

    # The non-geometry columns already came along via the constructor, and
    # metadata is handled in the `_apply_table` method, so nothing is left to do.
    return rebuilt
end

end

test/extensions/dataframes.jl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
using Test
2+
using DataFrames
3+
import GeometryOps as GO, GeoInterface as GI
4+
5+
using NaturalEarth
6+
@testset "DataFrames extension: can we use copycols=false?" begin
    countries = DataFrame(naturalearth("admin_0_countries", 110); copycols = true)

    # Run the same identity transform once per `copycols` setting.
    eager = GO.transform(identity, countries; copycols = true)
    lazy = GO.transform(identity, countries; copycols = false)

    # Both settings must give the same attribute values ...
    @test eager.NAME == lazy.NAME
    # ... but with `copycols = true` the column is a different array from the input's,
    @test eager.NAME !== countries.NAME
    # while with `copycols = false` it is the very same array as the input's.
    @test lazy.NAME === countries.NAME
end

test/runtests.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,5 @@ end
4747
# Extensions
4848
@safetestset "FlexiJoins" begin include("extensions/flexijoins.jl") end
4949
@safetestset "LibGEOS" begin include("extensions/libgeos.jl") end
50-
@safetestset "TGGeometry" begin include("extensions/tggeometry.jl") end
50+
@safetestset "TGGeometry" begin include("extensions/tggeometry.jl") end
51+
@safetestset "DataFrames" begin include("extensions/dataframes.jl") end

0 commit comments

Comments
 (0)