Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Depends:
R (>= 3.6.0)
Suggests:
arrow,
jsonlite,
R6,
sf,
testthat (>= 3.0.0)
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ export(geoarrow_handle)
export(geoarrow_schema_parse)
export(geoarrow_writer)
export(infer_geoarrow_schema)
export(is_geoarrow_schema)
export(na_extension_geoarrow)
export(na_extension_large_wkb)
export(na_extension_large_wkt)
Expand Down
252 changes: 252 additions & 0 deletions R/geoparquet.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@


write_geoparquet <- function(x, ...,
primary_geometry_column = NULL,
geometry_columns = NULL,
write_geometry_types = NULL,
write_bbox = NULL,
check_wkb = NULL,
schema = NULL) {
if (is.null(schema)) {
schema <- infer_nanoarrow_schema(x, geoarrow_lazy = TRUE)
} else {
schema <- as_nanoarrow_schema(schema)
}

geo_meta <- geoparquet_metadata_from_schema(
schema,
primary_geometry_column = primary_geometry_column,
geometry_columns = geometry_columns,
add_geometry_types = write_geometry_types
)

# should work with data.frame and table input
geometry_indices <- match(names(geo_meta$columns), names(schema$children))
table_not_geometry <- arrow::as_arrow_table(
x[-geometry_indices],
schema = arrow::as_schema(
nanoarrow::na_struct(schema$children[-geometry_indices])
)
)
columns_not_geometry <- table_not_geometry$columns
names(columns_not_geometry) <- names(table_not_geometry)

columns_geometry <- vector("list", length(geometry_indices))
names(columns_geometry) <- names(geo_meta$columns)
for (geo_column_i in seq_along(geometry_indices)) {
x_i <- geometry_indices[[geo_column_i]]
updated_column_meta_and_chunked_array <- geoparquet_encode_chunked_array(
x[[x_i]],
geo_meta$columns[[geo_column_i]],
add_geometry_types = write_geometry_types,
add_bbox = write_bbox,
check_wkb = check_wkb
)

geo_meta$columns[[geo_column_i]] <- updated_column_meta_and_chunked_array[[1]]
columns_geometry[[geo_column_i]] <- updated_column_meta_and_chunked_array[[2]]
}

columns_all <- c(columns_not_geometry, columns_geometry)[names(schema$children)]
table <- arrow::Table$create(!!!columns_all)

# We have to use auto_unbox = TRUE for now because the CRS is
# an R representation of PROJJSON that does not roundtrip without
# auto_unbox = TRUE and there doesn't seem to be a way to mark literal
# json in jsonlite.
table$metadata$geo <- jsonlite::toJSON(geo_meta, auto_unbox = TRUE)

# Write!
arrow::write_parquet(table, ...)
}

geoparquet_metadata_from_schema <- function(schema,
primary_geometry_column,
geometry_columns,
add_geometry_types) {
primary_geometry_column <- geoparquet_guess_primary_geometry_column(
schema,
primary_geometry_column
)

columns <- geoparquet_columns_from_schema(
schema,
geometry_columns,
primary_geometry_column,
add_geometry_types
)

list(
version = "1.0.0",
primary_geometry_column = primary_geometry_column,
columns = columns
)
}

geoparquet_encode_chunked_array <- function(chunked_array_or_vctr,
spec,
add_geometry_types = NULL,
add_bbox = NULL,
check_wkb = NULL) {
# Only WKB is currently supported
if (spec$encoding == "WKB") {
item_out_vctr <- as_geoarrow_vctr(chunked_array_or_vctr, schema = na_extension_wkb())

# Less of a problem than in Python, because we are unlikely to start out with
# arrow-native EWKB binary which might be short-circuited into the output
if (isTRUE(check_wkb)) {
stop("check_wkb TRUE not implemented")
}

} else {
stop(sprintf("Expected column encoding 'WKB' but got '%s'", spec$encoding))
}

# These can use the unique_geometry_types and box_agg kernels when implemented
if (isTRUE(add_geometry_types)) {
stop("add_geometry_types TRUE not supported")
}

if (isTRUE(add_bbox)) {
stop("add_bbox TRUE not supported")
}

# Convert to ChunkedArray, except without the extension information
item_out_chunked_array <- as_chunked_array.geoarrow_vctr(
item_out_vctr,
geoarrow_extension_type = FALSE
)

list(spec, item_out_chunked_array)
}

geoparquet_guess_primary_geometry_column <- function(schema, primary_geometry_column = NULL) {
if (!is.null(primary_geometry_column)) {
return(primary_geometry_column)
}

schema_children <- schema$children
if ("geometry" %in% names(schema_children)) {
return("geometry")
}

if ("geography" %in% names(schema_children)) {
return("geography")
}

is_geoarrow <- vapply(schema_children, is_geoarrow_schema, logical(1))
if (any(is_geoarrow)) {
names(schema_children)[which(is_geoarrow)[1]]
} else {
stop("write_geoparquet() requires at least one geometry column")
}
}

geoparquet_columns_from_schema <- function(schema,
geometry_columns = NULL,
primary_geometry_column = NULL,
add_geometry_types = NULL) {
schema_children <- schema$children

if (is.null(geometry_columns)) {
geometry_columns <- character()

is_geoarrow <- vapply(schema_children, is_geoarrow_schema, logical(1))
geometry_columns <- union(geometry_columns, names(schema_children)[is_geoarrow])
}

geometry_columns <- union(geometry_columns, primary_geometry_column)

missing_geometry_columns <- setdiff(geometry_columns, names(schema_children))
if (length(missing_geometry_columns) > 0) {
stop(
sprintf(
"Specified geometry_columns %s not found in source",
paste0("'", missing_geometry_columns, "'", collapse = ",")
)
)
}

lapply(
schema_children[geometry_columns],
geoparquet_column_spec_from_type,
add_geometry_types = add_geometry_types
)
}

geoparquet_column_spec_from_type <- function(schema, add_geometry_types = NULL) {
spec <- list(
encoding = "WKB",
geometry_types = list()
)

if (is_geoarrow_schema(schema)) {
parsed <- geoarrow_schema_parse(schema)

spec$crs <- switch(
enum_label(parsed$crs_type, "CrsType"),
"NONE" = NULL,
"UNKNOWN" = ,
"PROJJSON" = jsonlite::fromJSON(parsed$crs, simplifyVector = FALSE),
{
crs_info <- sanitize_crs(parsed$crs)
if (crs_info$crs_type == enum$CrsType$PROJJSON) {
jsonlite::fromJSON(parsed$crs, simplifyVector = FALSE)
} else {
stop("Can't convert CRS to PROJJSON")
}
}
)

if (parsed$edge_type == enum$EdgeType$SPHERICAL) {
spec$edges <- "spherical"
}

maybe_known_geometry_type <- parsed$geometry_type
maybe_known_dimensions <- parsed$dimensions

if (!identical(add_geometry_types, FALSE) &&
maybe_known_geometry_type != "GEOMETRY" &&
maybe_known_dimensions != "UNKNOWN") {
# Wrap in a list so that we can use auto_unbox = TRUE
spec$geometry_types <- as.list(
paste0(
geoparquet_geometry_type_label(maybe_known_geometry_type),
geoparquet_dimension_label(maybe_known_dimensions)
)
)
}
}

spec
}

geoparquet_geometry_type_label <- function(x) {
switch(
x,
POINT = "Point",
LINESTRING = "LineString",
POLYGON = "Polygon",
MULTIPOINT = "MultiPoint",
MULTILINESTRING = "MultiLineString",
MULTIPOLYGON = "MultiPolygon",
NULL
)
}

geoparquet_dimension_label <- function(x) {
switch(
x,
XY = "",
XYZ = " Z",
XYM = " M",
XYZM = " ZM",
NULL
)
}

has_geoparquet_dependencies <- function() {
requireNamespace("arrow", quietly = TRUE) &&
requireNamespace("jsonlite", quietly = TRUE)
}

15 changes: 11 additions & 4 deletions R/infer-default.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#' @param promote_multi Use `TRUE` to return a MULTI type when both normal and
#' MULTI elements are in the same array.
#' @param coord_type Specify the coordinate type to use if returning
#' @param lazy Use TRUE to minimize the chance that inference will loop over
#' all features.
#' @param ... Passed to S3 methods.
#'
#' @return A [nanoarrow_schema][as_nanoarrow_schema]
Expand All @@ -14,13 +16,15 @@
#' infer_geoarrow_schema(wk::wkt("POINT (0 1)"))
#'
infer_geoarrow_schema <- function(x, ..., promote_multi = TRUE,
coord_type = NULL) {
coord_type = NULL,
lazy = FALSE) {
UseMethod("infer_geoarrow_schema")
}

#' @export
infer_geoarrow_schema.default <- function(x, ..., promote_multi = TRUE,
coord_type = NULL) {
coord_type = NULL,
lazy = FALSE) {
if (is.null(coord_type)) {
coord_type <- enum$CoordType$SEPARATE
}
Expand All @@ -29,12 +33,15 @@ infer_geoarrow_schema.default <- function(x, ..., promote_multi = TRUE,
vector_meta <- wk::wk_vector_meta(x)
all_types <- vector_meta$geometry_type

has_mising_info <- is.na(vector_meta$geometry_type) ||
has_missing_info <- is.na(vector_meta$geometry_type) ||
(vector_meta$geometry_type == 0L) ||
is.na(vector_meta$has_z) ||
is.na(vector_meta$has_m)

if (has_mising_info) {
if (has_missing_info && lazy) {
# If we've been requested not to iterate on features, the best we can do is wkb
wk_geoarrow_schema(x, na_extension_wkb)
} else if (has_missing_info) {
# Fall back on calculation from wk_meta(). This would be better with
# the unique_geometry_types kernel (because it has the option to disregard
# empties).
Expand Down
7 changes: 6 additions & 1 deletion R/pkg-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ as_arrow_array.geoarrow_vctr <- function(x, ..., type = NULL) {
}
}

as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL) {
as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL,
geoarrow_extension_type = TRUE) {
if (is.null(type)) {
schema <- NULL
type <- arrow::as_data_type(attr(x, "schema", exact = TRUE))
Expand All @@ -23,6 +24,10 @@ as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL) {
type <- arrow::as_data_type(type)

schema <- as_nanoarrow_schema(type)
if (!geoarrow_extension_type) {
schema <- force_schema_storage(schema)
}

arrays <- vector("list", length(chunks))
for (i in seq_along(arrays)) {
tmp_schema <- nanoarrow::nanoarrow_allocate_schema()
Expand Down
4 changes: 2 additions & 2 deletions R/pkg-sf.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ convert_array.sfc <- function(array, to, ..., sfc_promote_multi = FALSE) {

#' @importFrom nanoarrow infer_nanoarrow_schema
#' @export
infer_nanoarrow_schema.sfc <- function(x, ...) {
infer_geoarrow_schema(x)
infer_nanoarrow_schema.sfc <- function(x, ..., geoarrow_lazy = FALSE) {
infer_geoarrow_schema(x, lazy = geoarrow_lazy)
}

#' @export
Expand Down
9 changes: 9 additions & 0 deletions R/type.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,15 @@ geoarrow_schema_parse <- function(schema, extension_name = NULL) {
.Call(geoarrow_c_schema_parse, schema, extension_name)
}

#' @rdname geoarrow_schema_parse
#' @export
is_geoarrow_schema <- function(schema, extension_name = NULL) {
tryCatch({
geoarrow_schema_parse(schema, extension_name)
TRUE
}, error = function(...) FALSE)
}

na_extension_geoarrow_internal <- function(type_id, crs, edges) {
metadata <- na_extension_metadata_internal(crs, edges)
schema <- nanoarrow::nanoarrow_allocate_schema()
Expand Down
3 changes: 3 additions & 0 deletions man/geoarrow_schema_parse.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 10 additions & 1 deletion man/infer_geoarrow_schema.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading