diff --git a/DESCRIPTION b/DESCRIPTION index 2273f3f..7eb8b80 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,7 @@ Depends: R (>= 3.6.0) Suggests: arrow, + jsonlite, R6, sf, testthat (>= 3.0.0) diff --git a/NAMESPACE b/NAMESPACE index 3ff17b0..a6f934a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,6 +46,7 @@ export(geoarrow_handle) export(geoarrow_schema_parse) export(geoarrow_writer) export(infer_geoarrow_schema) +export(is_geoarrow_schema) export(na_extension_geoarrow) export(na_extension_large_wkb) export(na_extension_large_wkt) diff --git a/R/geoparquet.R b/R/geoparquet.R new file mode 100644 index 0000000..5749f68 --- /dev/null +++ b/R/geoparquet.R @@ -0,0 +1,252 @@ + + +write_geoparquet <- function(x, ..., + primary_geometry_column = NULL, + geometry_columns = NULL, + write_geometry_types = NULL, + write_bbox = NULL, + check_wkb = NULL, + schema = NULL) { + if (is.null(schema)) { + schema <- infer_nanoarrow_schema(x, geoarrow_lazy = TRUE) + } else { + schema <- as_nanoarrow_schema(schema) + } + + geo_meta <- geoparquet_metadata_from_schema( + schema, + primary_geometry_column = primary_geometry_column, + geometry_columns = geometry_columns, + add_geometry_types = write_geometry_types + ) + + # should work with data.frame and table input + geometry_indices <- match(names(geo_meta$columns), names(schema$children)) + table_not_geometry <- arrow::as_arrow_table( + x[-geometry_indices], + schema = arrow::as_schema( + nanoarrow::na_struct(schema$children[-geometry_indices]) + ) + ) + columns_not_geometry <- table_not_geometry$columns + names(columns_not_geometry) <- names(table_not_geometry) + + columns_geometry <- vector("list", length(geometry_indices)) + names(columns_geometry) <- names(geo_meta$columns) + for (geo_column_i in seq_along(geometry_indices)) { + x_i <- geometry_indices[[geo_column_i]] + updated_column_meta_and_chunked_array <- geoparquet_encode_chunked_array( + x[[x_i]], + geo_meta$columns[[geo_column_i]], + add_geometry_types = write_geometry_types, + add_bbox = write_bbox, + check_wkb = check_wkb + ) + + geo_meta$columns[[geo_column_i]] <- updated_column_meta_and_chunked_array[[1]] + columns_geometry[[geo_column_i]] <- updated_column_meta_and_chunked_array[[2]] + } + + columns_all <- c(columns_not_geometry, columns_geometry)[names(schema$children)] + table <- arrow::Table$create(!!!columns_all) + + # We have to use auto_unbox = TRUE for now because the CRS is + # an R representation of PROJJSON that does not roundtrip without + # auto_unbox = TRUE and there doesn't seem to be a way to mark literal + # json in jsonlite. + table$metadata$geo <- jsonlite::toJSON(geo_meta, auto_unbox = TRUE) + + # Write! + arrow::write_parquet(table, ...) +} + +geoparquet_metadata_from_schema <- function(schema, + primary_geometry_column, + geometry_columns, + add_geometry_types) { + primary_geometry_column <- geoparquet_guess_primary_geometry_column( + schema, + primary_geometry_column + ) + + columns <- geoparquet_columns_from_schema( + schema, + geometry_columns, + primary_geometry_column, + add_geometry_types + ) + + list( + version = "1.0.0", + primary_geometry_column = primary_geometry_column, + columns = columns + ) +} + +geoparquet_encode_chunked_array <- function(chunked_array_or_vctr, + spec, + add_geometry_types = NULL, + add_bbox = NULL, + check_wkb = NULL) { + # Only WKB is currently supported + if (spec$encoding == "WKB") { + item_out_vctr <- as_geoarrow_vctr(chunked_array_or_vctr, schema = na_extension_wkb()) + + # Less of a problem than in Python, because we are unlikely to start out with + # arrow-native EWKB binary which might be short-circuited into the output + if (isTRUE(check_wkb)) { + stop("check_wkb TRUE not implemented") + } + + } else { + stop(sprintf("Expected column encoding 'WKB' but got '%s'", spec$encoding)) + } + + # These can use the unique_geometry_types and box_agg kernels when implemented + if (isTRUE(add_geometry_types)) { + stop("add_geometry_types TRUE not supported") + } + + if (isTRUE(add_bbox)) { + stop("add_bbox TRUE not supported") + } + + # Convert to ChunkedArray, except without the extension information + item_out_chunked_array <- as_chunked_array.geoarrow_vctr( + item_out_vctr, + geoarrow_extension_type = FALSE + ) + + list(spec, item_out_chunked_array) +} + +geoparquet_guess_primary_geometry_column <- function(schema, primary_geometry_column = NULL) { + if (!is.null(primary_geometry_column)) { + return(primary_geometry_column) + } + + schema_children <- schema$children + if ("geometry" %in% names(schema_children)) { + return("geometry") + } + + if ("geography" %in% names(schema_children)) { + return("geography") + } + + is_geoarrow <- vapply(schema_children, is_geoarrow_schema, logical(1)) + if (any(is_geoarrow)) { + names(schema_children)[which(is_geoarrow)[1]] + } else { + stop("write_geoparquet() requires at least one geometry column") + } +} + +geoparquet_columns_from_schema <- function(schema, + geometry_columns = NULL, + primary_geometry_column = NULL, + add_geometry_types = NULL) { + schema_children <- schema$children + + if (is.null(geometry_columns)) { + geometry_columns <- character() + + is_geoarrow <- vapply(schema_children, is_geoarrow_schema, logical(1)) + geometry_columns <- union(geometry_columns, names(schema_children)[is_geoarrow]) + } + + geometry_columns <- union(geometry_columns, primary_geometry_column) + + missing_geometry_columns <- setdiff(geometry_columns, names(schema_children)) + if (length(missing_geometry_columns) > 0) { + stop( + sprintf( + "Specified geometry_columns %s not found in source", + paste0("'", missing_geometry_columns, "'", collapse = ",") + ) + ) + } + + lapply( + schema_children[geometry_columns], + geoparquet_column_spec_from_type, + add_geometry_types = add_geometry_types + ) +} + +geoparquet_column_spec_from_type <- function(schema, add_geometry_types = NULL) { + spec <- list( + encoding = "WKB", + geometry_types = list() + ) + + if (is_geoarrow_schema(schema)) { + parsed <- geoarrow_schema_parse(schema) + + spec$crs <- switch( + enum_label(parsed$crs_type, "CrsType"), + "NONE" = NULL, + "UNKNOWN" = , + "PROJJSON" = jsonlite::fromJSON(parsed$crs, simplifyVector = FALSE), + { + crs_info <- sanitize_crs(parsed$crs) + if (crs_info$crs_type == enum$CrsType$PROJJSON) { + jsonlite::fromJSON(parsed$crs, simplifyVector = FALSE) + } else { + stop("Can't convert CRS to PROJJSON") + } + } + ) + + if (parsed$edge_type == enum$EdgeType$SPHERICAL) { + spec$edges <- "spherical" + } + + maybe_known_geometry_type <- parsed$geometry_type + maybe_known_dimensions <- parsed$dimensions + + if (!identical(add_geometry_types, FALSE) && + maybe_known_geometry_type != "GEOMETRY" && + maybe_known_dimensions != "UNKNOWN") { + # Wrap in a list so that we can use auto_unbox = TRUE + spec$geometry_types <- as.list( + paste0( + geoparquet_geometry_type_label(maybe_known_geometry_type), + geoparquet_dimension_label(maybe_known_dimensions) + ) + ) + } + } + + spec +} + +geoparquet_geometry_type_label <- function(x) { + switch( + x, + POINT = "Point", + LINESTRING = "LineString", + POLYGON = "Polygon", + MULTIPOINT = "MultiPoint", + MULTILINESTRING = "MultiLineString", + MULTIPOLYGON = "MultiPolygon", + NULL + ) +} + +geoparquet_dimension_label <- function(x) { + switch( + x, + XY = "", + XYZ = " Z", + XYM = " M", + XYZM = " ZM", + NULL + ) +} + +has_geoparquet_dependencies <- function() { + requireNamespace("arrow", quietly = TRUE) && + requireNamespace("jsonlite", quietly = TRUE) +} + diff --git a/R/infer-default.R b/R/infer-default.R index 114f83d..262e346 100644 --- a/R/infer-default.R +++ b/R/infer-default.R @@ -5,6 +5,8 @@ #' @param promote_multi Use `TRUE` to return a MULTI type when both normal and #' MULTI elements are in the same array. #' @param coord_type Specify the coordinate type to use if returning +#' @param lazy Use TRUE to minimize the chance that inference will loop over +#' all features. #' @param ... Passed to S3 methods. #' #' @return A [nanoarrow_schema][as_nanoarrow_schema] @@ -14,13 +16,15 @@ #' infer_geoarrow_schema(wk::wkt("POINT (0 1)")) #' infer_geoarrow_schema <- function(x, ..., promote_multi = TRUE, - coord_type = NULL) { + coord_type = NULL, + lazy = FALSE) { UseMethod("infer_geoarrow_schema") } #' @export infer_geoarrow_schema.default <- function(x, ..., promote_multi = TRUE, - coord_type = NULL) { + coord_type = NULL, + lazy = FALSE) { if (is.null(coord_type)) { coord_type <- enum$CoordType$SEPARATE } @@ -29,12 +33,15 @@ infer_geoarrow_schema.default <- function(x, ..., promote_multi = TRUE, vector_meta <- wk::wk_vector_meta(x) all_types <- vector_meta$geometry_type - has_mising_info <- is.na(vector_meta$geometry_type) || + has_missing_info <- is.na(vector_meta$geometry_type) || (vector_meta$geometry_type == 0L) || is.na(vector_meta$has_z) || is.na(vector_meta$has_m) - if (has_mising_info) { + if (has_missing_info && lazy) { + # If we've been requested not to iterate on features, the best we can do is wkb + wk_geoarrow_schema(x, na_extension_wkb) + } else if (has_missing_info) { # Fall back on calculation from wk_meta(). This would be better with # the unique_geometry_types kernel (because it has the option to disregard # empties). diff --git a/R/pkg-arrow.R b/R/pkg-arrow.R index 9874e16..d3024cf 100644 --- a/R/pkg-arrow.R +++ b/R/pkg-arrow.R @@ -8,7 +8,8 @@ as_arrow_array.geoarrow_vctr <- function(x, ..., type = NULL) { } } -as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL) { +as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL, + geoarrow_extension_type = TRUE) { if (is.null(type)) { schema <- NULL type <- arrow::as_data_type(attr(x, "schema", exact = TRUE)) @@ -23,6 +24,10 @@ as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL) { type <- arrow::as_data_type(type) schema <- as_nanoarrow_schema(type) + if (!geoarrow_extension_type) { + schema <- force_schema_storage(schema) + } + arrays <- vector("list", length(chunks)) for (i in seq_along(arrays)) { tmp_schema <- nanoarrow::nanoarrow_allocate_schema() diff --git a/R/pkg-sf.R b/R/pkg-sf.R index f3282c6..fe4f358 100644 --- a/R/pkg-sf.R +++ b/R/pkg-sf.R @@ -67,8 +67,8 @@ convert_array.sfc <- function(array, to, ..., sfc_promote_multi = FALSE) { #' @importFrom nanoarrow infer_nanoarrow_schema #' @export -infer_nanoarrow_schema.sfc <- function(x, ...) { - infer_geoarrow_schema(x) +infer_nanoarrow_schema.sfc <- function(x, ..., geoarrow_lazy = FALSE) { + infer_geoarrow_schema(x, lazy = geoarrow_lazy) } #' @export diff --git a/R/type.R b/R/type.R index 8e3879e..a2ec505 100644 --- a/R/type.R +++ b/R/type.R @@ -87,6 +87,15 @@ geoarrow_schema_parse <- function(schema, extension_name = NULL) { .Call(geoarrow_c_schema_parse, schema, extension_name) } +#' @rdname geoarrow_schema_parse +#' @export +is_geoarrow_schema <- function(schema, extension_name = NULL) { + tryCatch({ + geoarrow_schema_parse(schema, extension_name) + TRUE + }, error = function(...) FALSE) +} + na_extension_geoarrow_internal <- function(type_id, crs, edges) { metadata <- na_extension_metadata_internal(crs, edges) schema <- nanoarrow::nanoarrow_allocate_schema() diff --git a/man/geoarrow_schema_parse.Rd b/man/geoarrow_schema_parse.Rd index 7e9d048..feb3030 100644 --- a/man/geoarrow_schema_parse.Rd +++ b/man/geoarrow_schema_parse.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/type.R \name{geoarrow_schema_parse} \alias{geoarrow_schema_parse} +\alias{is_geoarrow_schema} \title{Inspect a GeoArrow schema} \usage{ geoarrow_schema_parse(schema, extension_name = NULL) + +is_geoarrow_schema(schema, extension_name = NULL) } \arguments{ \item{schema}{A \link[nanoarrow:as_nanoarrow_schema]{nanoarrow_schema}} diff --git a/man/infer_geoarrow_schema.Rd b/man/infer_geoarrow_schema.Rd index 9936b48..746df96 100644 --- a/man/infer_geoarrow_schema.Rd +++ b/man/infer_geoarrow_schema.Rd @@ -4,7 +4,13 @@ \alias{infer_geoarrow_schema} \title{Infer a GeoArrow-native type from a vector} \usage{ -infer_geoarrow_schema(x, ..., promote_multi = TRUE, coord_type = NULL) +infer_geoarrow_schema( + x, + ..., + promote_multi = TRUE, + coord_type = NULL, + lazy = FALSE +) } \arguments{ \item{x}{An object from which to infer a schema.} @@ -15,6 +21,9 @@ infer_geoarrow_schema(x, ..., promote_multi = TRUE, coord_type = NULL) MULTI elements are in the same array.} \item{coord_type}{Specify the coordinate type to use if returning} + +\item{lazy}{Use TRUE to minimize the chance that inference will loop over +all features.} } \value{ A \link[=as_nanoarrow_schema]{nanoarrow_schema} diff --git a/tests/testthat/test-geoparquet.R b/tests/testthat/test-geoparquet.R new file mode 100644 index 0000000..e89288a --- /dev/null +++ b/tests/testthat/test-geoparquet.R @@ -0,0 +1,198 @@ + +test_that("write_geoparquet_table() can write a data.frame", { + skip_if_not(has_geoparquet_dependencies()) + + tmp <- tempfile() + on.exit(unlink(tmp)) + + df <- data.frame(not_geometry = 1L, geometry = wk::wkt("POINT (0 1)")) + write_geoparquet(df, tmp) + table <- arrow::read_parquet(tmp, as_data_frame = FALSE) + + expect_true("geo" %in% names(table$metadata)) + geo_meta <- jsonlite::fromJSON(table$metadata$geo, simplifyVector = FALSE) + expect_identical(geo_meta$version, "1.0.0") + expect_identical(geo_meta$primary_geometry_column, "geometry") + expect_identical( + geo_meta$columns$geometry, + list(encoding = "WKB", geometry_types = list()) + ) + + expect_true(table$schema$geometry$type$Equals(arrow::binary())) + wkb <- wk::wkb(as.vector(table$geometry)) + expect_identical(wk::as_wkt(wkb), wk::wkt("POINT (0 1)")) +}) + +test_that("geoparquet_guess_primary_geometry_column() works", { + schema <- nanoarrow::na_struct(list(geometry = nanoarrow::na_string())) + expect_identical( + geoparquet_guess_primary_geometry_column(schema), + "geometry" + ) + + expect_identical( + geoparquet_guess_primary_geometry_column(schema, "something_else"), + "something_else" + ) + + schema <- nanoarrow::na_struct(list(geography = nanoarrow::na_string())) + expect_identical( + geoparquet_guess_primary_geometry_column(schema), + "geography" + ) + + schema <- nanoarrow::na_struct(list(something_else = na_extension_wkt())) + expect_identical( + geoparquet_guess_primary_geometry_column(schema), + "something_else" + ) + + schema <- nanoarrow::na_struct(list()) + expect_error( + geoparquet_guess_primary_geometry_column(schema), + "requires at least one geometry column" + ) +}) + +test_that("geoparquet_columns_from_schema() always includes geometry_columns", { + schema <- nanoarrow::na_struct( + list( + geom = nanoarrow::na_string(), + geom2 = na_extension_wkt() + ) + ) + + expect_identical( + names(geoparquet_columns_from_schema(schema, geometry_columns = NULL)), + "geom2" + ) + + expect_identical( + names(geoparquet_columns_from_schema(schema, geometry_columns = "geom")), + "geom" + ) +}) + +test_that("geoparquet_columns_from_schema() always includes primary_geometry_column", { + schema <- nanoarrow::na_struct(list(geom = nanoarrow::na_string())) + + expect_identical( + names(geoparquet_columns_from_schema(schema, primary_geometry_column = NULL)), + character() + ) + + expect_identical( + names(geoparquet_columns_from_schema(schema, primary_geometry_column = "geom")), + "geom" + ) + + expect_error( + names(geoparquet_columns_from_schema(schema, primary_geometry_column = "not_a_col")), + "Specified geometry_columns" + ) +}) + +test_that("geoparquet_column_spec_from_type() works", { + # non-geoarrow type + expect_identical( + geoparquet_column_spec_from_type(nanoarrow::na_string()), + list(encoding = "WKB", geometry_types = list()) + ) + + # geoarrow type with crs + spec_crs <- geoparquet_column_spec_from_type(na_extension_wkb(crs = wk::wk_crs_longlat())) + expect_identical(spec_crs$encoding, "WKB") + expect_identical(spec_crs$crs$id, list(authority = "OGC", code = "CRS84")) + + # geoarrow type with add_geometry_types = FALSE + expect_identical( + geoparquet_column_spec_from_type( + na_extension_geoarrow("POINT"), + add_geometry_types = FALSE + ), + list(encoding = "WKB", geometry_types = list()) + ) + + # geoarrow types with varying dimensions + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT")), + list(encoding = "WKB", geometry_types = list("Point")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT", "XYZ")), + list(encoding = "WKB", geometry_types = list("Point Z")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT", "XYM")), + list(encoding = "WKB", geometry_types = list("Point M")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT", "XYZM")), + list(encoding = "WKB", geometry_types = list("Point ZM")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT", "XYZ")), + list(encoding = "WKB", geometry_types = list("Point Z")) + ) + + # Also check geometry types + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("LINESTRING")), + list(encoding = "WKB", geometry_types = list("LineString")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POLYGON")), + list(encoding = "WKB", geometry_types = list("Polygon")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("MULTIPOINT")), + list(encoding = "WKB", geometry_types = list("MultiPoint")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("MULTILINESTRING")), + list(encoding = "WKB", geometry_types = list("MultiLineString")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("MULTIPOLYGON")), + list(encoding = "WKB", geometry_types = list("MultiPolygon")) + ) + + # Check unknown geometry type + expect_identical( + geoparquet_column_spec_from_type(na_extension_wkb()), + list(encoding = "WKB", geometry_types = list()) + ) + + # Check edge types + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("LINESTRING", edges = "SPHERICAL")), + list(encoding = "WKB", geometry_types = list("LineString"), edges = "spherical") + ) +}) + +test_that("geoparquet_encode_chunked_array() works", { + skip_if_not(has_geoparquet_dependencies()) + + chunked <- geoparquet_encode_chunked_array("POINT (0 1)", list(encoding = "WKB")) + expect_identical( + chunked[[1]], + list(encoding = "WKB") + ) + expect_true(chunked[[2]]$type$Equals(arrow::binary())) + expect_equal(chunked[[2]]$length(), 1L) + + expect_error( + geoparquet_encode_chunked_array("POINT (0 1)", list(encoding = "Not valid")), + "Expected column encoding 'WKB'" + ) +}) + + diff --git a/tests/testthat/test-type.R b/tests/testthat/test-type.R index 1e92c3c..40d42ff 100644 --- a/tests/testthat/test-type.R +++ b/tests/testthat/test-type.R @@ -110,6 +110,11 @@ test_that("geoarrow_schema_parse() can parse a storage schema", { ) }) +test_that("schema checker works", { + expect_true(is_geoarrow_schema(na_extension_wkt())) + expect_false(is_geoarrow_schema(nanoarrow::na_decimal128())) +}) + test_that("enum matcher works", { expect_identical( enum_value(c("GEOMETRY", "MULTIPOINT", "NOT VALID"), "GeometryType"),