From 2c26c3c113efaeab968e83071a356b54195b1f34 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 1 Jan 2024 23:10:39 -0400 Subject: [PATCH 1/6] add basic writer --- DESCRIPTION | 1 + NAMESPACE | 1 + R/geoparquet.R | 242 +++++++++++++++++++++++++++++++ R/infer-default.R | 15 +- R/pkg-arrow.R | 7 +- R/pkg-sf.R | 4 +- R/type.R | 9 ++ man/geoarrow_schema_parse.Rd | 3 + man/infer_geoarrow_schema.Rd | 11 +- tests/testthat/test-geoparquet.R | 24 +++ tests/testthat/test-type.R | 5 + 11 files changed, 314 insertions(+), 8 deletions(-) create mode 100644 R/geoparquet.R create mode 100644 tests/testthat/test-geoparquet.R diff --git a/DESCRIPTION b/DESCRIPTION index 2273f3f..7eb8b80 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,7 @@ Depends: R (>= 3.6.0) Suggests: arrow, + jsonlite, R6, sf, testthat (>= 3.0.0) diff --git a/NAMESPACE b/NAMESPACE index 3ff17b0..a6f934a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,6 +46,7 @@ export(geoarrow_handle) export(geoarrow_schema_parse) export(geoarrow_writer) export(infer_geoarrow_schema) +export(is_geoarrow_schema) export(na_extension_geoarrow) export(na_extension_large_wkb) export(na_extension_large_wkt) diff --git a/R/geoparquet.R b/R/geoparquet.R new file mode 100644 index 0000000..69febc1 --- /dev/null +++ b/R/geoparquet.R @@ -0,0 +1,242 @@ + + +write_geoparquet <- function(x, ..., + primary_geometry_column = NULL, + geometry_columns = NULL, + write_geometry_types = NULL, + write_bbox = NULL, + check_wkb = NULL, + schema = NULL) { + if (is.null(schema)) { + schema <- infer_nanoarrow_schema(x, geoarrow_lazy = TRUE) + } else { + schema <- as_nanoarrow_schema(schema) + } + + geo_meta <- geoparquet_metadata_from_schema( + schema, + primary_geometry_column = primary_geometry_column, + geometry_columns = geometry_columns, + add_geometry_types = write_geometry_types + ) + + # should work with data.frame and table input + geometry_indices <- match(names(geo_meta$columns), names(schema$children)) + table_not_geometry <- arrow::as_arrow_table( + x[-geometry_indices], + schema = arrow::as_schema( + nanoarrow::na_struct(schema$children[-geometry_indices]) + ) + ) + columns_not_geometry <- table_not_geometry$columns + names(columns_not_geometry) <- names(table_not_geometry) + + columns_geometry <- vector("list", length(geometry_indices)) + names(columns_geometry) <- names(geo_meta$columns) + for (geo_column_i in seq_along(geometry_indices)) { + x_i <- geometry_indices[[geo_column_i]] + updated_column_meta_and_chunked_array <- geoparquet_encode_chunked_array( + x[[x_i]], + geo_meta$columns[[geo_column_i]], + add_geometry_types = write_geometry_types, + add_bbox = write_bbox, + check_wkb = check_wkb + ) + + geo_meta$columns[[geo_column_i]] <- updated_column_meta_and_chunked_array[[1]] + columns_geometry[[geo_column_i]] <- updated_column_meta_and_chunked_array[[2]] + } + + columns_all <- c(columns_not_geometry, columns_geometry)[names(schema$children)] + table <- arrow::Table$create(!!!columns_all) + + # We have to use auto_unbox = TRUE for now because the CRS is + # an R representation of PROJJSON that does not roundtrip without + # auto_unbox = TRUE and there doesn't seem to be a way to mark literal + # json in jsonlite. + table$metadata$geo <- jsonlite::toJSON(geo_meta, auto_unbox = TRUE) + + # Write! + arrow::write_parquet(table, ...) +} + +geoparquet_metadata_from_schema <- function(schema, + primary_geometry_column, + geometry_columns, + add_geometry_types) { + primary_geometry_column <- geoparquet_guess_primary_geometry_column( + schema, + primary_geometry_column + ) + + columns <- geoparquet_columns_from_schema( + schema, + geometry_columns, + primary_geometry_column, + add_geometry_types + ) + + list( + version = "1.0.0", + primary_geometry_column = primary_geometry_column, + columns = columns + ) +} + +geoparquet_encode_chunked_array <- function(chunked_array_or_vctr, + spec, + add_geometry_types, + add_bbox, + check_wkb) { + # Only WKB is currently supported + if (spec$encoding == "WKB") { + item_out_vctr <- as_geoarrow_vctr(chunked_array_or_vctr, schema = na_extension_wkb()) + + # Less of a problem than in Python, because we are unlikely to start out with + # arrow-native EWKB binary which might be short-circuited into the output + if (isTRUE(check_wkb)) { + stop("check_wkb TRUE not implemented") + } + + } else { + stop(sprintf("Expected column encoding 'WKB' but got '%s'", spec$encoding)) + } + + # These can use the unique_geometry_types and box_agg kernels when implemented + if (isTRUE(add_geometry_types)) { + stop("add_geometry_types TRUE not supported") + } + + if (isTRUE(add_bbox)) { + stop("add_bbox TRUE not supported") + } + + # Convert to ChunkedArray, except without the extension information + item_out_chunked_array <- as_chunked_array.geoarrow_vctr( + item_out_vctr, + geoarrow_extension_type = FALSE + ) + + list(spec, item_out_chunked_array) +} + +geoparquet_guess_primary_geometry_column <- function(schema, primary_geometry_column) { + if (!is.null(primary_geometry_column)) { + return(primary_geometry_column) + } + + schema_children <- schema$children + if ("geometry" %in% names(schema_children)) { + return("geometry") + } + + if ("geography" %in% names(schema_children)) { + return("geography") + } + + is_geoarrow <- vapply(schema_children, is_geoarrow_schema, logical(1)) + if (any(is_geoarrow)) { + names(schema_children)[which(is_geoarrow)[1]] + } else { + stop("write_geoparquet() requires at least one geometry column") + } +} + +geoparquet_columns_from_schema <- function(schema, + geometry_columns, + primary_geometry_column, + add_geometry_types) { + schema_children <- schema$children + + if (is.null(geometry_columns)) { + geometry_columns <- character() + if (!is.null(primary_geometry_column)) { + geometry_columns <- union(geometry_columns, primary_geometry_column) + } + + is_geoarrow <- vapply(schema_children, is_geoarrow_schema, logical(1)) + geometry_columns <- union(geometry_columns, names(schema_children)[is_geoarrow]) + } + + lapply( + schema_children[geometry_columns], + geoparquet_column_spec_from_type, + add_geometry_types = add_geometry_types + ) +} + +geoparquet_column_spec_from_type <- function(schema, add_geometry_types) { + spec <- list( + encoding = "WKB", + geometry_types = list() + ) + + if (is_geoarrow_schema(schema)) { + parsed <- geoarrow_schema_parse(schema) + + spec$crs <- switch( + parsed$crs_type, + "NONE" = NULL, + "PROJJSON" = jsonlite::fromJSON(parsed$crs, simplifyVector = FALSE), + { + crs_info <- sanitize_crs(parsed$crs) + if (crs_info$crs_type == enum$CrsType$PROJJSON) { + jsonlite::fromJSON(parsed$crs, simplifyVector = FALSE) + } else { + stop("Can't convert CRS to PROJJSON") + } + } + ) + + if (parsed$edge_type == "SPHERICAL") { + spec$edges <- "spherical" + } + + maybe_known_geometry_type <- parsed$geometry_type + maybe_known_dimensions <- parsed$dimensions + + if (!identical(add_geometry_types, FALSE) && + maybe_known_geometry_type != "GEOMETRY" && + maybe_known_dimensions != "UNKNOWN") { + # Wrap in a list so that we can use auto_unbox = TRUE + spec$geometry_types <- as.list( + paste0( + geoparquet_geometry_type_label(maybe_known_geometry_type), + geoparquet_dimension_label(maybe_known_dimensions) + ) + ) + } + } + + spec +} + +geoparquet_geometry_type_label <- function(x) { + switch( + x, + POINT = "Point", + LINESTRING = "LineString", + POLYGON = "Polygon", + MULTIPOINT = "MultiPoint", + MULTILINESTRING = "MultiLineString", + MULTIPOLYGON = "MultiPolygon", + NULL + ) +} + +geoparquet_dimension_label <- function(x) { + switch( + x, + XY = "", + XYZ = " Z", + XYM = " M", + XYZM = " ZM", + NULL + ) +} + +has_geoparquet_dependencies <- function() { + requireNamespace("arrow", quietly = TRUE) && + requireNamespace("jsonlite", quietly = TRUE) +} + diff --git a/R/infer-default.R b/R/infer-default.R index 114f83d..262e346 100644 --- a/R/infer-default.R +++ b/R/infer-default.R @@ -5,6 +5,8 @@ #' @param promote_multi Use `TRUE` to return a MULTI type when both normal and #' MULTI elements are in the same array. #' @param coord_type Specify the coordinate type to use if returning +#' @param lazy Use TRUE to minimize the chance that inference will loop over +#' all features. #' @param ... Passed to S3 methods. #' #' @return A [nanoarrow_schema][as_nanoarrow_schema] @@ -14,13 +16,15 @@ #' infer_geoarrow_schema(wk::wkt("POINT (0 1)")) #' infer_geoarrow_schema <- function(x, ..., promote_multi = TRUE, - coord_type = NULL) { + coord_type = NULL, + lazy = FALSE) { UseMethod("infer_geoarrow_schema") } #' @export infer_geoarrow_schema.default <- function(x, ..., promote_multi = TRUE, - coord_type = NULL) { + coord_type = NULL, + lazy = FALSE) { if (is.null(coord_type)) { coord_type <- enum$CoordType$SEPARATE } @@ -29,12 +33,15 @@ infer_geoarrow_schema.default <- function(x, ..., promote_multi = TRUE, vector_meta <- wk::wk_vector_meta(x) all_types <- vector_meta$geometry_type - has_mising_info <- is.na(vector_meta$geometry_type) || + has_missing_info <- is.na(vector_meta$geometry_type) || (vector_meta$geometry_type == 0L) || is.na(vector_meta$has_z) || is.na(vector_meta$has_m) - if (has_mising_info) { + if (has_missing_info && lazy) { + # If we've been requested not to iterate on features, the best we can do is wkb + wk_geoarrow_schema(x, na_extension_wkb) + } else if (has_missing_info) { # Fall back on calculation from wk_meta(). This would be better with # the unique_geometry_types kernel (because it has the option to disregard # empties). diff --git a/R/pkg-arrow.R b/R/pkg-arrow.R index 9874e16..d3024cf 100644 --- a/R/pkg-arrow.R +++ b/R/pkg-arrow.R @@ -8,7 +8,8 @@ as_arrow_array.geoarrow_vctr <- function(x, ..., type = NULL) { } } -as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL) { +as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL, + geoarrow_extension_type = TRUE) { if (is.null(type)) { schema <- NULL type <- arrow::as_data_type(attr(x, "schema", exact = TRUE)) @@ -23,6 +24,10 @@ as_chunked_array.geoarrow_vctr <- function(x, ..., type = NULL) { type <- arrow::as_data_type(type) schema <- as_nanoarrow_schema(type) + if (!geoarrow_extension_type) { + schema <- force_schema_storage(schema) + } + arrays <- vector("list", length(chunks)) for (i in seq_along(arrays)) { tmp_schema <- nanoarrow::nanoarrow_allocate_schema() diff --git a/R/pkg-sf.R b/R/pkg-sf.R index f3282c6..fe4f358 100644 --- a/R/pkg-sf.R +++ b/R/pkg-sf.R @@ -67,8 +67,8 @@ convert_array.sfc <- function(array, to, ..., sfc_promote_multi = FALSE) { #' @importFrom nanoarrow infer_nanoarrow_schema #' @export -infer_nanoarrow_schema.sfc <- function(x, ...) { - infer_geoarrow_schema(x) +infer_nanoarrow_schema.sfc <- function(x, ..., geoarrow_lazy = FALSE) { + infer_geoarrow_schema(x, lazy = geoarrow_lazy) } #' @export diff --git a/R/type.R b/R/type.R index 8e3879e..a2ec505 100644 --- a/R/type.R +++ b/R/type.R @@ -87,6 +87,15 @@ geoarrow_schema_parse <- function(schema, extension_name = NULL) { .Call(geoarrow_c_schema_parse, schema, extension_name) } +#' @rdname geoarrow_schema_parse +#' @export +is_geoarrow_schema <- function(schema, extension_name = NULL) { + tryCatch({ + geoarrow_schema_parse(schema, extension_name) + TRUE + }, error = function(...) FALSE) +} + na_extension_geoarrow_internal <- function(type_id, crs, edges) { metadata <- na_extension_metadata_internal(crs, edges) schema <- nanoarrow::nanoarrow_allocate_schema() diff --git a/man/geoarrow_schema_parse.Rd b/man/geoarrow_schema_parse.Rd index 7e9d048..feb3030 100644 --- a/man/geoarrow_schema_parse.Rd +++ b/man/geoarrow_schema_parse.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/type.R \name{geoarrow_schema_parse} \alias{geoarrow_schema_parse} +\alias{is_geoarrow_schema} \title{Inspect a GeoArrow schema} \usage{ geoarrow_schema_parse(schema, extension_name = NULL) + +is_geoarrow_schema(schema, extension_name = NULL) } \arguments{ \item{schema}{A \link[nanoarrow:as_nanoarrow_schema]{nanoarrow_schema}} diff --git a/man/infer_geoarrow_schema.Rd b/man/infer_geoarrow_schema.Rd index 9936b48..746df96 100644 --- a/man/infer_geoarrow_schema.Rd +++ b/man/infer_geoarrow_schema.Rd @@ -4,7 +4,13 @@ \alias{infer_geoarrow_schema} \title{Infer a GeoArrow-native type from a vector} \usage{ -infer_geoarrow_schema(x, ..., promote_multi = TRUE, coord_type = NULL) +infer_geoarrow_schema( + x, + ..., + promote_multi = TRUE, + coord_type = NULL, + lazy = FALSE +) } \arguments{ \item{x}{An object from which to infer a schema.} @@ -15,6 +21,9 @@ infer_geoarrow_schema(x, ..., promote_multi = TRUE, coord_type = NULL) MULTI elements are in the same array.} \item{coord_type}{Specify the coordinate type to use if returning} + +\item{lazy}{Use TRUE to minimize the chance that inference will loop over +all features.} } \value{ A \link[=as_nanoarrow_schema]{nanoarrow_schema} diff --git a/tests/testthat/test-geoparquet.R b/tests/testthat/test-geoparquet.R new file mode 100644 index 0000000..8a51d2a --- /dev/null +++ b/tests/testthat/test-geoparquet.R @@ -0,0 +1,24 @@ + +test_that("write_geoparquet_table() can write a data.frame", { + skip_if_not(has_geoparquet_dependencies()) + + tmp <- tempfile() + on.exit(unlink(tmp)) + + df <- data.frame(not_geometry = 1L, geometry = wk::wkt("POINT (0 1)")) + write_geoparquet(df, tmp) + table <- arrow::read_parquet(tmp, as_data_frame = FALSE) + + expect_true("geo" %in% names(table$metadata)) + geo_meta <- jsonlite::fromJSON(table$metadata$geo, simplifyVector = FALSE) + expect_identical(geo_meta$version, "1.0.0") + expect_identical(geo_meta$primary_geometry_column, "geometry") + expect_identical( + geo_meta$columns$geometry, + list(encoding = "WKB", geometry_types = list()) + ) + + expect_true(table$schema$geometry$type$Equals(arrow::binary())) + wkb <- wk::wkb(as.vector(table$geometry)) + expect_identical(wk::as_wkt(wkb), wk::wkt("POINT (0 1)")) +}) diff --git a/tests/testthat/test-type.R b/tests/testthat/test-type.R index 1e92c3c..40d42ff 100644 --- a/tests/testthat/test-type.R +++ b/tests/testthat/test-type.R @@ -110,6 +110,11 @@ test_that("geoarrow_schema_parse() can parse a storage schema", { ) }) +test_that("schema checker works", { + expect_true(is_geoarrow_schema(na_extension_wkt())) + expect_false(is_geoarrow_schema(nanoarrow::na_decimal128())) +}) + test_that("enum matcher works", { expect_identical( enum_value(c("GEOMETRY", "MULTIPOINT", "NOT VALID"), "GeometryType"), From ecc5859d3488b886ec0aad4b44820afac85bfdb3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 20 Jan 2024 14:59:45 -0400 Subject: [PATCH 2/6] check colum spec generator --- R/geoparquet.R | 4 +- tests/testthat/test-geoparquet.R | 75 ++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/R/geoparquet.R b/R/geoparquet.R index 69febc1..8b54564 100644 --- a/R/geoparquet.R +++ b/R/geoparquet.R @@ -165,7 +165,7 @@ geoparquet_columns_from_schema <- function(schema, ) } -geoparquet_column_spec_from_type <- function(schema, add_geometry_types) { +geoparquet_column_spec_from_type <- function(schema, add_geometry_types = NULL) { spec <- list( encoding = "WKB", geometry_types = list() @@ -188,7 +188,7 @@ geoparquet_column_spec_from_type <- function(schema, add_geometry_types) { } ) - if (parsed$edge_type == "SPHERICAL") { + if (parsed$edge_type == enum$EdgeType$SPHERICAL) { spec$edges <- "spherical" } diff --git a/tests/testthat/test-geoparquet.R b/tests/testthat/test-geoparquet.R index 8a51d2a..1a5c807 100644 --- a/tests/testthat/test-geoparquet.R +++ b/tests/testthat/test-geoparquet.R @@ -22,3 +22,78 @@ test_that("write_geoparquet_table() can write a data.frame", { wkb <- wk::wkb(as.vector(table$geometry)) expect_identical(wk::as_wkt(wkb), wk::wkt("POINT (0 1)")) }) + +test_that("geoparquet_column_spec_from_type() works", { + # non-geoarrow type + expect_identical( + geoparquet_column_spec_from_type(nanoarrow::na_string()), + list(encoding = "WKB", geometry_types = list()) + ) + + # geoarrow type with add_geometry_types = FALSE + expect_identical( + geoparquet_column_spec_from_type( + na_extension_geoarrow("POINT"), + add_geometry_types = FALSE + ), + list(encoding = "WKB", geometry_types = list()) + ) + + # geoarrow types with varying dimensions + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT")), + list(encoding = "WKB", geometry_types = list("Point")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT", "XYZ")), + list(encoding = "WKB", geometry_types = list("Point Z")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT", "XYM")), + list(encoding = "WKB", geometry_types = list("Point M")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT", "XYZM")), + list(encoding = "WKB", geometry_types = list("Point ZM")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POINT", "XYZ")), + list(encoding = "WKB", geometry_types = list("Point Z")) + ) + + # Also check geometry t ypes + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("LINESTRING")), + list(encoding = "WKB", geometry_types = list("LineString")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("POLYGON")), + list(encoding = "WKB", geometry_types = list("Polygon")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("MULTIPOINT")), + list(encoding = "WKB", geometry_types = list("MultiPoint")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("MULTILINESTRING")), + list(encoding = "WKB", geometry_types = list("MultiLineString")) + ) + + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("MULTIPOLYGON")), + list(encoding = "WKB", geometry_types = list("MultiPolygon")) + ) + + # Check edge types + expect_identical( + geoparquet_column_spec_from_type(na_extension_geoarrow("LINESTRING", edges = "SPHERICAL")), + list(encoding = "WKB", geometry_types = list("LineString"), edges = "spherical") + ) +}) From 59acfd628b9416c91525d849464ed8fe7d81b2a2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 20 Jan 2024 20:01:16 -0400 Subject: [PATCH 3/6] test CRS --- R/geoparquet.R | 3 ++- tests/testthat/test-geoparquet.R | 13 ++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/R/geoparquet.R b/R/geoparquet.R index 8b54564..cf86374 100644 --- a/R/geoparquet.R +++ b/R/geoparquet.R @@ -175,8 +175,9 @@ geoparquet_column_spec_from_type <- function(schema, add_geometry_types = NULL) parsed <- geoarrow_schema_parse(schema) spec$crs <- switch( - parsed$crs_type, + enum_label(parsed$crs_type, "CrsType"), "NONE" = NULL, + "UNKNOWN" = , "PROJJSON" = jsonlite::fromJSON(parsed$crs, simplifyVector = FALSE), { crs_info <- sanitize_crs(parsed$crs) diff --git a/tests/testthat/test-geoparquet.R b/tests/testthat/test-geoparquet.R index 1a5c807..6590a95 100644 --- a/tests/testthat/test-geoparquet.R +++ b/tests/testthat/test-geoparquet.R @@ -30,6 +30,11 @@ test_that("geoparquet_column_spec_from_type() works", { list(encoding = "WKB", geometry_types = list()) ) + # geoarrow type with crs + spec_crs <- geoparquet_column_spec_from_type(na_extension_wkb(crs = wk::wk_crs_longlat())) + expect_identical(spec_crs$encoding, "WKB") + expect_identical(spec_crs$crs$id, list(authority = "OGC", code = "CRS84")) + # geoarrow type with add_geometry_types = FALSE expect_identical( geoparquet_column_spec_from_type( @@ -65,7 +70,7 @@ test_that("geoparquet_column_spec_from_type() works", { list(encoding = "WKB", geometry_types = list("Point Z")) ) - # Also check geometry t ypes + # Also check geometry types expect_identical( geoparquet_column_spec_from_type(na_extension_geoarrow("LINESTRING")), list(encoding = "WKB", geometry_types = list("LineString")) @@ -91,6 +96,12 @@ test_that("geoparquet_column_spec_from_type() works", { list(encoding = "WKB", geometry_types = list("MultiPolygon")) ) + # Check unknown geometry type + expect_identical( + geoparquet_column_spec_from_type(na_extension_wkb()), + list(encoding = "WKB", geometry_types = list()) + ) + # Check edge types expect_identical( geoparquet_column_spec_from_type(na_extension_geoarrow("LINESTRING", edges = "SPHERICAL")), From b6a607b9a6a7fd9937d1c12f5a4ade930af8c4b6 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Jan 2024 22:20:05 -0400 Subject: [PATCH 4/6] test the columns generator --- R/geoparquet.R | 21 +++++++++++++----- tests/testthat/test-geoparquet.R | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/R/geoparquet.R b/R/geoparquet.R index cf86374..dd74392 100644 --- a/R/geoparquet.R +++ b/R/geoparquet.R @@ -143,21 +143,30 @@ geoparquet_guess_primary_geometry_column <- function(schema, primary_geometry_co } geoparquet_columns_from_schema <- function(schema, - geometry_columns, - primary_geometry_column, - add_geometry_types) { + geometry_columns = NULL, + primary_geometry_column = NULL, + add_geometry_types = NULL) { schema_children <- schema$children if (is.null(geometry_columns)) { geometry_columns <- character() - if (!is.null(primary_geometry_column)) { - geometry_columns <- union(geometry_columns, primary_geometry_column) - } is_geoarrow <- vapply(schema_children, is_geoarrow_schema, logical(1)) geometry_columns <- union(geometry_columns, names(schema_children)[is_geoarrow]) } + geometry_columns <- union(geometry_columns, primary_geometry_column) + + missing_geometry_columns <- setdiff(geometry_columns, names(schema_children)) + if (length(missing_geometry_columns) > 0) { + stop( + sprintf( + "Specified geometry_columns %s not found in source", + paste0("'", missing_geometry_columns, "'", collapse = ",") + ) + ) + } + lapply( schema_children[geometry_columns], geoparquet_column_spec_from_type, diff --git a/tests/testthat/test-geoparquet.R b/tests/testthat/test-geoparquet.R index 6590a95..534664f 100644 --- a/tests/testthat/test-geoparquet.R +++ b/tests/testthat/test-geoparquet.R @@ -23,6 +23,44 @@ test_that("write_geoparquet_table() can write a data.frame", { expect_identical(wk::as_wkt(wkb), wk::wkt("POINT (0 1)")) }) +test_that("geoparquet_columns_from_schema() always includes geometry_columns", { + schema <- nanoarrow::na_struct( + list( + geom = nanoarrow::na_string(), + geom2 = na_extension_wkt() + ) + ) + + expect_identical( + names(geoparquet_columns_from_schema(schema, geometry_columns = NULL)), + "geom2" + ) + + expect_identical( + names(geoparquet_columns_from_schema(schema, geometry_columns = "geom")), + "geom" + ) +}) + +test_that("geoparquet_columns_from_schema() always includes primary_geometry_column", { + schema <- nanoarrow::na_struct(list(geom = nanoarrow::na_string())) + + expect_identical( + names(geoparquet_columns_from_schema(schema, primary_geometry_column = NULL)), + character() + ) + + expect_identical( + names(geoparquet_columns_from_schema(schema, primary_geometry_column = "geom")), + "geom" + ) + + expect_error( + names(geoparquet_columns_from_schema(schema, primary_geometry_column = "not_a_col")), + "Specified geometry_columns" + ) +}) + test_that("geoparquet_column_spec_from_type() works", { # non-geoarrow type expect_identical( From ad7018c435b9dfe0d8850c0cd9c2aa5b0193436e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Jan 2024 22:30:38 -0400 Subject: [PATCH 5/6] check primary geometry column --- R/geoparquet.R | 2 +- tests/testthat/test-geoparquet.R | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/R/geoparquet.R b/R/geoparquet.R index dd74392..ed258a2 100644 --- a/R/geoparquet.R +++ b/R/geoparquet.R @@ -120,7 +120,7 @@ geoparquet_encode_chunked_array <- function(chunked_array_or_vctr, list(spec, item_out_chunked_array) } -geoparquet_guess_primary_geometry_column <- function(schema, primary_geometry_column) { +geoparquet_guess_primary_geometry_column <- function(schema, primary_geometry_column = NULL) { if (!is.null(primary_geometry_column)) { return(primary_geometry_column) } diff --git a/tests/testthat/test-geoparquet.R b/tests/testthat/test-geoparquet.R index 534664f..e65e95d 100644 --- a/tests/testthat/test-geoparquet.R +++ b/tests/testthat/test-geoparquet.R @@ -23,6 +23,37 @@ test_that("write_geoparquet_table() can write a data.frame", { expect_identical(wk::as_wkt(wkb), wk::wkt("POINT (0 1)")) }) +test_that("geoparquet_guess_primary_geometry_column() works", { + schema <- nanoarrow::na_struct(list(geometry = nanoarrow::na_string())) + expect_identical( + geoparquet_guess_primary_geometry_column(schema), + "geometry" + ) + + expect_identical( + geoparquet_guess_primary_geometry_column(schema, "something_else"), + "something_else" + ) + + schema <- nanoarrow::na_struct(list(geography = nanoarrow::na_string())) + expect_identical( + geoparquet_guess_primary_geometry_column(schema), + "geography" + ) + + schema <- nanoarrow::na_struct(list(something_else = na_extension_wkt())) + expect_identical( + geoparquet_guess_primary_geometry_column(schema), + "something_else" + ) + + schema <- nanoarrow::na_struct(list()) + expect_error( + geoparquet_guess_primary_geometry_column(schema), + "requires at least one geometry column" + ) +}) + test_that("geoparquet_columns_from_schema() always includes geometry_columns", { schema <- nanoarrow::na_struct( list( From ed1c4d0760a488b3d645aad7f17e356a59a8050c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 30 Jan 2024 22:43:00 -0400 Subject: [PATCH 6/6] more tests --- R/geoparquet.R | 6 +++--- tests/testthat/test-geoparquet.R | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/R/geoparquet.R b/R/geoparquet.R index ed258a2..5749f68 100644 --- a/R/geoparquet.R +++ b/R/geoparquet.R @@ -85,9 +85,9 @@ geoparquet_metadata_from_schema <- function(schema, geoparquet_encode_chunked_array <- function(chunked_array_or_vctr, spec, - add_geometry_types, - add_bbox, - check_wkb) { + add_geometry_types = NULL, + add_bbox = NULL, + check_wkb = NULL) { # Only WKB is currently supported if (spec$encoding == "WKB") { item_out_vctr <- as_geoarrow_vctr(chunked_array_or_vctr, schema = na_extension_wkb()) diff --git a/tests/testthat/test-geoparquet.R b/tests/testthat/test-geoparquet.R index e65e95d..e89288a 100644 --- a/tests/testthat/test-geoparquet.R +++ b/tests/testthat/test-geoparquet.R @@ -177,3 +177,22 @@ test_that("geoparquet_column_spec_from_type() works", { list(encoding = "WKB", geometry_types = list("LineString"), edges = "spherical") ) }) + +test_that("geoparquet_encode_chunked_array() works", { + skip_if_not(has_geoparquet_dependencies()) + + chunked <- geoparquet_encode_chunked_array("POINT (0 1)", list(encoding = "WKB")) + expect_identical( + chunked[[1]], + list(encoding = "WKB") + ) + expect_true(chunked[[2]]$type$Equals(arrow::binary())) + expect_equal(chunked[[2]]$length(), 1L) + + expect_error( + geoparquet_encode_chunked_array("POINT (0 1)", list(encoding = "Not valid")), + "Expected column encoding 'WKB'" + ) +}) + +