From 86fb413a47f402f9773c7ad3ddbde0fa52aacebb Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 May 2025 09:55:57 -0500 Subject: [PATCH 1/6] first pass on view types --- geoarrow-types/src/geoarrow/types/__init__.py | 4 ++++ .../src/geoarrow/types/constants.py | 10 +++++++- .../src/geoarrow/types/type_pyarrow.py | 16 ++++++++++--- .../src/geoarrow/types/type_spec.py | 24 ++++++++++++++++++- geoarrow-types/tests/test_type_pyarrow.py | 2 ++ 5 files changed, 51 insertions(+), 5 deletions(-) diff --git a/geoarrow-types/src/geoarrow/types/__init__.py b/geoarrow-types/src/geoarrow/types/__init__.py index 4dcd822..6cbea64 100644 --- a/geoarrow-types/src/geoarrow/types/__init__.py +++ b/geoarrow-types/src/geoarrow/types/__init__.py @@ -17,6 +17,8 @@ large_wkb, wkt, large_wkt, + wkb_view, + wkt_view, box, point, linestring, @@ -42,6 +44,8 @@ "large_wkb", "wkt", "large_wkt", + "wkb_view", + "wkt_view", "geoarrow", "box", "point", diff --git a/geoarrow-types/src/geoarrow/types/constants.py b/geoarrow-types/src/geoarrow/types/constants.py index 96478c4..94f68e8 100644 --- a/geoarrow-types/src/geoarrow/types/constants.py +++ b/geoarrow-types/src/geoarrow/types/constants.py @@ -91,7 +91,13 @@ class Encoding(TypeSpecEnum): LARGE_WKT = 4 """Well-known text encoding with 64-bit offsets""" - GEOARROW = 5 + WKB_VIEW = 5 + """Well-known binary encoding using binary views as a storage type""" + + WKT_VIEW = 6 + """Well-known text encoding using string views as a storage type""" + + GEOARROW = 7 """GeoArrow native nested list encoding""" def is_serialized(self): @@ -100,6 +106,8 @@ def is_serialized(self): Encoding.LARGE_WKB, Encoding.WKT, Encoding.LARGE_WKT, + Encoding.WKB_VIEW, + Encoding.WKT_VIEW ) diff --git a/geoarrow-types/src/geoarrow/types/type_pyarrow.py b/geoarrow-types/src/geoarrow/types/type_pyarrow.py index eea929d..cc15987 100644 --- a/geoarrow-types/src/geoarrow/types/type_pyarrow.py +++ b/geoarrow-types/src/geoarrow/types/type_pyarrow.py @@ -558,6 +558,10 
@@ def _parse_storage(storage_type): return [("string", ())] elif pa_types.is_large_string(storage_type): return [("large_string", ())] + elif hasattr(pa_types, "is_binary_view") and pa_types.is_binary_view(storage_type): + return [("binary_view", ())] + elif hasattr(pa_types, "is_string_view") and pa_types.is_string_view(storage_type): + return [("string_view", ())] elif pa_types.is_float64(storage_type): return [("double", ())] elif isinstance(storage_type, pa.ListType): @@ -925,9 +929,9 @@ def _add_union_types_to_native_storage_types(): for coord_type in ALL_COORD_TYPES: for dimension in ALL_DIMENSIONS: - _NATIVE_STORAGE_TYPES[ - (GeometryType.GEOMETRY, coord_type, dimension) - ] = _generate_union_storage(coord_type=coord_type, dimensions=[dimension]) + _NATIVE_STORAGE_TYPES[(GeometryType.GEOMETRY, coord_type, dimension)] = ( + _generate_union_storage(coord_type=coord_type, dimensions=[dimension]) + ) # With unknown dimensions, we reigster the massive catch-all union _NATIVE_STORAGE_TYPES[ @@ -1014,6 +1018,10 @@ def _spec_short_repr(spec, ext_name): Encoding.LARGE_WKB: pa.large_binary(), } +if hasattr(pa, "binary_view"): + _SERIALIZED_STORAGE_TYPES[Encoding.WKT_VIEW] = pa.string_view() + _SERIALIZED_STORAGE_TYPES[Encoding.WKB_VIEW] = pa.binary_view() + _NATIVE_STORAGE_TYPES = _generate_storage_types() _add_union_types_to_native_storage_types() @@ -1022,6 +1030,8 @@ def _spec_short_repr(spec, ext_name): ("large_binary",): Encoding.LARGE_WKB, ("string",): Encoding.WKT, ("large_string",): Encoding.LARGE_WKT, + ("binary_view",): Encoding.WKB_VIEW, + ("string_view",): Encoding.WKT_VIEW, ("struct",): TypeSpec( encoding=Encoding.GEOARROW, geometry_type=GeometryType.POINT, diff --git a/geoarrow-types/src/geoarrow/types/type_spec.py b/geoarrow-types/src/geoarrow/types/type_spec.py index 1e8440b..c8af38b 100644 --- a/geoarrow-types/src/geoarrow/types/type_spec.py +++ b/geoarrow-types/src/geoarrow/types/type_spec.py @@ -357,12 +357,22 @@ def wkb(*, edge_type=None, 
crs=crs.UNSPECIFIED) -> TypeSpec: def large_wkb(*, edge_type=None, crs=crs.UNSPECIFIED) -> TypeSpec: """Large well-known binary encoding - Create a :class:`TypeSpec` denoting a well-known binary type with + Create a :class:`TypeSpec` denoting a well-known binary type with 64-bit data offsets. See :func:`type_spec` for parameter definitions. """ return type_spec(encoding=Encoding.LARGE_WKB, edge_type=edge_type, crs=crs) +def wkb_view(*, edge_type=None, crs=crs.UNSPECIFIED) -> TypeSpec: + """Well-known binary view encoding + + Create a :class:`TypeSpec` denoting a well-known binary type using + binary views as the underlying storage type. See :func:`type_spec` + for parameter definitions. + """ + return type_spec(encoding=Encoding.WKB_VIEW, edge_type=edge_type, crs=crs) + + def wkt(*, edge_type=None, crs=crs.UNSPECIFIED) -> TypeSpec: """Well-known text encoding @@ -381,6 +391,16 @@ def large_wkt(*, edge_type=None, crs=crs.UNSPECIFIED) -> TypeSpec: return type_spec(encoding=Encoding.LARGE_WKT, edge_type=edge_type, crs=crs) +def wkt_view(*, edge_type=None, crs=crs.UNSPECIFIED) -> TypeSpec: + """Well-known text view encoding + + Create a :class:`TypeSpec` denoting a well-known text type using + string views as the underlying storage type. See :func:`type_spec` + for parameter definitions. 
+ """ + return type_spec(encoding=Encoding.WKT_VIEW, edge_type=edge_type, crs=crs) + + def geoarrow( *, geometry_type=None, @@ -619,6 +639,8 @@ def type_spec( Encoding.LARGE_WKB: "geoarrow.wkb", Encoding.WKT: "geoarrow.wkt", Encoding.LARGE_WKT: "geoarrow.wkt", + Encoding.WKB_VIEW: "geoarrow.wkb", + Encoding.WKT_VIEW: "geoarrow.wkt", } _GEOARROW_EXT_NAMES = { diff --git a/geoarrow-types/tests/test_type_pyarrow.py b/geoarrow-types/tests/test_type_pyarrow.py index 0e5d0e3..8aa17fe 100644 --- a/geoarrow-types/tests/test_type_pyarrow.py +++ b/geoarrow-types/tests/test_type_pyarrow.py @@ -441,6 +441,8 @@ def test_multipolygon_array_from_geobuffers(): gt.large_wkt(), gt.wkb(), gt.large_wkb(), + gt.wkt_view(), + gt.wkb_view(), # Geometry types gt.box(), gt.point(), From 86aa9bcdb372fb06d474887008325a8242f20dbf Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 May 2025 10:29:08 -0500 Subject: [PATCH 2/6] a few more view references --- .../src/geoarrow/pyarrow/__init__.py | 4 ++++ .../src/geoarrow/pyarrow/_type.py | 24 +++++++++++++++++++ geoarrow-pyarrow/tests/test_pyarrow.py | 2 ++ .../src/geoarrow/types/constants.py | 6 ++++- .../src/geoarrow/types/type_pyarrow.py | 6 ++--- 5 files changed, 38 insertions(+), 4 deletions(-) diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/__init__.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/__init__.py index b4d8ef4..aabe9eb 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/__init__.py +++ b/geoarrow-pyarrow/src/geoarrow/pyarrow/__init__.py @@ -30,8 +30,10 @@ MultiPolygonType, wkb, large_wkb, + wkb_view, wkt, large_wkt, + wkt_view, point, linestring, polygon, @@ -92,8 +94,10 @@ "MultiPolygonType", "wkb", "large_wkb", + "wkb_view", "wkt", "large_wkt", + "wkt_view", "point", "linestring", "polygon", diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py index 289e573..a0a9038 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py +++ 
b/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py @@ -40,6 +40,18 @@ def large_wkb() -> WkbType: return WkbType.__arrow_ext_deserialize__(pa.large_binary(), b"") +def wkb_view() -> WkbType: + """Well-known binary using binary views as the underlying storage. + + >>> import geoarrow.pyarrow as ga + >>> ga.wkb_view() + WkbType(geoarrow.wkb) + >>> ga.wkb().storage_type + DataType(binary_view) + """ + return WkbType.__arrow_ext_deserialize__(pa.binary_view(), b"") + + def wkt() -> WktType: """Well-known text with a maximum array size of 2 GB per chunk. @@ -64,6 +76,18 @@ def large_wkt() -> WktType: return WktType.__arrow_ext_deserialize__(pa.large_utf8(), b"") +def wkt_view() -> WktType: + """Well-known text using string views as the underlying storage. + + >>> import geoarrow.pyarrow as ga + >>> ga.wkt_view() + WktType(geoarrow.wkt) + >>> ga.wkt().storage_type + DataType(string_view) + """ + return WktType.__arrow_ext_deserialize__(pa.string_view(), b"") + + def point() -> PointType: """Geoarrow-encoded point features. 
diff --git a/geoarrow-pyarrow/tests/test_pyarrow.py b/geoarrow-pyarrow/tests/test_pyarrow.py index 6396741..b628a50 100644 --- a/geoarrow-pyarrow/tests/test_pyarrow.py +++ b/geoarrow-pyarrow/tests/test_pyarrow.py @@ -68,8 +68,10 @@ def test_type_with_crs_pyproj(): def test_constructors(): assert ga.wkb().extension_name == "geoarrow.wkb" assert ga.large_wkb().extension_name == "geoarrow.wkb" + assert ga.wkb_view().extension_name == "geoarrow.wkb" assert ga.wkt().extension_name == "geoarrow.wkt" assert ga.large_wkt().extension_name == "geoarrow.wkt" + assert ga.wkt_view().extension_name == "geoarrow.wkt" assert ga.point().extension_name == "geoarrow.point" assert ga.linestring().extension_name == "geoarrow.linestring" assert ga.polygon().extension_name == "geoarrow.polygon" diff --git a/geoarrow-types/src/geoarrow/types/constants.py b/geoarrow-types/src/geoarrow/types/constants.py index 94f68e8..7fd2c15 100644 --- a/geoarrow-types/src/geoarrow/types/constants.py +++ b/geoarrow-types/src/geoarrow/types/constants.py @@ -107,7 +107,7 @@ def is_serialized(self): Encoding.WKT, Encoding.LARGE_WKT, Encoding.WKB_VIEW, - Encoding.WKT_VIEW + Encoding.WKT_VIEW, ) @@ -268,10 +268,14 @@ class EdgeType(TypeSpecEnum): (Encoding.WKB, Encoding.LARGE_WKB): Encoding.LARGE_WKB, (Encoding.WKB, Encoding.WKT): Encoding.WKB, (Encoding.WKB, Encoding.LARGE_WKT): Encoding.LARGE_WKB, + (Encoding.WKB, Encoding.WKB_VIEW): Encoding.WKB_VIEW, (Encoding.WKB, Encoding.GEOARROW): Encoding.WKB, + (Encoding.WKB_VIEW, Encoding.LARGE_WKB): Encoding.WKB_VIEW, (Encoding.WKT, Encoding.LARGE_WKT): Encoding.LARGE_WKT, (Encoding.WKT, Encoding.LARGE_WKB): Encoding.LARGE_WKB, + (Encoding.WKT, Encoding.WKT_VIEW): Encoding.WKT_VIEW, (Encoding.WKT, Encoding.GEOARROW): Encoding.WKB, + (Encoding.WKT_VIEW, Encoding.LARGE_WKT): Encoding.WKT_VIEW, (GeometryType.POINT, GeometryType.MULTIPOINT): GeometryType.MULTIPOINT, ( GeometryType.LINESTRING, diff --git a/geoarrow-types/src/geoarrow/types/type_pyarrow.py 
b/geoarrow-types/src/geoarrow/types/type_pyarrow.py index cc15987..29fa09c 100644 --- a/geoarrow-types/src/geoarrow/types/type_pyarrow.py +++ b/geoarrow-types/src/geoarrow/types/type_pyarrow.py @@ -929,9 +929,9 @@ def _add_union_types_to_native_storage_types(): for coord_type in ALL_COORD_TYPES: for dimension in ALL_DIMENSIONS: - _NATIVE_STORAGE_TYPES[(GeometryType.GEOMETRY, coord_type, dimension)] = ( - _generate_union_storage(coord_type=coord_type, dimensions=[dimension]) - ) + _NATIVE_STORAGE_TYPES[ + (GeometryType.GEOMETRY, coord_type, dimension) + ] = _generate_union_storage(coord_type=coord_type, dimensions=[dimension]) # With unknown dimensions, we reigster the massive catch-all union _NATIVE_STORAGE_TYPES[ From d89b35818ccb41c73df96567aeda454bf2c48c0a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 May 2025 10:48:45 -0500 Subject: [PATCH 3/6] move geoarrow-c back in --- geoarrow-pyarrow/pyproject.toml | 5 ++--- geoarrow-pyarrow/tests/test_pyarrow.py | 8 ++++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/geoarrow-pyarrow/pyproject.toml b/geoarrow-pyarrow/pyproject.toml index 9203d26..bf25eff 100644 --- a/geoarrow-pyarrow/pyproject.toml +++ b/geoarrow-pyarrow/pyproject.toml @@ -23,11 +23,10 @@ description = "" authors = [{name = "Dewey Dunnington", email = "dewey@dunnington.ca"}] license = {text = "Apache-2.0"} requires-python = ">=3.8" -dependencies = ["pyarrow >= 14.0.2", "geoarrow-types"] +dependencies = ["pyarrow >= 14.0.2", "geoarrow-types", "geoarrow-c"] [project.optional-dependencies] -test = ["pytest", "pandas", "numpy", "geopandas", "pyogrio", "pyproj", "geoarrow-c"] -compute = ["geoarrow-c"] +test = ["pytest", "pandas", "numpy", "geopandas", "pyogrio", "pyproj"] [project.urls] homepage = "https://geoarrow.org" diff --git a/geoarrow-pyarrow/tests/test_pyarrow.py b/geoarrow-pyarrow/tests/test_pyarrow.py index b628a50..45afaa0 100644 --- a/geoarrow-pyarrow/tests/test_pyarrow.py +++ 
b/geoarrow-pyarrow/tests/test_pyarrow.py @@ -124,6 +124,10 @@ def test_array(): assert array.type == ga.large_wkt() assert array.type.storage_type == pa.large_utf8() + array = ga.array(["POINT (30 10)"], ga.wkt_view()) + assert array.type == ga.wkt_view() + assert array.type.storage_type == pa.string_view() + array = ga.array([wkb_item], ga.wkb()) assert array.type == ga.wkb() assert array.type.storage_type == pa.binary() @@ -132,6 +136,10 @@ def test_array(): assert array.type == ga.large_wkb() assert array.type.storage_type == pa.large_binary() + array = ga.array([wkb_item], ga.wkb_view()) + assert array.type == ga.wkb_view() + assert array.type.storage_type == pa.binary_view() + def test_array_repr(): array = ga.array(["POINT (30 10)"]) From 7d01dea0319ba92debcbabbbce63517c6cb7ae8e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 May 2025 14:07:45 -0500 Subject: [PATCH 4/6] maybe fix for old pyarrows --- geoarrow-pyarrow/tests/test_pyarrow.py | 18 ++++++++++++++---- geoarrow-types/tests/test_type_pyarrow.py | 6 ++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/geoarrow-pyarrow/tests/test_pyarrow.py b/geoarrow-pyarrow/tests/test_pyarrow.py index 45afaa0..b50b31a 100644 --- a/geoarrow-pyarrow/tests/test_pyarrow.py +++ b/geoarrow-pyarrow/tests/test_pyarrow.py @@ -124,10 +124,6 @@ def test_array(): assert array.type == ga.large_wkt() assert array.type.storage_type == pa.large_utf8() - array = ga.array(["POINT (30 10)"], ga.wkt_view()) - assert array.type == ga.wkt_view() - assert array.type.storage_type == pa.string_view() - array = ga.array([wkb_item], ga.wkb()) assert array.type == ga.wkb() assert array.type.storage_type == pa.binary() @@ -136,6 +132,20 @@ def test_array(): assert array.type == ga.large_wkb() assert array.type.storage_type == pa.large_binary() + +def test_array_view_types(): + # This one requires pyarrow >= 18, because that's when the necessary + # cast() was added. 
+ try: + pa.array(["foofy"]).cast(pa.string_view()) + except pa.lib.ArrowNotImplementedError: + pytest.skip("ga.array() with view types requires pyarrow >= 18.0.0") + + array = ga.array(["POINT (30 10)"], ga.wkt_view()) + assert array.type == ga.wkt_view() + assert array.type.storage_type == pa.string_view() + + wkb_item = b"\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3e\x40\x00\x00\x00\x00\x00\x00\x24\x40" array = ga.array([wkb_item], ga.wkb_view()) assert array.type == ga.wkb_view() assert array.type.storage_type == pa.binary_view() diff --git a/geoarrow-types/tests/test_type_pyarrow.py b/geoarrow-types/tests/test_type_pyarrow.py index 8aa17fe..9501935 100644 --- a/geoarrow-types/tests/test_type_pyarrow.py +++ b/geoarrow-types/tests/test_type_pyarrow.py @@ -472,6 +472,12 @@ def test_multipolygon_array_from_geobuffers(): ], ) def test_roundtrip_extension_type(spec): + if not hasattr(pa, "binary_view") and spec.encoding in ( + gt.Encoding.WKB_VIEW, + gt.Encoding.WKT_VIEW, + ): + pytest.skip("binary_view/string_view requires pyarrow >= 14") + extension_type = type_pyarrow.extension_type(spec) serialized = extension_type.__arrow_ext_serialize__() extension_type2 = type_pyarrow._deserialize_storage( From 486ec879945b1af0e8bcc9c01d6c9d7014e07b6b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 May 2025 14:11:22 -0500 Subject: [PATCH 5/6] doctests --- geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py index a0a9038..0eb1782 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py +++ b/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py @@ -46,7 +46,7 @@ def wkb_view() -> WkbType: >>> import geoarrow.pyarrow as ga >>> ga.wkb_view() WkbType(geoarrow.wkb) - >>> ga.wkb().storage_type + >>> ga.wkb_view().storage_type DataType(binary_view) """ return WkbType.__arrow_ext_deserialize__(pa.binary_view(), 
b"") @@ -82,7 +82,7 @@ def wkt_view() -> WktType: >>> import geoarrow.pyarrow as ga >>> ga.wkt_view() WktType(geoarrow.wkt) - >>> ga.wkt().storage_type + >>> ga.wkt_view().storage_type DataType(string_view) """ return WktType.__arrow_ext_deserialize__(pa.string_view(), b"") From d818f524163371808ab8b0da147e9ca85a7815c0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 26 May 2025 14:14:05 -0500 Subject: [PATCH 6/6] one more doctest --- geoarrow-types/src/geoarrow/types/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/geoarrow-types/src/geoarrow/types/constants.py b/geoarrow-types/src/geoarrow/types/constants.py index 7fd2c15..56c37b6 100644 --- a/geoarrow-types/src/geoarrow/types/constants.py +++ b/geoarrow-types/src/geoarrow/types/constants.py @@ -73,7 +73,7 @@ class Encoding(TypeSpecEnum): >>> from geoarrow import types >>> types.Encoding.GEOARROW - <Encoding.GEOARROW: 5> + <Encoding.GEOARROW: 7> """ UNSPECIFIED = 0