diff --git a/geoarrow-pandas/src/geoarrow/pandas/lib.py b/geoarrow-pandas/src/geoarrow/pandas/lib.py index 93d4b37..dc65e96 100644 --- a/geoarrow-pandas/src/geoarrow/pandas/lib.py +++ b/geoarrow-pandas/src/geoarrow/pandas/lib.py @@ -3,7 +3,7 @@ import pyarrow as _pa import pyarrow_hotfix as _ # noqa: F401 import numpy as _np -from geoarrow.c import lib +from geoarrow.types import TypeSpec, type_spec, Encoding import geoarrow.pyarrow as _ga @@ -83,8 +83,7 @@ class GeoArrowExtensionArray(_pd.api.extensions.ExtensionArray): def __init__(self, obj, type=None): if type is not None: self._dtype = GeoArrowExtensionDtype(type) - arrow_type = _ga.GeometryExtensionType._from_ctype(self._dtype._parent) - self._data = _ga.array(obj, arrow_type) + self._data = _ga.array(obj, self._dtype._parent) else: self._data = _ga.array(obj) self._dtype = GeoArrowExtensionDtype(self._data.type) @@ -247,8 +246,8 @@ def to_numpy(self, dtype=None, copy=False, na_value=None): return _np.array(list(self), dtype=object) - def __array__(self, dtype=None): - return self.to_numpy(dtype=dtype) + def __array__(self, dtype=None, copy=True): + return self.to_numpy(dtype=dtype, copy=copy) @_pd.api.extensions.register_extension_dtype @@ -271,20 +270,20 @@ class GeoArrowExtensionDtype(_pd.api.extensions.ExtensionDtype): def __init__(self, parent): if isinstance(parent, _ga.GeometryExtensionType): - self._parent = parent._type - elif isinstance(parent, lib.CVectorType): self._parent = parent + elif isinstance(parent, TypeSpec): + self._parent = _ga.extension_type(parent) elif isinstance(parent, GeoArrowExtensionDtype): self._parent = parent._parent else: raise TypeError( - "`geoarrow_type` must inherit from geoarrow.pyarrow.VectorType, " - "geoarrow.CVectorType, or geoarrow.pandas.GeoArrowExtensionDtype" + "`geoarrow_type` must be a pyarrow extension type, " + "geoarrow.types.TypeSpec, or geoarrow.pandas.GeoArrowExtensionDtype" ) @property def pyarrow_dtype(self): - return _ga.GeometryExtensionType._from_ctype(self._parent) + return self._parent @property def type(self): @@ -323,9 +322,9 @@ def construct_from_string(cls, string): if params["coord_type"] == "[interleaved]": coord_type = _ga.CoordType.INTERLEAVED elif params["type"] in ("wkt", "wkb"): - coord_type = _ga.CoordType.UNKNOWN + coord_type = _ga.CoordType.UNSPECIFIED else: - coord_type = _ga.CoordType.SEPARATE + coord_type = _ga.CoordType.SEPARATED if params["type"] == "point": geometry_type = _ga.GeometryType.POINT @@ -347,7 +346,9 @@ def construct_from_string(cls, string): elif params["type"] == "wkt": base_type = _ga.wkt() else: - base_type = _ga.extension_type(geometry_type, dims, coord_type) + base_type = _ga.extension_type( + type_spec(Encoding.GEOARROW, geometry_type, dims, coord_type) + ) try: if params["metadata"]: @@ -368,7 +369,7 @@ def __str__(self): ext_name = self._parent.extension_name ext_dims = self._parent.dimensions ext_coord = self._parent.coord_type - ext_meta = self._parent.extension_metadata.decode("UTF-8") + ext_meta = self._parent.__arrow_ext_serialize__().decode("UTF-8") if ext_dims == _ga.Dimensions.XYZ: dims_str = "[z]" @@ -440,7 +441,14 @@ def _wrap_series(self, array_or_chunked): ) def _obj_is_geoarrow(self): - return isinstance(self._obj.dtype, GeoArrowExtensionDtype) + if isinstance(self._obj.dtype, GeoArrowExtensionDtype): + return True + + if not isinstance(self._obj.dtype, _pd.ArrowDtype): + return False + + arrow_type = self._obj.dtype.pyarrow_dtype + return isinstance(arrow_type, _ga.GeometryExtensionType) def parse_all(self): """See :func:`geoarrow.pyarrow.parse_all`""" @@ -529,9 +537,9 @@ def with_edge_type(self, edge_type): """See :func:`geoarrow.pyarrow.with_edge_type`""" return self._wrap_series(_ga.with_edge_type(self._obj, edge_type)) - def with_crs(self, crs, crs_type=None): + def with_crs(self, crs): """See :func:`geoarrow.pyarrow.with_crs`""" - return self._wrap_series(_ga.with_crs(self._obj, crs=crs, crs_type=crs_type)) + return self._wrap_series(_ga.with_crs(self._obj, crs=crs)) def with_dimensions(self, dimensions): """See :func:`geoarrow.pyarrow.with_dimensions`""" diff --git a/geoarrow-pandas/tests/test_geoarrow_pandas.py b/geoarrow-pandas/tests/test_geoarrow_pandas.py index 533ccb3..cafc07d 100644 --- a/geoarrow-pandas/tests/test_geoarrow_pandas.py +++ b/geoarrow-pandas/tests/test_geoarrow_pandas.py @@ -6,7 +6,6 @@ import pyarrow as pa import geoarrow.pandas as gapd import geoarrow.pyarrow as ga -from geoarrow.c import lib import numpy as np @@ -18,10 +17,10 @@ def test_dtype_constructor(): from_pyarrow = gapd.GeoArrowExtensionDtype(ga.point()) assert from_pyarrow.name == "geoarrow.point" - from_ctype = gapd.GeoArrowExtensionDtype(ga.point()._type) - assert from_ctype.name == "geoarrow.point" + from_spec = gapd.GeoArrowExtensionDtype(ga.point().spec) + assert from_spec.name == "geoarrow.point" - from_dtype = gapd.GeoArrowExtensionDtype(from_ctype) + from_dtype = gapd.GeoArrowExtensionDtype(from_spec) assert from_dtype.name == "geoarrow.point" with pytest.raises(TypeError): @@ -34,8 +33,8 @@ def test_dtype_strings(): dtype2 = gapd.GeoArrowExtensionDtype.construct_from_string(str(dtype)) assert dtype2 == dtype - dtype = gapd.GeoArrowExtensionDtype(ga.point().with_crs("EPSG:1234")) - assert str(dtype) == 'geoarrow.point{"crs":"EPSG:1234"}' + dtype = gapd.GeoArrowExtensionDtype(ga.point().with_crs(ga.OGC_CRS84)) + assert str(dtype) == 'geoarrow.point{"crs": ' + ga.OGC_CRS84.to_json() + "}" dtype2 = gapd.GeoArrowExtensionDtype.construct_from_string(str(dtype)) assert dtype2 == dtype @@ -182,23 +181,10 @@ def test_array_concat(): assert len(concatenated_diff_type) == 6 -def test_pyarrow_integration(): - pa_array = ga.array(["POINT (0 1)", "POINT (1 2)", None]) - series = pa_array.to_pandas() - assert series.dtype == gapd.GeoArrowExtensionDtype(ga.wkt()) - assert series[0] == gapd.GeoArrowExtensionScalar("POINT (0 1)") - assert pa.array(series) is pa_array - - pa_chunked_array = pa.chunked_array([pa_array]) - series = pa_chunked_array.to_pandas() - assert series.dtype == gapd.GeoArrowExtensionDtype(ga.wkt()) - assert series[0] == gapd.GeoArrowExtensionScalar("POINT (0 1)") - - def test_accessor_parse_all(): series = pd.Series(["POINT (0 1)"]) assert series.geoarrow.parse_all() is series - with pytest.raises(lib.GeoArrowCException): + with pytest.raises(Exception, match="Expected geometry type at byte 0"): pd.Series(["NOT WKT"]).geoarrow.parse_all() @@ -278,8 +264,8 @@ def test_accessor_with_edge_type(): def test_accessor_with_crs(): - ga_series = pd.Series(["POINT (0 1)"]).geoarrow.with_crs("EPSG:1234") - assert ga_series.dtype.pyarrow_dtype.crs == "EPSG:1234" + ga_series = pd.Series(["POINT (0 1)"]).geoarrow.with_crs(ga.OGC_CRS84) + assert ga_series.dtype.pyarrow_dtype.crs == ga.OGC_CRS84 def test_accessor_with_dimensions(): diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/__init__.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/__init__.py index 1febaf3..b4d8ef4 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/__init__.py +++ b/geoarrow-pyarrow/src/geoarrow/pyarrow/__init__.py @@ -9,7 +9,14 @@ from geoarrow.types._version import __version__, __version_tuple__ # NOQA: F401 -from geoarrow.c.lib import GeometryType, Dimensions, CoordType, EdgeType, CrsType +from geoarrow.types import ( + GeometryType, + Dimensions, + CoordType, + EdgeType, + Encoding, + OGC_CRS84, +) from geoarrow.pyarrow._type import ( GeometryExtensionType, @@ -33,6 +40,9 @@ multipolygon, extension_type, geometry_type_common, +) + +from geoarrow.types.type_pyarrow import ( register_extension_types, unregister_extension_types, ) @@ -69,10 +79,11 @@ "Dimensions", "CoordType", "EdgeType", - "CrsType", + "Encoding", "GeometryExtensionType", "WktType", "WkbType", + "OGC_CRS84", "PointType", "LinestringType", "PolygonType", diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/_array.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/_array.py index 2a569c3..f210d53 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/_array.py +++ b/geoarrow-pyarrow/src/geoarrow/pyarrow/_array.py @@ -55,10 +55,10 @@ def __repr__(self): tail_str = [f"<{item.as_py()}>" for item in tail] for i in range(len(head)): if len(head_str[i]) > max_width: - head_str[i] = f"{head_str[i][:(max_width - 4)]}...>" + head_str[i] = f"{head_str[i][: (max_width - 4)]}...>" for i in range(len(tail)): if len(tail_str[i]) > max_width: - tail_str[i] = f"{tail_str[i][:(max_width - 4)]}...>" + tail_str[i] = f"{tail_str[i][: (max_width - 4)]}...>" type_name = type(self).__name__ head_str = "\n".join(head_str) @@ -138,7 +138,7 @@ def array(obj, type_=None, *args, **kwargs) -> GeometryExtensionArray: # Convert GeoPandas to WKB if type(obj).__name__ == "GeoSeries": if obj.crs: - type_ = wkb().with_crs(obj.crs.to_json(), lib.CrsType.PROJJSON) + type_ = wkb().with_crs(obj.crs) else: type_ = wkb() diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/_compute.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/_compute.py index 8028074..09f1a12 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/_compute.py +++ b/geoarrow-pyarrow/src/geoarrow/pyarrow/_compute.py @@ -1,7 +1,16 @@ import pyarrow as pa import pyarrow.compute as pc import pyarrow_hotfix as _ # noqa: F401 -from geoarrow.c.lib import CoordType, Dimensions, EdgeType, GeometryType + +from geoarrow.types import ( + type_spec, + Encoding, + CoordType, + Dimensions, + EdgeType, + GeometryType, + TypeSpec, +) from geoarrow.pyarrow import _type from geoarrow.pyarrow._array import array from geoarrow.pyarrow._kernel import Kernel @@ -93,8 +102,8 @@ def unique_geometry_types(obj): return pa.array( [ { - "geometry_type": obj.type.geometry_type, - "dimensions": obj.type.dimensions, + "geometry_type": obj.type.geometry_type.value, + "dimensions": obj.type.dimensions.value, } ], type=out_type, @@ -107,17 +116,18 @@ def unique_geometry_types(obj): py_geometry_types = [] for item in result: item_int = item.as_py() + if item_int >= 3000: - dimensions = Dimensions.XYZM + dimensions = Dimensions.XYZM.value item_int -= 3000 elif item_int >= 2000: - dimensions = Dimensions.XYM + dimensions = Dimensions.XYM.value item_int -= 2000 elif item_int >= 1000: - dimensions = Dimensions.XYZ + dimensions = Dimensions.XYZ.value item_int -= 1000 else: - dimensions = Dimensions.XY + dimensions = Dimensions.XY.value py_geometry_types.append({"geometry_type": item_int, "dimensions": dimensions}) @@ -146,7 +156,7 @@ def infer_type_common(obj, coord_type=None, promote_multi=False, _geometry_types return obj.type.with_coord_type(coord_type) if coord_type is None: - coord_type = CoordType.SEPARATE + coord_type = CoordType.SEPARATED if _geometry_types is None: types = unique_geometry_types(obj) @@ -159,53 +169,23 @@ def infer_type_common(obj, coord_type=None, promote_multi=False, _geometry_types types = types.flatten() - unique_dims = types[1].unique().to_pylist() - has_z = any(dim in (Dimensions.XYZ, Dimensions.XYZM) for dim in unique_dims) - has_m = any(dim in (Dimensions.XYM, Dimensions.XYZM) for dim in unique_dims) - if has_z and has_m: - dimensions = Dimensions.XYZM - elif has_z: - dimensions = Dimensions.XYZ - elif has_m: - dimensions = Dimensions.XYM - else: - dimensions = Dimensions.XY + dims = [Dimensions(dim) for dim in types[1].to_pylist()] + dims = Dimensions.common(*dims) - unique_geom_types = types[0].unique().to_pylist() - if len(unique_geom_types) == 1: - geometry_type = unique_geom_types[0] - elif all( - t in (GeometryType.POINT, GeometryType.MULTIPOINT) for t in unique_geom_types - ): - geometry_type = GeometryType.MULTIPOINT - elif all( - t in (GeometryType.LINESTRING, GeometryType.MULTILINESTRING) - for t in unique_geom_types - ): - geometry_type = GeometryType.MULTILINESTRING - elif all( - t in (GeometryType.POLYGON, GeometryType.MULTIPOLYGON) - for t in unique_geom_types - ): - geometry_type = GeometryType.MULTIPOLYGON - else: - return ( - _type.wkb() - .with_edge_type(obj.type.edge_type) - .with_crs(obj.type.crs, obj.type.crs_type) - ) + geometry_types = [ + GeometryType(geometry_type) for geometry_type in types[0].to_pylist() + ] + geometry_type = GeometryType.common(*geometry_types) - if promote_multi and geometry_type <= GeometryType.POLYGON: - geometry_type += 3 + if promote_multi and geometry_type.value in (1, 2, 3): + geometry_type = GeometryType(geometry_type.value + 3) - return _type.extension_type( - geometry_type, - dimensions, - coord_type, - edge_type=obj.type.edge_type, - crs=obj.type.crs, - crs_type=obj.type.crs_type, - ) + spec = TypeSpec.coalesce( + type_spec(Encoding.GEOARROW, dims, geometry_type, coord_type=coord_type), + obj.type.spec, + ).canonicalize() + + return _type.extension_type(spec) def as_wkt(obj): @@ -237,9 +217,7 @@ def as_wkb(obj, strict_iso_wkb=False): obj = as_geoarrow(obj, _type.wkb()) if check_wkb and strict_iso_wkb and _any_ewkb(obj): - return push_all( - Kernel.as_geoarrow, obj, args={"type_id": _type.wkb().geoarrow_id} - ) + return push_all(Kernel.as_geoarrow, obj, args={"type_id": 100001}) else: return obj @@ -283,10 +261,16 @@ def as_geoarrow(obj, type=None, coord_type=None, promote_multi=False): obj, coord_type=coord_type, promote_multi=promote_multi ) - if obj.type.geoarrow_id == type.geoarrow_id: + if obj.type.spec == type.spec: return obj - return push_all(Kernel.as_geoarrow, obj, args={"type_id": type.geoarrow_id}) + from geoarrow.c import lib + + cschema = lib.SchemaHolder() + type._export_to_c(cschema._addr()) + ctype = lib.CVectorType.FromExtension(cschema) + + return push_all(Kernel.as_geoarrow, obj, args={"type_id": ctype.id}) def format_wkt(obj, precision=None, max_element_size_bytes=None): @@ -313,7 +297,7 @@ def format_wkt(obj, precision=None, max_element_size_bytes=None): ) -def make_point(x, y, z=None, m=None, crs=None, crs_type=None): +def make_point(x, y, z=None, m=None, crs=None): """Create a geoarrow-encoded point array from two or more arrays representing x, y, and/or z, and/or m values. In many cases, this is a zero-copy operation if the input arrays are already in a @@ -343,7 +327,7 @@ def make_point(x, y, z=None, m=None, crs=None, crs_type=None): field_names = ["x", "y"] type = _type.extension_type( - GeometryType.POINT, dimensions, crs=crs, crs_type=crs_type + type_spec(Encoding.GEOARROW, GeometryType.POINT, dimensions, crs=crs) ) args = [x, y] + [el for el in [z, m] if el is not None] args = [pa.array(el, pa.float64()) for el in args] @@ -389,12 +373,16 @@ def box(obj): obj = obj_as_array_or_chunked(obj) # Spherical edges aren't supported by this algorithm - if obj.type.edge_type == EdgeType.SPHERICAL: - raise TypeError("Can't compute box of type with spherical edges") + if obj.type.edge_type != EdgeType.PLANAR: + raise TypeError("Can't compute box of type with non-planar edges") # Optimization: a box of points is just x, x, y, y with zero-copy # if the coord type is struct - if obj.type.coord_type == CoordType.SEPARATE and len(obj) > 0: + if ( + obj.type.coord_type == CoordType.SEPARATED + and len(obj) > 0 + and obj.null_count == 0 + ): if obj.type.geometry_type == GeometryType.POINT and isinstance( obj, pa.ChunkedArray ): @@ -439,7 +427,7 @@ def box_agg(obj): # Optimization: pyarrow's minmax kernel is fast and we can use it if we have struct # coords. So far, only a measurable improvement for points. - if obj.type.coord_type == CoordType.SEPARATE and len(obj) > 0: + if obj.type.coord_type == CoordType.SEPARATED and len(obj) > 0: if obj.type.geometry_type == GeometryType.POINT and isinstance( obj, pa.ChunkedArray ): @@ -525,16 +513,16 @@ def with_edge_type(obj, edge_type): return new_type.wrap_array(ensure_storage(obj)) -def with_crs(obj, crs, crs_type=None): +def with_crs(obj, crs): """Force a :class:`geoarrow.CrsType`/crs value on an array. >>> import geoarrow.pyarrow as ga - >>> ga.with_crs(["POINT (0 1)"], "EPSG:1234") - GeometryExtensionArray:WktType(geoarrow.wkt )[1] + >>> ga.with_crs(["POINT (0 1)"], ga.OGC_CRS84) + GeometryExtensionArray:WktType(geoarrow.wkt )[1] """ obj = obj_as_array_or_chunked(obj) - new_type = obj.type.with_crs(crs, crs_type) + new_type = obj.type.with_crs(crs) return new_type.wrap_array(ensure_storage(obj)) @@ -625,7 +613,15 @@ def to_geopandas(obj): import geopandas import pandas as pd - # Ideally we will avoid serialization via geobuffers + from_ragged_array() + # Attempt GeoPandas from_arrow first + try: + return geopandas.GeoSeries.from_arrow(obj) + except ValueError: + pass + except AttributeError: + pass + + # Fall back on wkb conversion wkb_array_or_chunked = as_wkb(obj) # Avoids copy on convert to pandas @@ -634,4 +630,8 @@ def to_geopandas(obj): dtype=pd.ArrowDtype(wkb_array_or_chunked.type.storage_type), ) - return geopandas.GeoSeries.from_wkb(wkb_pandas, crs=wkb_array_or_chunked.type.crs) + crs = wkb_array_or_chunked.type.crs + if crs is not None: + crs = crs.to_json() + + return geopandas.GeoSeries.from_wkb(wkb_pandas, crs=crs) diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/_scalar.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/_scalar.py index 0710b8b..ef30631 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/_scalar.py +++ b/geoarrow-pyarrow/src/geoarrow/pyarrow/_scalar.py @@ -1,7 +1,7 @@ import pyarrow as pa import pyarrow_hotfix as _ # noqa: F401 from geoarrow.pyarrow._kernel import Kernel -from geoarrow.pyarrow._type import GeometryExtensionType +from geoarrow.types.type_pyarrow import GeometryExtensionType class GeometryExtensionScalar(pa.ExtensionScalar): diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py index eb5789f..2448795 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py +++ b/geoarrow-pyarrow/src/geoarrow/pyarrow/_type.py @@ -1,468 +1,20 @@ -import json - +from typing import Iterable import pyarrow as pa import pyarrow_hotfix as _ # noqa: F401 -from geoarrow.c import lib - - -class GeometryExtensionType(pa.ExtensionType): - """Extension type base class for vector geometry types.""" - - _extension_name = None - - # These are injected into the class when imported by the type and scalar - # modules to avoid a circular import. As a result, you can't - # use this module directly (import geoarrow.pyarrow first). - _array_cls_from_name = None - _scalar_cls_from_name = None - - def __init__(self, c_vector_type): - if not isinstance(c_vector_type, lib.CVectorType): - raise TypeError( - "geoarrow.pyarrow.VectorType must be created from a CVectorType" - ) - self._type = c_vector_type - if self._type.extension_name != type(self)._extension_name: - raise ValueError( - f'Expected CVectorType with extension name "{type(self)._extension_name}" but got "{self._type.extension_name}"' - ) - - storage_schema = self._type.to_storage_schema() - storage_type = pa.DataType._import_from_c(storage_schema._addr()) - pa.ExtensionType.__init__(self, storage_type, self._type.extension_name) - - def __repr__(self): - return f"{type(self).__name__}({repr(self._type)})" - - def __arrow_ext_serialize__(self): - return self._type.extension_metadata - - @staticmethod - def _import_from_c(addr): - field = pa.Field._import_from_c(addr) - if not field.metadata or "ARROW:extension:name" not in field.metadata: - return field.type - - schema = lib.SchemaHolder() - field._export_to_c(schema._addr()) - - c_vector_type = lib.CVectorType.FromExtension(schema) - cls = type_cls_from_name(c_vector_type.extension_name.decode("UTF-8")) - cls(c_vector_type) - - @classmethod - def __arrow_ext_deserialize__(cls, storage_type, serialized): - schema = lib.SchemaHolder() - storage_type._export_to_c(schema._addr()) - - c_vector_type = lib.CVectorType.FromStorage( - schema, cls._extension_name.encode("UTF-8"), serialized - ) - - return cls(c_vector_type) - - @staticmethod - def _from_ctype(c_vector_type): - cls = type_cls_from_name(c_vector_type.extension_name) - schema = c_vector_type.to_schema() - storage_type = pa.DataType._import_from_c(schema._addr()) - return cls.__arrow_ext_deserialize__( - storage_type, c_vector_type.extension_metadata - ) - - def __arrow_ext_class__(self): - return GeometryExtensionType._array_cls_from_name(self.extension_name) - - def __arrow_ext_scalar_class__(self): - return GeometryExtensionType._scalar_cls_from_name(self.extension_name) - - def to_pandas_dtype(self): - from geoarrow.pandas import GeoArrowExtensionDtype - - return GeoArrowExtensionDtype(self) - - def from_geobuffers(self, *args, **kwargs): - """Create an array from the appropriate number of buffers - for this type. - """ - raise NotImplementedError() - - def _from_geobuffers_internal(self, args): - builder = lib.CBuilder(self._type.to_schema()) - for i, buf_type, buf in args: - if buf is None: - continue - if buf_type == "uint8": - builder.set_buffer_uint8(i, buf) - elif buf_type == "int32": - builder.set_buffer_int32(i, buf) - elif buf_type == "double": - builder.set_buffer_double(i, buf) - else: - raise ValueError(f"Unknown type: {buf_type}") - - carray = builder.finish() - return pa.Array._import_from_c(carray._addr(), self) - - @property - def geoarrow_id(self): - """A unique identifier for the memory layout of this type. - - >>> import geoarrow.pyarrow as ga - >>> int(ga.wkb().geoarrow_id) - 100001 - """ - return self._type.id - - @property - def geometry_type(self): - """The :class:`geoarrow.GeometryType` of this type or ``GEOMETRY`` for - types where this is not constant (i.e., WKT and WKB). - - >>> import geoarrow.pyarrow as ga - >>> ga.wkb().geometry_type == ga.GeometryType.GEOMETRY - True - >>> ga.linestring().geometry_type == ga.GeometryType.LINESTRING - True - """ - return self._type.geometry_type - - @property - def dimensions(self): - """The :class:`geoarrow.Dimensions` of this type or ``UNKNOWN`` for - types where this is not constant (i.e., WKT and WKT). - - >>> import geoarrow.pyarrow as ga - >>> ga.wkb().dimensions == ga.Dimensions.UNKNOWN - True - >>> ga.linestring().dimensions == ga.Dimensions.XY - True - """ - return self._type.dimensions - - @property - def coord_type(self): - """The :class:`geoarrow.CoordType` of this type. - - >>> import geoarrow.pyarrow as ga - >>> ga.linestring().coord_type == ga.CoordType.SEPARATE - True - >>> ga.linestring().with_coord_type(ga.CoordType.INTERLEAVED).coord_type - - """ - return self._type.coord_type - - @property - def edge_type(self): - """The :class:`geoarrow.EdgeType` of this type. - - >>> import geoarrow.pyarrow as ga - >>> ga.linestring().edge_type == ga.EdgeType.PLANAR - True - >>> ga.linestring().with_edge_type(ga.EdgeType.SPHERICAL).edge_type - - """ - return self._type.edge_type - - @property - def crs_type(self): - """The :class:`geoarrow.CrsType` of the :attr:`crs` value. - - >>> import geoarrow.pyarrow as ga - >>> ga.point().crs_type == ga.CrsType.NONE - True - >>> ga.point().with_crs("EPSG:1234").crs_type - - """ - return self._type.crs_type - - @property - def crs(self): - """The coordinate reference system of this type. - - >>> import geoarrow.pyarrow as ga - >>> ga.point().with_crs("EPSG:1234").crs - 'EPSG:1234' - """ - return self._type.crs.decode("UTF-8") - - def with_metadata(self, metadata): - """This type with the extension metadata (e.g., copied from some other type) - - >>> import geoarrow.pyarrow as ga - >>> ga.point().with_metadata('{"crs": "EPSG:1234"}').crs - 'EPSG:1234' - """ - if isinstance(metadata, str): - metadata = metadata.encode("UTF-8") - return type(self).__arrow_ext_deserialize__(self.storage_type, metadata) - - def with_geometry_type(self, geometry_type): - """Returns a new type with the specified :class:`geoarrow.GeometryType`. - - >>> import geoarrow.pyarrow as ga - >>> ga.point().with_geometry_type(ga.GeometryType.LINESTRING) - LinestringType(geoarrow.linestring) - """ - ctype = self._type.with_geometry_type(geometry_type) - return _ctype_to_extension_type(ctype) - - def with_dimensions(self, dimensions): - """Returns a new type with the specified :class:`geoarrow.Dimensions`. - - >>> import geoarrow.pyarrow as ga - >>> ga.point().with_dimensions(ga.Dimensions.XYZ) - PointType(geoarrow.point_z) - """ - ctype = self._type.with_dimensions(dimensions) - return _ctype_to_extension_type(ctype) - - def with_coord_type(self, coord_type): - """Returns a new type with the specified :class:`geoarrow.CoordType`. - - >>> import geoarrow.pyarrow as ga - >>> ga.point().with_coord_type(ga.CoordType.INTERLEAVED) - PointType(interleaved geoarrow.point) - """ - ctype = self._type.with_coord_type(coord_type) - return _ctype_to_extension_type(ctype) - - def with_edge_type(self, edge_type): - """Returns a new type with the specified :class:`geoarrow.EdgeType`. - - >>> import geoarrow.pyarrow as ga - >>> ga.linestring().with_edge_type(ga.EdgeType.SPHERICAL) - LinestringType(spherical geoarrow.linestring) - """ - ctype = self._type.with_edge_type(edge_type) - return _ctype_to_extension_type(ctype) - - def with_crs(self, crs, crs_type=None): - """Returns a new type with the specified coordinate reference system - :class:`geoarrow.CrsType` combination. The ``crs_type`` defaults to - ``NONE`` if ``crs`` is ``None``, otherwise ``UNKNOWN``. - - >>> import geoarrow.pyarrow as ga - >>> ga.linestring().with_crs("EPSG:1234") - LinestringType(geoarrow.linestring ) - """ - if crs is None: - crs = b"" - default_crs_type = lib.CrsType.NONE - elif isinstance(crs, str): - crs = crs.encode("UTF-8") - default_crs_type = lib.CrsType.UNKNOWN - elif isinstance(crs, bytes): - default_crs_type = lib.CrsType.UNKNOWN - elif isinstance(crs, dict): - crs = json.dumps(crs).encode("UTF-8") - default_crs_type = lib.CrsType.PROJJSON - elif hasattr(crs, "to_json"): - crs = crs.to_json().encode("UTF-8") - default_crs_type = lib.CrsType.PROJJSON - else: - raise TypeError("Unknown type for crs object") - - if crs_type is None: - crs_type = default_crs_type - - ctype = self._type.with_crs(crs, crs_type) - return _ctype_to_extension_type(ctype) - - -class WkbType(GeometryExtensionType): - """Extension type whose storage is a binary or large binary array of - well-known binary. Even though the draft specification currently mandates - ISO well-known binary, EWKB is supported by the parser. - """ - - _extension_name = "geoarrow.wkb" - - -class WktType(GeometryExtensionType): - """Extension type whose storage is a utf8 or large utf8 array of - well-known text. - """ - - _extension_name = "geoarrow.wkt" - - -class PointType(GeometryExtensionType): - """Extension type whose storage is an array of points stored - as either a struct with one child per dimension or a fixed-size - list whose single child is composed of interleaved ordinate values. - """ - - _extension_name = "geoarrow.point" - - def from_geobuffers(self, validity, x, y=None, z_or_m=None, m=None): - buffers = [ - (0, "uint8", validity), - (1, "double", x), - (2, "double", y), - (3, "double", z_or_m), - (4, "double", m), - ] - - return self._from_geobuffers_internal(buffers) - - -class LinestringType(GeometryExtensionType): - """Extension type whose storage is an array of linestrings stored - as a list of points as described in :class:`PointType`. - """ - - _extension_name = "geoarrow.linestring" - - def from_geobuffers(self, validity, coord_offsets, x, y=None, z_or_m=None, m=None): - buffers = [ - (0, "uint8", validity), - (1, "int32", coord_offsets), - (2, "double", x), - (3, "double", y), - (4, "double", z_or_m), - (5, "double", m), - ] - - return self._from_geobuffers_internal(buffers) - - -class PolygonType(GeometryExtensionType): - """Extension type whose storage is an array of polygons stored - as a list of a list of points as described in :class:`PointType`. - """ - - _extension_name = "geoarrow.polygon" - - def from_geobuffers( - self, validity, ring_offsets, coord_offsets, x, y=None, z_or_m=None, m=None - ): - buffers = [ - (0, "uint8", validity), - (1, "int32", ring_offsets), - (2, "int32", coord_offsets), - (3, "double", x), - (4, "double", y), - (5, "double", z_or_m), - (6, "double", m), - ] - - return self._from_geobuffers_internal(buffers) - -class MultiPointType(GeometryExtensionType): - """Extension type whose storage is an array of polygons stored - as a list of points as described in :class:`PointType`. - """ - - _extension_name = "geoarrow.multipoint" - - def from_geobuffers(self, validity, coord_offsets, x, y=None, z_or_m=None, m=None): - buffers = [ - (0, "uint8", validity), - (1, "int32", coord_offsets), - (2, "double", x), - (3, "double", y), - (4, "double", z_or_m), - (5, "double", m), - ] - - return self._from_geobuffers_internal(buffers) - - -class MultiLinestringType(GeometryExtensionType): - """Extension type whose storage is an array of multilinestrings stored - as a list of a list of points as described in :class:`PointType`. - """ - - _extension_name = "geoarrow.multilinestring" - - def from_geobuffers( - self, - validity, - linestring_offsets, - coord_offsets, - x, - y=None, - z_or_m=None, - m=None, - ): - buffers = [ - (0, "uint8", validity), - (1, "int32", linestring_offsets), - (2, "int32", coord_offsets), - (3, "double", x), - (4, "double", y), - (5, "double", z_or_m), - (6, "double", m), - ] - - return self._from_geobuffers_internal(buffers) - - -class MultiPolygonType(GeometryExtensionType): - """Extension type whose storage is an array of multilinestrings stored - as a list of a list of a list of points as described in :class:`PointType`. - """ - - _extension_name = "geoarrow.multipolygon" - - def from_geobuffers( - self, - validity, - polygon_offsets, - ring_offsets, - coord_offsets, - x, - y=None, - z_or_m=None, - m=None, - ): - buffers = [ - (0, "uint8", validity), - (1, "int32", polygon_offsets), - (2, "int32", ring_offsets), - (3, "int32", coord_offsets), - (4, "double", x), - (5, "double", y), - (6, "double", z_or_m), - (7, "double", m), - ] - - return self._from_geobuffers_internal(buffers) - - -def type_cls_from_name(name): - if name == "geoarrow.wkb": - return WkbType - elif name == "geoarrow.wkt": - return WktType - elif name == "geoarrow.point": - return PointType - elif name == "geoarrow.linestring": - return LinestringType - elif name == "geoarrow.polygon": - return PolygonType - elif name == "geoarrow.multipoint": - return MultiPointType - elif name == "geoarrow.multilinestring": - return MultiLinestringType - elif name == "geoarrow.multipolygon": - return MultiPolygonType - else: - raise ValueError(f'Expected valid extension name but got "{name}"') - - -def _ctype_to_extension_type(ctype): - cls = type_cls_from_name(ctype.extension_name) - return cls(ctype) - - -def _make_default(geometry_type, cls): - ctype = lib.CVectorType.Make( - geometry_type, lib.Dimensions.XY, lib.CoordType.SEPARATE - ) - return cls(ctype) +from geoarrow import types +from geoarrow.types.type_pyarrow import ( + GeometryExtensionType, + PointType, + LinestringType, + PolygonType, + MultiPointType, + MultiLinestringType, + MultiPolygonType, + WkbType, + WktType, + extension_type, +) def wkb() -> WkbType: @@ -520,21 +72,21 @@ def point() -> PointType: >>> ga.point() PointType(geoarrow.point) >>> ga.point().storage_type - StructType(struct) + StructType(struct) """ - return _make_default(lib.GeometryType.POINT, PointType) + return extension_type(types.point()) -def linestring() -> PointType: +def linestring() -> LinestringType: """Geoarrow-encoded line features. >>> import geoarrow.pyarrow as ga >>> ga.linestring() LinestringType(geoarrow.linestring) >>> ga.linestring().storage_type - ListType(list>) + ListType(list not null>) """ - return _make_default(lib.GeometryType.LINESTRING, LinestringType) + return extension_type(types.linestring()) def polygon() -> PolygonType: @@ -544,9 +96,9 @@ def polygon() -> PolygonType: >>> ga.polygon() PolygonType(geoarrow.polygon) >>> ga.polygon().storage_type - ListType(list>>) + ListType(list not null> not null>) """ - return _make_default(lib.GeometryType.POLYGON, PolygonType) + return extension_type(types.polygon()) def multipoint() -> MultiPointType: @@ -556,9 +108,9 @@ def multipoint() -> MultiPointType: >>> ga.multipoint() MultiPointType(geoarrow.multipoint) >>> ga.multipoint().storage_type - ListType(list>) + ListType(list not null>) """ - return _make_default(lib.GeometryType.MULTIPOINT, MultiPointType) + return extension_type(types.multipoint()) def multilinestring() -> MultiLinestringType: @@ -568,9 +120,9 @@ def multilinestring() -> MultiLinestringType: >>> ga.multilinestring() MultiLinestringType(geoarrow.multilinestring) >>> ga.multilinestring().storage_type - ListType(list>>) + ListType(list not null> not null>) """ - return _make_default(lib.GeometryType.MULTILINESTRING, MultiLinestringType) + return extension_type(types.multilinestring()) def multipolygon() -> MultiPolygonType: @@ -580,55 +132,14 @@ def multipolygon() -> MultiPolygonType: >>> ga.multipolygon() MultiPolygonType(geoarrow.multipolygon) >>> ga.multipolygon().storage_type - ListType(list>>>) + ListType(list not null> not null> not null>) """ - return _make_default(lib.GeometryType.MULTIPOLYGON, MultiPolygonType) + return extension_type(types.multipolygon()) -def extension_type( - geometry_type, - dimensions=lib.Dimensions.XY, - coord_type=lib.CoordType.SEPARATE, - edge_type=lib.EdgeType.PLANAR, - crs=None, - crs_type=None, +def geometry_type_common( + type_objects: Iterable[GeometryExtensionType], ) -> GeometryExtensionType: - """Generic vector geometry type constructor. - - >>> import geoarrow.pyarrow as ga - >>> ga.extension_type(ga.GeometryType.POINT, crs="EPSG:1234") - PointType(geoarrow.point ) - """ - ctype = lib.CVectorType.Make(geometry_type, dimensions, coord_type) - cls = type_cls_from_name(ctype.extension_name) - return cls(ctype).with_edge_type(edge_type).with_crs(crs, crs_type) - - -def _vector_type_common2(a, b): - if not isinstance(a, GeometryExtensionType) or not isinstance( - b, GeometryExtensionType - ): - raise ValueError( - f"Can't compute common type between '{a}' and '{b}': non-geometry type" - ) - - if a == b: - return a - - # This computation doesn't handle non-equal metadata (crs, edge type) - metadata_a = a._type.extension_metadata - metadata_b = b._type.extension_metadata - if metadata_a != metadata_b: - raise ValueError( - f"Can't compute common type between '{a}' and '{b}': metadata not equal" - ) - - # TODO: There are a number of other things we can try (e.g., promote multi) - # For now, just use wkb() if the types aren't exactly the same - return wkb().with_metadata(metadata_a) - - -def geometry_type_common(types): """Compute common type From a sequence of GeoArrow types, return a type to which all can be cast @@ -640,89 +151,14 @@ def geometry_type_common(types): >>> ga.geometry_type_common([ga.point(), ga.point()]) PointType(geoarrow.point) """ - types = list(types) + type_objects = list(type_objects) - if len(types) == 0: + if len(type_objects) == 0: # Would be nice to have an empty type option here return wkb() - elif len(types) == 1: - return types[0] + elif len(type_objects) == 1: + return type_objects[0] - for i in reversed(range(len(types) - 1)): - types[i] = _vector_type_common2(types[i], types[i + 1]) - - return types[0] - - -_extension_types_registered = False - - -def register_extension_types(lazy=True): - """Register the extension types in the geoarrow namespace with the pyarrow - registry. This enables geoarrow types to be read, written, imported, and - exported like any other Arrow type. - """ - global _extension_types_registered - - if lazy and _extension_types_registered is True: - return - - _extension_types_registered = None - - all_types = [ - wkt(), - wkb(), - point(), - linestring(), - polygon(), - multipoint(), - multilinestring(), - multipolygon(), - ] - - n_registered = 0 - for t in all_types: - try: - pa.register_extension_type(t) - n_registered += 1 - except pa.ArrowException: - pass - - if n_registered != len(all_types): - raise RuntimeError("Failed to register one or more extension types") - - _extension_types_registered = True - - -def unregister_extension_types(lazy=True): - """Unregister extension types in the geoarrow namespace.""" - global _extension_types_registered - - if lazy and _extension_types_registered is False: - return - - _extension_types_registered = None - - all_type_names = [ - "geoarrow.wkb", - "geoarrow.wkt", - "geoarrow.point", - "geoarrow.linestring", - "geoarrow.polygon", - "geoarrow.multipoint", - "geoarrow.multilinestring", - "geoarrow.multipolygon", - ] - - n_unregistered = 0 - for t_name in all_type_names: - try: - pa.unregister_extension_type(t_name) - n_unregistered += 1 - except pa.ArrowException: - pass - - if n_unregistered != len(all_type_names): - raise RuntimeError("Failed to unregister one or more extension types") - - _extension_types_registered = False + specs = [t.spec for t in type_objects] + spec = types.TypeSpec.common(*specs).canonicalize() + return extension_type(spec) diff --git a/geoarrow-pyarrow/src/geoarrow/pyarrow/io.py b/geoarrow-pyarrow/src/geoarrow/pyarrow/io.py index ad7cf2a..593fdca 100644 --- a/geoarrow-pyarrow/src/geoarrow/pyarrow/io.py +++ b/geoarrow-pyarrow/src/geoarrow/pyarrow/io.py @@ -240,10 +240,7 @@ def _geoparquet_chunked_array_to_geoarrow(item, spec): if encoding in ("WKB", "WKT"): item = _ga.array(item) elif encoding in _GEOARROW_ENCODINGS: - extension_name = "geoarrow." + encoding - type = _type.type_cls_from_name(extension_name).__arrow_ext_deserialize__( - item.type, b"" - ) + type = _GEOARROW_ENCODINGS[encoding].__arrow_ext_deserialize__(item.type, b"") item = type.wrap_array(item) else: raise ValueError(f"Invalid GeoParquet encoding value: '{encoding}'") @@ -309,20 +306,17 @@ def _geoparquet_column_spec_from_type(type, add_geometry_types=None, encoding=No # Pass along extra information from GeoArrow extension type metadata if isinstance(type, _ga.GeometryExtensionType): # If encoding is unspecified and data is already geoarrow, don't serialize to WKB - if encoding is None and type.coord_type != _ga.CoordType.UNKNOWN: + if encoding is None and type.coord_type != _ga.CoordType.UNSPECIFIED: spec["encoding"] = geoparquet_encoding_geoarrow() - if type.crs_type == _ga.CrsType.PROJJSON: - spec["crs"] = json.loads(type.crs) - elif type.crs_type == _ga.CrsType.NONE: + crs = type.crs + if crs is None: spec["crs"] = None else: - import pyproj - - spec["crs"] = pyproj.CRS(type.crs).to_json_dict() + spec["crs"] = type.crs.to_json_dict() - if type.edge_type == _ga.EdgeType.SPHERICAL: - spec["edges"] = "spherical" + if type.edge_type != _ga.EdgeType.PLANAR: + spec["edges"] = type.edge_type.name.lower() # GeoArrow-encoded types can confidently declare a single geometry type maybe_known_geometry_type = type.geometry_type @@ -332,8 +326,10 @@ def _geoparquet_column_spec_from_type(type, add_geometry_types=None, encoding=No and maybe_known_geometry_type != _ga.GeometryType.GEOMETRY and maybe_known_dimensions != _ga.Dimensions.UNKNOWN ): - geometry_type = _GEOPARQUET_GEOMETRY_TYPE_LABELS[maybe_known_geometry_type] - dimensions = _GEOPARQUET_DIMENSION_LABELS[maybe_known_dimensions] + geometry_type = _GEOPARQUET_GEOMETRY_TYPE_LABELS[ + maybe_known_geometry_type.value + ] + dimensions = _GEOPARQUET_DIMENSION_LABELS[maybe_known_dimensions.value] spec["geometry_types"] = [f"{geometry_type}{dimensions}"] if spec["encoding"] is None: @@ -432,15 +428,17 @@ def _geoparquet_encode_chunked_array( unique_geometry_types = None geoarrow_type = None inferred_geoarrow_encoding = None - if spec["encoding"] in (_GEOARROW_ENCODINGS + (geoparquet_encoding_geoarrow(),)): + if spec["encoding"] in ( + tuple(_GEOARROW_ENCODINGS.keys()) + (geoparquet_encoding_geoarrow(),) + ): unique_geometry_types = _ga.unique_geometry_types(item) geoarrow_type = _ga.infer_type_common( item, - coord_type=_ga.CoordType.SEPARATE, + coord_type=_ga.CoordType.SEPARATED, _geometry_types=unique_geometry_types, ) - if geoarrow_type.coord_type == _ga.CoordType.UNKNOWN: + if geoarrow_type.coord_type == _ga.CoordType.UNSPECIFIED: raise ValueError( "Can't encode column with incompatable geometry types as geoarrow" ) @@ -514,14 +512,14 @@ def geoparquet_encoding_geoarrow(): _GEOPARQUET_DIMENSION_LABELS = [None, "", " Z", " M", " ZM"] -_GEOARROW_ENCODINGS = ( - "point", - "linestring", - "polygon", - "multipoint", - "multilinestring", - "multipolygon", -) +_GEOARROW_ENCODINGS = { + "point": _type.point(), + "linestring": _type.linestring(), + "polygon": _type.polygon(), + "multipoint": _type.multipoint(), + "multilinestring": _type.multilinestring(), + "multipolygon": _type.multipolygon(), +} _CRS_LONLAT = { "$schema": "https://proj.org/schemas/v0.7/projjson.schema.json", diff --git a/geoarrow-pyarrow/tests/test_compute.py b/geoarrow-pyarrow/tests/test_compute.py index 08ebdca..a6f46e4 100644 --- a/geoarrow-pyarrow/tests/test_compute.py +++ b/geoarrow-pyarrow/tests/test_compute.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from geoarrow import types import geoarrow.pyarrow as ga import geoarrow.pyarrow._kernel as _kernel import geoarrow.pyarrow._compute as _compute @@ -112,8 +113,8 @@ def test_format_wkt(): def test_unique_geometry_types(): ga_array = ga.as_geoarrow(pa.array([], type=pa.utf8()), ga.point()) out = _compute.unique_geometry_types(ga_array).flatten() - assert out[0] == pa.array([ga.GeometryType.POINT], type=pa.int32()) - assert out[1] == pa.array([ga.Dimensions.XY], type=pa.int32()) + assert out[0] == pa.array([ga.GeometryType.POINT.value], type=pa.int32()) + assert out[1] == pa.array([ga.Dimensions.XY.value], type=pa.int32()) wkt_array = ga.array( [ @@ -127,20 +128,20 @@ def test_unique_geometry_types(): out = _compute.unique_geometry_types(wkt_array).flatten() assert out[0] == pa.array( [ - ga.GeometryType.MULTIPOINT, - ga.GeometryType.POLYGON, - ga.GeometryType.LINESTRING, - ga.GeometryType.POINT, + ga.GeometryType.MULTIPOINT.value, + ga.GeometryType.POLYGON.value, + ga.GeometryType.LINESTRING.value, + ga.GeometryType.POINT.value, ], type=pa.int32(), ) assert out[1] == pa.array( [ - ga.Dimensions.XY, - ga.Dimensions.XYZ, - ga.Dimensions.XYM, - ga.Dimensions.XYZM, + ga.Dimensions.XY.value, + ga.Dimensions.XYZ.value, + ga.Dimensions.XYM.value, + ga.Dimensions.XYZM.value, ], type=pa.int32(), ) @@ -153,85 +154,73 @@ def test_infer_type_common(): already_geoarrow = ga.as_geoarrow(["POINT (0 1)"]) common = _compute.infer_type_common(already_geoarrow) - assert common.geoarrow_id == already_geoarrow.type.geoarrow_id + assert common.spec == already_geoarrow.type.spec common_interleaved = _compute.infer_type_common( already_geoarrow, coord_type=ga.CoordType.INTERLEAVED ) assert ( - common_interleaved.geoarrow_id - == already_geoarrow.type.with_coord_type(ga.CoordType.INTERLEAVED).geoarrow_id + common_interleaved.spec + == already_geoarrow.type.with_coord_type(ga.CoordType.INTERLEAVED).spec ) - point = ga.wkt().with_crs("EPSG:1234").wrap_array(pa.array(["POINT (0 1)"])) + point = ga.wkt().with_crs(types.OGC_CRS84).wrap_array(pa.array(["POINT (0 1)"])) common = _compute.infer_type_common(point) - assert common.geoarrow_id == ga.point().geoarrow_id - assert common.crs == "EPSG:1234" + assert common.spec == ga.point().with_crs(types.OGC_CRS84).spec common_promote_multi = _compute.infer_type_common(point, promote_multi=True) - assert common_promote_multi.geoarrow_id == ga.multipoint().geoarrow_id + assert common_promote_multi.spec == ga.multipoint().with_crs(types.OGC_CRS84).spec point_z_and_zm = ga.array(["POINT (0 1)", "POINT ZM (0 1 2 3)"]) common = _compute.infer_type_common(point_z_and_zm) - assert ( - common.geoarrow_id == ga.point().with_dimensions(ga.Dimensions.XYZM).geoarrow_id - ) + assert common.spec == ga.point().with_dimensions(ga.Dimensions.XYZM).spec point_m_and_z = ga.array(["POINT M (0 1 2)", "POINT Z (0 1 2)"]) common = _compute.infer_type_common(point_m_and_z) - assert ( - common.geoarrow_id == ga.point().with_dimensions(ga.Dimensions.XYZM).geoarrow_id - ) + assert common.spec == ga.point().with_dimensions(ga.Dimensions.XYZM).spec mixed = ( ga.wkt() - .with_crs("EPSG:1234") + .with_crs(types.OGC_CRS84) .wrap_array(pa.array(["POINT (0 1)", "LINESTRING (0 1, 2 3)"])) ) common = _compute.infer_type_common(mixed) - assert common.geoarrow_id == ga.wkb().geoarrow_id - assert common.crs == "EPSG:1234" + assert common.spec == ga.wkb().with_crs(types.OGC_CRS84).spec point_and_multi = ga.array(["POINT (0 1)", "MULTIPOINT (2 3)"]) common = _compute.infer_type_common(point_and_multi) - assert common.geoarrow_id == ga.multipoint().geoarrow_id + assert common.spec == ga.multipoint().spec linestring_and_multi = ga.array( ["LINESTRING (0 1, 2 3)", "MULTILINESTRING ((0 1, 2 3))"] ) common = _compute.infer_type_common(linestring_and_multi) - assert common.geoarrow_id == ga.multilinestring().geoarrow_id + assert common.spec == ga.multilinestring().spec polygon_and_multi = ga.array( ["POLYGON ((0 0, 0 1, 1 0, 0 0))", "MULTIPOLYGON (((0 0, 0 1, 1 0, 0 0)))"] ) common = _compute.infer_type_common(polygon_and_multi) - assert common.geoarrow_id == ga.multipolygon().geoarrow_id + assert common.spec == ga.multipolygon().spec def test_as_geoarrow(): array = _compute.as_geoarrow(["POINT (0 1)"]) - assert array.type.geoarrow_id == ga.point().geoarrow_id + assert array.type.spec == ga.point().spec array2 = _compute.as_geoarrow(array) assert array2 is array array2 = _compute.as_geoarrow(array, coord_type=ga.CoordType.INTERLEAVED) - assert ( - array2.type.geoarrow_id - == ga.point().with_coord_type(ga.CoordType.INTERLEAVED).geoarrow_id - ) + assert array2.type.spec == ga.point().with_coord_type(ga.CoordType.INTERLEAVED).spec array = _compute.as_geoarrow(["POINT (0 1)"], coord_type=ga.CoordType.INTERLEAVED) - assert ( - array.type.geoarrow_id - == ga.point().with_coord_type(ga.CoordType.INTERLEAVED).geoarrow_id - ) + assert array.type.spec == ga.point().with_coord_type(ga.CoordType.INTERLEAVED).spec array = _compute.as_geoarrow(["POINT (0 1)"], type=ga.multipoint()) - assert array.type.geoarrow_id == ga.multipoint().geoarrow_id + assert array.type.spec == ga.multipoint().spec array = _compute.as_geoarrow(["POINT (0 1)", "LINESTRING (0 1, 2 3)"]) - assert array.type.geoarrow_id == ga.wkb().geoarrow_id + assert array.type.spec == ga.wkb().spec def test_make_point(): @@ -272,8 +261,8 @@ def test_make_point(): "POINT ZM (3 6 9 12)", ] - xy_crs = _compute.make_point(xs, ys, crs="EPSG:1234") - assert xy_crs.type.crs == "EPSG:1234" + xy_crs = _compute.make_point(xs, ys, crs=types.OGC_CRS84) + assert xy_crs.type.crs.to_json_dict() == types.OGC_CRS84.to_json_dict() def test_box(): @@ -357,17 +346,16 @@ def test_with_edge_type(): def test_with_crs(): storage_array = pa.array(["POINT (0 1)", "POINT (2 3)"]) - crsified = _compute.with_crs(storage_array, "EPSG:1234") + crsified = _compute.with_crs(storage_array, types.OGC_CRS84) assert isinstance(crsified.type, ga.WktType) - assert crsified.type.crs == "EPSG:1234" + assert crsified.type.crs.to_json_dict() == types.OGC_CRS84.to_json_dict() crsnope = _compute.with_crs(crsified, None) - assert crsnope.type.crs == "" - assert crsnope.type.crs_type == ga.CrsType.NONE + assert crsnope.type.crs is None crsnope_chunked = pa.chunked_array([crsnope]) - crsified_chunked = _compute.with_crs(crsnope_chunked, "EPSG:1234") - assert crsified_chunked.type.crs == "EPSG:1234" + crsified_chunked = _compute.with_crs(crsnope_chunked, types.OGC_CRS84) + assert crsified_chunked.type.crs.to_json_dict() == types.OGC_CRS84.to_json_dict() def test_with_coord_type(): @@ -375,8 +363,8 @@ def test_with_coord_type(): with_interleaved = _compute.with_coord_type(wkt_array, ga.CoordType.INTERLEAVED) assert with_interleaved.type.coord_type == ga.CoordType.INTERLEAVED - with_struct = _compute.with_coord_type(with_interleaved, ga.CoordType.SEPARATE) - assert with_struct.type.coord_type == ga.CoordType.SEPARATE + with_struct = _compute.with_coord_type(with_interleaved, ga.CoordType.SEPARATED) + assert with_struct.type.coord_type == ga.CoordType.SEPARATED def test_with_dimensions(): diff --git a/geoarrow-pyarrow/tests/test_geopandas.py b/geoarrow-pyarrow/tests/test_geopandas.py index fbf25e6..f167b35 100644 --- a/geoarrow-pyarrow/tests/test_geopandas.py +++ b/geoarrow-pyarrow/tests/test_geopandas.py @@ -1,4 +1,5 @@ import pytest +from geoarrow import types import geoarrow.pyarrow as ga @@ -9,8 +10,7 @@ def test_from_geopandas(): geoseries = geopandas.GeoSeries.from_wkt(["POINT (30 10)"]).set_crs("OGC:CRS84") array = ga.array(geoseries) assert isinstance(array.type, ga.WkbType) - assert array.type.crs_type == ga.CrsType.PROJJSON - assert "CRS84" in array.type.crs + assert "CRS84" in repr(array.type.crs) assert ga.format_wkt(array)[0].as_py() == "POINT (30 10)" @@ -32,7 +32,7 @@ def test_to_geopandas(): def test_to_geopandas_with_crs(): - array = ga.with_crs(ga.array(["POINT (30 10)"]), "OGC:CRS84") + array = ga.with_crs(ga.array(["POINT (30 10)"]), types.OGC_CRS84) geoseries = ga.to_geopandas(array) assert isinstance(geoseries, geopandas.GeoSeries) assert len(geoseries) == 1 diff --git a/geoarrow-pyarrow/tests/test_io.py b/geoarrow-pyarrow/tests/test_io.py index 5f41fb5..1ef5d83 100644 --- a/geoarrow-pyarrow/tests/test_io.py +++ b/geoarrow-pyarrow/tests/test_io.py @@ -5,6 +5,7 @@ import pyarrow as pa from pyarrow import parquet +from geoarrow import types import geoarrow.pyarrow as ga from geoarrow.pyarrow import io @@ -45,7 +46,9 @@ def test_write_geoparquet_table_default(): io.write_geoparquet_table(tab, temp_pq, geometry_encoding=None) tab2 = parquet.read_table(temp_pq) assert b"geo" in tab2.schema.metadata - assert tab2.schema.types[0] == ga.point().storage_type + ga.as_wkt(ga.point().wrap_array(tab2["geometry"])).to_pylist() == [ + "POINT (0 1)" + ] def test_write_geoparquet_table_wkb(): @@ -72,7 +75,9 @@ def test_write_geoparquet_table_geoarrow(): meta = json.loads(tab2.schema.metadata[b"geo"]) assert meta["version"] == "1.1.0" assert meta["columns"]["geometry"]["encoding"] == "point" - assert tab2.schema.types[0] == ga.point().storage_type + ga.as_wkt(ga.point().wrap_array(tab2["geometry"])).to_pylist() == [ + "POINT (0 1)" + ] def test_read_geoparquet_table_wkb(): @@ -136,15 +141,9 @@ def test_geoparquet_column_spec_from_type_crs(): assert spec_none["crs"] is None spec_projjson = io._geoparquet_column_spec_from_type( - ga.wkb().with_crs("{}", ga.CrsType.PROJJSON) + ga.wkb().with_crs(types.OGC_CRS84) ) - assert spec_projjson["crs"] == {} - - pytest.importorskip("pyproj") - spec_not_projjson = io._geoparquet_column_spec_from_type( - ga.wkb().with_crs("OGC:CRS84") - ) - assert spec_not_projjson["crs"]["id"]["code"] == "CRS84" + assert spec_projjson["crs"]["id"]["code"] == "CRS84" def test_geoparquet_column_spec_from_type_edges(): @@ -352,13 +351,12 @@ def test_chunked_array_to_geoarrow_crs(): item_missing_crs = io._geoparquet_chunked_array_to_geoarrow( item_binary, {"encoding": "WKB"} ) - assert item_missing_crs.type.crs_type == ga.CrsType.PROJJSON + assert item_missing_crs.type.crs.to_json_dict() == types.OGC_CRS84.to_json_dict() item_explicit_crs = io._geoparquet_chunked_array_to_geoarrow( item_binary, {"encoding": "WKB", "crs": {}} ) - assert item_explicit_crs.type.crs_type == ga.CrsType.PROJJSON - assert item_explicit_crs.type.crs == "{}" + assert item_explicit_crs.type.crs.to_json_dict() == {} def test_chunked_array_to_geoarrow_edges(): @@ -390,7 +388,7 @@ def test_table_to_geoarrow(): tab_geo = io._geoparquet_table_to_geoarrow(tab, {"col_name": {"encoding": "WKB"}}) assert "col_name" in tab_geo.schema.names assert isinstance(tab_geo["col_name"].type, ga.GeometryExtensionType) - assert tab_geo["col_name"].type.crs_type == ga.CrsType.PROJJSON + assert tab_geo["col_name"].type.crs.to_json_dict() == types.OGC_CRS84.to_json_dict() # Check with no columns selected tab_no_cols = tab.drop_columns(["col_name"]) diff --git a/geoarrow-pyarrow/tests/test_pyarrow.py b/geoarrow-pyarrow/tests/test_pyarrow.py index d0152ba..9b45156 100644 --- a/geoarrow-pyarrow/tests/test_pyarrow.py +++ b/geoarrow-pyarrow/tests/test_pyarrow.py @@ -1,5 +1,4 @@ import sys -import json import re from math import inf @@ -8,6 +7,7 @@ import pytest import geoarrow.c.lib as lib +from geoarrow import types import geoarrow.pyarrow as ga import geoarrow.pyarrow._type as _type import geoarrow.pyarrow._array as _array @@ -18,31 +18,23 @@ def test_version(): def test_geometry_type_basic(): - ctype = lib.CVectorType.Make( - ga.GeometryType.POINT, ga.Dimensions.XY, ga.CoordType.SEPARATE - ) - - pa_type = _type.PointType(ctype) + pa_type = _type.point() assert pa_type.geometry_type == ga.GeometryType.POINT assert pa_type.dimensions == ga.Dimensions.XY - assert pa_type.coord_type == ga.CoordType.SEPARATE + assert pa_type.coord_type == ga.CoordType.SEPARATED expected_storage = pa.struct( - [pa.field("x", pa.float64()), pa.field("y", pa.float64())] + [ + pa.field("x", pa.float64(), nullable=False), + pa.field("y", pa.float64(), nullable=False), + ] ) assert pa_type.storage_type == expected_storage - with pytest.raises(ValueError): - _type.LinestringType(ctype) - def test_geometry_type_with(): - ctype = lib.CVectorType.Make( - ga.GeometryType.POINT, ga.Dimensions.XY, ga.CoordType.SEPARATE - ) - - type_obj = _type.PointType(ctype) + type_obj = _type.point() type_linestring = type_obj.with_geometry_type(ga.GeometryType.LINESTRING) assert type_linestring.geometry_type == ga.GeometryType.LINESTRING @@ -57,37 +49,8 @@ def test_geometry_type_with(): assert type_spherical.edge_type == ga.EdgeType.SPHERICAL # Explicit type - type_crs = type_obj.with_crs("EPSG:1234", ga.CrsType.UNKNOWN) - assert type_crs.crs_type == ga.CrsType.UNKNOWN - assert type_crs.crs == "EPSG:1234" - - type_crs = type_obj.with_crs(b"EPSG:1234", ga.CrsType.UNKNOWN) - assert type_crs.crs_type == ga.CrsType.UNKNOWN - assert type_crs.crs == "EPSG:1234" - - type_crs = type_obj.with_crs({}, ga.CrsType.UNKNOWN) - assert type_crs.crs_type == ga.CrsType.UNKNOWN - assert type_crs.crs == "{}" - - type_crs = type_obj.with_crs("{}", ga.CrsType.PROJJSON) - assert type_crs.crs_type == ga.CrsType.PROJJSON - assert type_crs.crs == "{}" - - with pytest.raises(TypeError, match="Unknown type for crs object"): - type_obj.with_crs(pa.array([]), ga.CrsType.UNKNOWN) - - # Implicit type - type_crs = type_obj.with_crs("EPSG:1234") - assert type_crs.crs_type == ga.CrsType.UNKNOWN - assert type_crs.crs == "EPSG:1234" - - type_crs = type_obj.with_crs(b"EPSG:1234") - assert type_crs.crs_type == ga.CrsType.UNKNOWN - assert type_crs.crs == "EPSG:1234" - - type_crs = type_obj.with_crs({}) - assert type_crs.crs_type == ga.CrsType.PROJJSON - assert type_crs.crs == "{}" + type_crs = type_obj.with_crs(types.OGC_CRS84) + assert type_crs.crs == types.OGC_CRS84 def test_type_with_crs_pyproj(): @@ -96,14 +59,7 @@ def test_type_with_crs_pyproj(): # Implicit type type_crs = type_obj.with_crs(pyproj.CRS("EPSG:32620")) - assert type_crs.crs_type == ga.CrsType.PROJJSON - crs_dict = json.loads(type_crs.crs) - assert crs_dict["id"]["code"] == 32620 - - # Explicit type - type_crs = type_obj.with_crs(pyproj.CRS("EPSG:32620"), ga.CrsType.PROJJSON) - assert type_crs.crs_type == ga.CrsType.PROJJSON - crs_dict = json.loads(type_crs.crs) + crs_dict = type_crs.crs.to_json_dict() assert crs_dict["id"]["code"] == 32620 @@ -120,18 +76,20 @@ def test_constructors(): assert ga.multipolygon().extension_name == "geoarrow.multipolygon" generic = ga.extension_type( - ga.GeometryType.POINT, - ga.Dimensions.XYZ, - ga.CoordType.INTERLEAVED, - ga.EdgeType.SPHERICAL, - "EPSG:1234", + types.type_spec( + ga.Encoding.GEOARROW, + ga.GeometryType.POINT, + ga.Dimensions.XYZ, + ga.CoordType.INTERLEAVED, + ga.EdgeType.SPHERICAL, + crs=types.OGC_CRS84, + ) ) assert generic.geometry_type == ga.GeometryType.POINT assert generic.dimensions == ga.Dimensions.XYZ assert generic.coord_type == ga.CoordType.INTERLEAVED assert generic.edge_type == ga.EdgeType.SPHERICAL - assert generic.crs == "EPSG:1234" - assert generic.crs_type == ga.CrsType.UNKNOWN + assert generic.crs == types.OGC_CRS84 def test_type_common(): @@ -141,29 +99,6 @@ def test_type_common(): assert ga.geometry_type_common([ga.point(), ga.linestring()]) == ga.wkb() -def test_register_extension_types(): - # Unregistering once is ok - ga.unregister_extension_types(lazy=False) - - # Unregistering twice with lazy=True is ok - ga.unregister_extension_types(lazy=True) - - # Unregistering twice with lazy=False is not - with pytest.raises(RuntimeError): - ga.unregister_extension_types(lazy=False) - - # Same concept with registration - ga.register_extension_types(lazy=False) - ga.register_extension_types(lazy=True) - with pytest.raises(RuntimeError): - ga.register_extension_types(lazy=False) - - # Reset state - ga.unregister_extension_types() - ga.register_extension_types() - assert _type._extension_types_registered is True - - def test_array(): array = ga.array(["POINT (30 10)"]) assert array.type == ga.wkt() @@ -267,29 +202,29 @@ def test_kernel_void(): def test_kernel_as(): - array = ga.array(["POINT (30 10)"], ga.wkt().with_crs("EPSG:1234")) + array = ga.array(["POINT (30 10)"], ga.wkt().with_crs(types.OGC_CRS84)) kernel = ga.Kernel.as_wkt(array.type) out = kernel.push(array) assert out.type.extension_name == "geoarrow.wkt" - assert out.type.crs == "EPSG:1234" + assert out.type.crs.to_json_dict() == types.OGC_CRS84.to_json_dict() assert isinstance(out, _array.GeometryExtensionArray) - array = ga.array(["POINT (30 10)"], ga.wkt().with_crs("EPSG:1234")) + array = ga.array(["POINT (30 10)"], ga.wkt().with_crs(types.OGC_CRS84)) kernel = ga.Kernel.as_wkb(array.type) out = kernel.push(array) assert out.type.extension_name == "geoarrow.wkb" - assert out.type.crs == "EPSG:1234" + assert out.type.crs.to_json_dict() == types.OGC_CRS84.to_json_dict() assert isinstance(out, _array.GeometryExtensionArray) if sys.byteorder == "little": wkb_item = b"\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3e\x40\x00\x00\x00\x00\x00\x00\x24\x40" assert out[0].as_py() == wkb_item - array = ga.array(["POINT (30 10)"], ga.wkt().with_crs("EPSG:1234")) - kernel = ga.Kernel.as_geoarrow(array.type, ga.point().geoarrow_id) + array = ga.array(["POINT (30 10)"], ga.wkt().with_crs(types.OGC_CRS84)) + kernel = ga.Kernel.as_geoarrow(array.type, 1) out = kernel.push(array) assert out.type.extension_name == "geoarrow.point" - assert out.type.crs == "EPSG:1234" + assert out.type.crs.to_json_dict() == types.OGC_CRS84.to_json_dict() assert isinstance(out, _array.PointArray) diff --git a/geoarrow-types/src/geoarrow/types/crs.py b/geoarrow-types/src/geoarrow/types/crs.py index 10f9db9..2377c68 100644 --- a/geoarrow-types/src/geoarrow/types/crs.py +++ b/geoarrow-types/src/geoarrow/types/crs.py @@ -80,6 +80,16 @@ def __init__(self, obj: Union[Crs, Mapping, str, bytes]) -> None: "ProjJsonCrs can only be created from Crs, dict, str, or bytes" ) + def __eq__(self, value): + # Some duplication with _crs_equal(), but using that here + # involves some recursion that's difficult to avoid + if isinstance(value, UnspecifiedCrs): + return False + elif hasattr(value, "to_json_dict"): + return self.to_json_dict() == value.to_json_dict() + else: + return False + def to_json(self) -> str: if self._str is None: self._str = json.dumps(self._obj) @@ -98,7 +108,7 @@ def __repr__(self) -> str: if "id" in crs_dict: crs_id = crs_dict["id"] if "authority" in crs_id and "code" in crs_id: - return f'ProjJsonCrs({crs_id["authority"]}:{crs_id["code"]})' + return f"ProjJsonCrs({crs_id['authority']}:{crs_id['code']})" except ValueError: pass diff --git a/geoarrow-types/src/geoarrow/types/type_pyarrow.py b/geoarrow-types/src/geoarrow/types/type_pyarrow.py index c616615..d871665 100644 --- a/geoarrow-types/src/geoarrow/types/type_pyarrow.py +++ b/geoarrow-types/src/geoarrow/types/type_pyarrow.py @@ -19,6 +19,8 @@ class GeometryExtensionType(pa.ExtensionType): """Extension type base class for vector geometry types.""" _extension_name = None + _array_cls_from_name = None + _scalar_cls_from_name = None def __init__( self, spec: TypeSpec, *, storage_type=None, validate_storage_type=True @@ -46,7 +48,7 @@ def __init__( pa.ExtensionType.__init__(self, storage_type, self._spec.extension_name()) def __repr__(self): - return f"{type(self).__name__}({repr(self._spec)})" + return f"{type(self).__name__}({_spec_short_repr(self.spec, self._extension_name)})" def __arrow_ext_serialize__(self): return self._spec.extension_metadata().encode() @@ -62,6 +64,46 @@ def to_pandas_dtype(self): return ArrowDtype(self) + def __arrow_ext_class__(self): + if GeometryExtensionType._array_cls_from_name: + return GeometryExtensionType._array_cls_from_name(self.extension_name) + else: + return super().__arrow_ext_class__() + + def __arrow_ext_scalar_class__(self): + if GeometryExtensionType._scalar_cls_from_name: + return GeometryExtensionType._scalar_cls_from_name(self.extension_name) + else: + return super().__arrow_ext_scalar_class__() + + def from_geobuffers(self, *args, **kwargs): + """Create an array from the appropriate number of buffers + for this type. + """ + raise NotImplementedError() + + def wrap_array(self, storage): + # Often this storage has the correct type except for nullable/ + # non/nullable-ness of children. First check for the easy case + # (exactly correct storage type). + if storage.type == self.storage_type: + return super().wrap_array(storage) + + # A cast won't work because pyarrow won't cast nullable to + # non-nullable; however, we can attempt to export to C and + # reimport against this after making sure that the storage parses + # to the appropriate geometry type. + + # Handle ChunkedArray + if isinstance(storage, pa.ChunkedArray): + chunks = [self.wrap_array(chunk) for chunk in storage.chunks] + return pa.chunked_array(chunks, self) + + _, c_array = storage.__arrow_c_array__() + c_schema = self.storage_type.__arrow_c_schema__() + storage = pa.Array._import_from_c_capsule(c_schema, c_array) + return super().wrap_array(storage) + @property def spec(self) -> TypeSpec: return self._spec @@ -101,10 +143,10 @@ def coord_type(self) -> CoordType: """The :class:`CoordType` of this type. >>> import geoarrow.pyarrow as ga - >>> ga.linestring().coord_type == ga.CoordType.SEPARATE + >>> ga.linestring().coord_type == ga.CoordType.SEPARATED True >>> ga.linestring().with_coord_type(ga.CoordType.INTERLEAVED).coord_type - + """ return self._spec.coord_type @@ -116,7 +158,7 @@ def edge_type(self) -> EdgeType: >>> ga.linestring().edge_type == ga.EdgeType.PLANAR True >>> ga.linestring().with_edge_type(ga.EdgeType.SPHERICAL).edge_type - + """ return self._spec.edge_type @@ -125,11 +167,72 @@ def crs(self) -> Optional[Crs]: """The coordinate reference system of this type. >>> import geoarrow.pyarrow as ga - >>> ga.point().with_crs("EPSG:1234").crs - 'EPSG:1234' + >>> ga.point().with_crs(ga.OGC_CRS84).crs + ProjJsonCrs(OGC:CRS84) """ return self._spec.crs + def with_metadata(self, metadata): + """This type with the extension metadata (e.g., copied from some other type) + >>> import geoarrow.pyarrow as ga + >>> ga.linestring().with_metadata('{"edges": "spherical"}').edge_type + + """ + if isinstance(metadata, str): + metadata = metadata.encode("UTF-8") + return type(self).__arrow_ext_deserialize__(self.storage_type, metadata) + + def with_geometry_type(self, geometry_type): + """Returns a new type with the specified :class:`geoarrow.GeometryType`. + >>> import geoarrow.pyarrow as ga + >>> ga.point().with_geometry_type(ga.GeometryType.LINESTRING) + LinestringType(geoarrow.linestring) + """ + spec = type_spec(Encoding.GEOARROW, geometry_type=geometry_type) + spec = TypeSpec.coalesce(spec, self.spec).canonicalize() + return extension_type(spec) + + def with_dimensions(self, dimensions): + """Returns a new type with the specified :class:`geoarrow.Dimensions`. + >>> import geoarrow.pyarrow as ga + >>> ga.point().with_dimensions(ga.Dimensions.XYZ) + PointType(geoarrow.point_z) + """ + spec = type_spec(dimensions=dimensions) + spec = TypeSpec.coalesce(spec, self.spec).canonicalize() + return extension_type(spec) + + def with_coord_type(self, coord_type): + """Returns a new type with the specified :class:`geoarrow.CoordType`. + >>> import geoarrow.pyarrow as ga + >>> ga.point().with_coord_type(ga.CoordType.INTERLEAVED) + PointType(interleaved geoarrow.point) + """ + spec = type_spec(coord_type=coord_type) + spec = TypeSpec.coalesce(spec, self.spec).canonicalize() + return extension_type(spec) + + def with_edge_type(self, edge_type): + """Returns a new type with the specified :class:`geoarrow.EdgeType`. + >>> import geoarrow.pyarrow as ga + >>> ga.linestring().with_edge_type(ga.EdgeType.SPHERICAL) + LinestringType(spherical geoarrow.linestring) + """ + spec = type_spec(edge_type=edge_type) + spec = TypeSpec.coalesce(spec, self.spec).canonicalize() + return extension_type(spec) + + def with_crs(self, crs): + """Returns a new type with the specified coordinate reference system + :class:`geoarrow.CrsType` combination. + >>> import geoarrow.pyarrow as ga + >>> ga.linestring().with_crs(ga.OGC_CRS84) + LinestringType(geoarrow.linestring ) + """ + spec = type_spec(crs=crs) + spec = TypeSpec.coalesce(spec, self.spec).canonicalize() + return extension_type(spec) + class WkbType(GeometryExtensionType): """Extension type whose storage is a binary or large binary array of @@ -156,6 +259,10 @@ class PointType(GeometryExtensionType): _extension_name = "geoarrow.point" + def from_geobuffers(self, validity, x, y=None, z_or_m=None, m=None): + storage = _from_buffers_point(self.storage_type, validity, x, y, z_or_m, m) + return self.wrap_array(storage) + class LinestringType(GeometryExtensionType): """Extension type whose storage is an array of linestrings stored @@ -164,6 +271,12 @@ class LinestringType(GeometryExtensionType): _extension_name = "geoarrow.linestring" + def from_geobuffers(self, validity, coord_offsets, x, y=None, z_or_m=None, m=None): + storage = _from_buffers_linestring( + self.storage_type, validity, coord_offsets, x, y, z_or_m, m + ) + return self.wrap_array(storage) + class PolygonType(GeometryExtensionType): """Extension type whose storage is an array of polygons stored @@ -172,6 +285,14 @@ class PolygonType(GeometryExtensionType): _extension_name = "geoarrow.polygon" + def from_geobuffers( + self, validity, ring_offsets, coord_offsets, x, y=None, z_or_m=None, m=None + ): + storage = _from_buffers_polygon( + self.storage_type, validity, ring_offsets, coord_offsets, x, y, z_or_m, m + ) + return self.wrap_array(storage) + class MultiPointType(GeometryExtensionType): """Extension type whose storage is an array of polygons stored @@ -180,6 +301,12 @@ class MultiPointType(GeometryExtensionType): _extension_name = "geoarrow.multipoint" + def from_geobuffers(self, validity, coord_offsets, x, y=None, z_or_m=None, m=None): + storage = _from_buffers_linestring( + self.storage_type, validity, coord_offsets, x, y, z_or_m, m + ) + return self.wrap_array(storage) + class MultiLinestringType(GeometryExtensionType): """Extension type whose storage is an array of multilinestrings stored @@ -188,6 +315,28 @@ class MultiLinestringType(GeometryExtensionType): _extension_name = "geoarrow.multilinestring" + def from_geobuffers( + self, + validity, + linestring_offsets, + coord_offsets, + x, + y=None, + z_or_m=None, + m=None, + ): + storage = _from_buffers_polygon( + self.storage_type, + validity, + linestring_offsets, + coord_offsets, + x, + y, + z_or_m, + m, + ) + return self.wrap_array(storage) + class MultiPolygonType(GeometryExtensionType): """Extension type whose storage is an array of multilinestrings stored @@ -196,6 +345,30 @@ class MultiPolygonType(GeometryExtensionType): _extension_name = "geoarrow.multipolygon" + def from_geobuffers( + self, + validity, + polygon_offsets, + ring_offsets, + coord_offsets, + x, + y=None, + z_or_m=None, + m=None, + ): + storage = _from_buffers_multipolygon( + self.storage_type, + validity, + polygon_offsets, + ring_offsets, + coord_offsets, + x, + y, + z_or_m, + m, + ) + return self.wrap_array(storage) + def extension_type( spec: TypeSpec, storage_type=None, validate_storage_type=True @@ -479,6 +652,94 @@ def _nested_type(coord, names): return coord +def _from_buffer_ordinate(x): + mv = memoryview(x) + if mv.format != "d": + mv = mv.cast("d") + + return pa.array(mv, pa.float64()) + + +def _pybuffer_offset(x): + mv = memoryview(x) + if mv.format != "i": + mv = mv.cast("i") + + return len(mv), pa.py_buffer(mv) + + +def _from_buffers_point(type_, validity, x, y=None, z_or_m=None, m=None): + validity = pa.py_buffer(validity) if validity is not None else None + children = [_from_buffer_ordinate(x)] + if y is not None: + children.append(_from_buffer_ordinate(y)) + if z_or_m is not None: + children.append(_from_buffer_ordinate(z_or_m)) + if m is not None: + children.append(_from_buffer_ordinate(m)) + + if pa_types.is_fixed_size_list(type_): + length = len(x) // type_.list_size + else: + length = len(x) + + return pa.Array.from_buffers(type_, length, buffers=[validity], children=children) + + +def _from_buffers_linestring( + type_, validity, coord_offsets, x, y=None, z_or_m=None, m=None +): + validity = pa.py_buffer(validity) if validity is not None else None + n_offsets, coord_offsets = _pybuffer_offset(coord_offsets) + coords = _from_buffers_point(type_.field(0).type, None, x, y, z_or_m, m) + return pa.Array.from_buffers( + type_, + n_offsets - 1, + buffers=[validity, pa.py_buffer(coord_offsets)], + children=[coords], + ) + + +def _from_buffers_polygon( + type_, validity, ring_offsets, coord_offsets, x, y=None, z_or_m=None, m=None +): + validity = pa.py_buffer(validity) if validity is not None else None + rings = _from_buffers_linestring( + type_.field(0).type, None, coord_offsets, x, y, z_or_m, m + ) + n_offsets, ring_offsets = _pybuffer_offset(ring_offsets) + return pa.Array.from_buffers( + type_, + n_offsets - 1, + buffers=[validity, pa.py_buffer(ring_offsets)], + children=[rings], + ) + + +def _from_buffers_multipolygon( + type_, + validity, + polygon_offsets, + ring_offsets, + coord_offsets, + x, + y=None, + z_or_m=None, + m=None, +): + validity = pa.py_buffer(validity) if validity is not None else None + polygons = _from_buffers_polygon( + type_.field(0).type, None, ring_offsets, coord_offsets, x, y, z_or_m, m + ) + n_offsets, polygon_offsets = _pybuffer_offset(polygon_offsets) + return pa.Array.from_buffers( + type_, + n_offsets - 1, + buffers=[validity, pa.py_buffer(ring_offsets)], + children=[polygons], + ) + + def _generate_storage_types(): coord_storage = { (CoordType.SEPARATED, Dimensions.XY): _struct_fields("xy"), @@ -517,6 +778,41 @@ def _generate_storage_types(): return all_storage_types +# A shorter version of repr(spec) that matches what geoarrow-c used to do +# (to reduce mayhem on docstring updates). +def _spec_short_repr(spec, ext_name): + non_planar = spec.edge_type != EdgeType.PLANAR + interleaved = spec.coord_type == CoordType.INTERLEAVED + + if spec.dimensions == Dimensions.XYZM: + dims = "_zm" + elif spec.dimensions == Dimensions.XYZ: + dims = "_z" + elif spec.dimensions == Dimensions.XYM: + dims = "_m" + else: + dims = "" + + if non_planar and interleaved: + type_prefix = f"{spec.edge_type.name.lower()} interleaved " + elif non_planar: + type_prefix = f"{spec.edge_type.name.lower()} " + elif interleaved: + type_prefix = "interleaved " + else: + type_prefix = "" + + if spec.crs is not None: + crs = f" <{repr(spec.crs)}>" + else: + crs = "" + + if len(crs) > 40: + crs = crs[:36] + "...>" + + return f"{type_prefix}{ext_name}{dims}{crs}" + + _EXTENSION_CLASSES = { "geoarrow.wkb": WkbType, "geoarrow.wkt": WktType, diff --git a/geoarrow-types/src/geoarrow/types/type_spec.py b/geoarrow-types/src/geoarrow/types/type_spec.py index 3dcd384..b1a824c 100644 --- a/geoarrow-types/src/geoarrow/types/type_spec.py +++ b/geoarrow-types/src/geoarrow/types/type_spec.py @@ -116,13 +116,17 @@ def with_defaults(self, defaults=None): return TypeSpec.coalesce(self, defaults) def canonicalize(self): - """Canonicalize the representation of serialized types + """Canonicalize the representation of a spec If this type specification represents a serialized type, ensure that the dimensions are UNKNOWN, the geometry type is GEOMETRY, - and the coord type is UNSPECIFIED. These ensure that when a type + and the coord type is UNSPECIFIED. Conversely, when geometry + type is UNKNOWN, the geometry type can't be guessed and we + need to set the encoding to a serialized type. + + These ensure that when a type implementation needs to construct a concrete type that its - components are represented consistently for serialized types. + components are represented consistently. """ if self.encoding.is_serialized(): return self.override( @@ -130,6 +134,8 @@ def canonicalize(self): dimensions=Dimensions.UNKNOWN, coord_type=CoordType.UNSPECIFIED, ) + elif self.geometry_type == GeometryType.GEOMETRY: + return self.override(encoding=Encoding.WKB).canonicalize() else: return self diff --git a/geoarrow-types/tests/test_type_pyarrow.py b/geoarrow-types/tests/test_type_pyarrow.py index 6959f59..19861b5 100644 --- a/geoarrow-types/tests/test_type_pyarrow.py +++ b/geoarrow-types/tests/test_type_pyarrow.py @@ -1,3 +1,4 @@ +import numpy as np import pyarrow as pa import pytest @@ -5,6 +6,22 @@ from geoarrow.types import type_pyarrow +def test_wrap_array_non_exact(): + from pyarrow import compute as pc + + storage = pc.make_struct( + pa.array([1.0, 2.0, 3.0]), pa.array([3.0, 4.0, 5.0]), field_names=["x", "y"] + ) + point = gt.point().to_pyarrow() + point_ext = point.wrap_array(storage) + assert point_ext.type.storage_type.field(0).nullable is False + + storage_chunked = pa.chunked_array([storage, storage]) + point_chunked_ext = point.wrap_array(storage_chunked) + assert point_chunked_ext.type.storage_type.field(0).nullable is False + assert point_chunked_ext.num_chunks == 2 + + def test_classes_serialized(): wkt = gt.wkt().to_pyarrow() assert isinstance(wkt, type_pyarrow.WktType) @@ -268,6 +285,113 @@ def test_deserialize_infer_dimensions_interleaved(): ) +def test_point_array_from_geobuffers(): + pa_type = gt.point(dimensions=gt.Dimensions.XYZM).to_pyarrow() + arr = pa_type.from_geobuffers( + b"\xff", + np.array([1.0, 2.0, 3.0]), + np.array([4.0, 5.0, 6.0]), + np.array([7.0, 8.0, 9.0]), + np.array([10.0, 11.0, 12.0]), + ) + assert len(arr) == 3 + assert arr.type == pa_type + assert arr.storage == pa.array( + [ + {"x": 1.0, "y": 4.0, "z": 7.0, "m": 10.0}, + {"x": 2.0, "y": 5.0, "z": 8.0, "m": 11.0}, + {"x": 3.0, "y": 6.0, "z": 9.0, "m": 12.0}, + ], + pa_type.storage_type, + ) + + pa_type = gt.point(coord_type=gt.CoordType.INTERLEAVED).to_pyarrow() + arr = pa_type.from_geobuffers(None, np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])) + assert len(arr) == 3 + assert arr.storage == pa.array( + [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], pa_type.storage_type + ) + + +@pytest.mark.parametrize( + "pa_type", [gt.linestring().to_pyarrow(), gt.multipoint().to_pyarrow()] +) +def test_linestringish_array_from_geobuffers(pa_type): + arr = pa_type.from_geobuffers( + b"\xff", + np.array([0, 4], np.int32), + np.array([0.0, 1.0, 0.0, 0.0]), + np.array([0.0, 0.0, 1.0, 0.0]), + ) + assert len(arr) == 1 + assert arr.storage == pa.array( + [ + [ + {"x": 0.0, "y": 0.0}, + {"x": 1.0, "y": 0.0}, + {"x": 0.0, "y": 1.0}, + {"x": 0.0, "y": 0.0}, + ] + ], + pa_type.storage_type, + ) + + +@pytest.mark.parametrize( + "pa_type", [gt.polygon().to_pyarrow(), gt.multilinestring().to_pyarrow()] +) +def test_polygonish_array_from_geobuffers(pa_type): + arr = pa_type.from_geobuffers( + b"\xff", + np.array([0, 1], np.int32), + np.array([0, 4], np.int32), + np.array([0.0, 1.0, 0.0, 0.0]), + np.array([0.0, 0.0, 1.0, 0.0]), + ) + assert len(arr) == 1 + assert arr.storage == pa.array( + [ + [ + [ + {"x": 0.0, "y": 0.0}, + {"x": 1.0, "y": 0.0}, + {"x": 0.0, "y": 1.0}, + {"x": 0.0, "y": 0.0}, + ] + ] + ], + pa_type.storage_type, + ) + + +def test_multipolygon_array_from_geobuffers(): + pa_type = gt.multipolygon().to_pyarrow() + arr = pa_type.from_geobuffers( + b"\xff", + np.array([0, 1], np.int32), + np.array([0, 1], np.int32), + np.array([0, 4], np.int32), + np.array([0.0, 1.0, 0.0, 0.0]), + np.array([0.0, 0.0, 1.0, 0.0]), + ) + assert len(arr) == 1 + assert arr.storage == pa.array( + [ + [ + [ + [ + {"x": 0.0, "y": 0.0}, + {"x": 1.0, "y": 0.0}, + {"x": 0.0, "y": 1.0}, + {"x": 0.0, "y": 0.0}, + ] + ] + ] + ], + pa_type.storage_type, + ) + + @pytest.mark.parametrize( "spec", [