diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bafa06f0..fe65c0475 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - `MultiBackendJobManager`: costs has been added as a column in tracking databases ([[#588](https://github.com/Open-EO/openeo-python-client/issues/588)]) +- When passing a path/string as `geometry` to `DataCube.aggregate_spatial()`, `DataCube.mask_polygon()`, etc.: + this is not translated automatically anymore to deprecated, non-standard `read_vector` usage. + Instead, if it is a local GeoJSON file, the GeoJSON data will be loaded directly client-side. + ([#104](https://github.com/Open-EO/openeo-python-client/issues/104), [#457](https://github.com/Open-EO/openeo-python-client/issues/457)) ### Removed diff --git a/docs/cookbook/tricks.rst b/docs/cookbook/tricks.rst index 4b9fb3fb2..c1eed20a5 100644 --- a/docs/cookbook/tricks.rst +++ b/docs/cookbook/tricks.rst @@ -80,3 +80,51 @@ For example: # `create_job` with URL to JSON file job = connection.create_job("https://jsonbin.example/my/process-graph.json") + + +.. _legacy_read_vector: + + +Legacy ``read_vector`` usage +---------------------------- + +In versions up to 0.35.0 of the openEO Python client library, +there was an old, deprecated feature in geometry handling +of :py:class:`~openeo.rest.datacube.DataCube` methods like +:py:meth:`~openeo.rest.datacube.DataCube.aggregate_spatial()` and +:py:meth:`~openeo.rest.datacube.DataCube.mask_polygon()` +where you could pass a *backend-side* path as ``geometries``, e.g.: + +.. code-block:: python + + cube = cube.aggregate_spatial( + geometries="/backend/path/to/geometries.json", + reducer="mean" + ) + +The client would handle this by automatically adding a ``read_vector`` process +in the process graph, with that path as argument, to instruct the backend to load the geometries from there. +This ``read_vector`` process was however a backend-specific, experimental and now deprecated process. +Moreover, it assumes that the user has access to (or at least knowledge of) the backend's file system, +which violates the openEO principle of abstracting away backend-specific details. + +In version 0.36.0, this old deprecated ``read_vector`` feature has been *removed*, +to allow other and better convenience functionality +when providing a string in the ``geometries`` argument: +e.g. load from a URL with standard process ``load_url``, +or load GeoJSON from a local clientside path. + +If your workflow however depends on the old, deprecated ``read_vector`` functionality, +it is possible to reconstruct that by manually adding a ``read_vector`` process in your workflow, +for example as follows: + +.. code-block:: python + + from openeo.processes import process + + cube = cube.aggregate_spatial( + geometries=process("read_vector", filename="/backend/path/to/geometries.json"), + reducer="mean" + ) + +Note that this is also works with older versions of the openEO Python client library. diff --git a/openeo/rest/datacube.py b/openeo/rest/datacube.py index fe3ef2d31..3e80edbe5 100644 --- a/openeo/rest/datacube.py +++ b/openeo/rest/datacube.py @@ -59,7 +59,7 @@ from openeo.rest.service import Service from openeo.rest.udp import RESTUserDefinedProcess from openeo.rest.vectorcube import VectorCube -from openeo.util import dict_no_none, guess_format, normalize_crs, rfc3339 +from openeo.util import dict_no_none, guess_format, load_json, normalize_crs, rfc3339 if typing.TYPE_CHECKING: # Imports for type checking only (circular import issue at runtime). @@ -609,7 +609,8 @@ def filter_spatial( (also see :py:func:`Connection.list_file_formats() `), e.g. GeoJSON, GeoParquet, etc. A ``load_url`` process will automatically be added to the process graph. - - a path (that is valid for the back-end) to a GeoJSON file. + - a path (:py:class:`str` or :py:class:`~pathlib.Path`) to a local, client-side GeoJSON file, + which will be loaded automatically to get the geometries as GeoJSON construct. - a :py:class:`~openeo.rest.vectorcube.VectorCube` instance. - a :py:class:`~openeo.api.process.Parameter` instance. @@ -619,6 +620,12 @@ def filter_spatial( .. versionchanged:: 0.36.0 Support passing a URL as ``geometries`` argument, which will be loaded with the ``load_url`` process. + + .. versionchanged:: 0.36.0 + Support for passing a backend-side path as ``geometries`` argument was removed + (also see :ref:`legacy_read_vector`). + Instead, it's possible to provide a client-side path to a GeoJSON file + (which will be loaded client-side to get the geometries as GeoJSON construct). """ valid_geojson_types = [ "Point", "MultiPoint", "LineString", "MultiLineString", @@ -1053,7 +1060,7 @@ def _merge_operator_binary_cubes( def _get_geometry_argument( self, - geometry: Union[ + argument: Union[ shapely.geometry.base.BaseGeometry, dict, str, @@ -1065,19 +1072,19 @@ def _get_geometry_argument( crs: Optional[str] = None, ) -> Union[dict, Parameter, PGNode]: """ - Convert input to a geometry as "geojson" subtype object. + Convert input to a geometry as "geojson" subtype object or vectorcube. :param crs: value that encodes a coordinate reference system. See :py:func:`openeo.util.normalize_crs` for more details about additional normalization that is applied to this argument. """ - if isinstance(geometry, Parameter): - return geometry - elif isinstance(geometry, _FromNodeMixin): - return geometry.from_node() + if isinstance(argument, Parameter): + return argument + elif isinstance(argument, _FromNodeMixin): + return argument.from_node() - if isinstance(geometry, str) and re.match(r"^https?://", geometry, flags=re.I): + if isinstance(argument, str) and re.match(r"^https?://", argument, flags=re.I): # Geometry provided as URL: load with `load_url` (with best-effort format guess) - url = urllib.parse.urlparse(geometry) + url = urllib.parse.urlparse(argument) suffix = pathlib.Path(url.path.lower()).suffix format = { ".json": "GeoJSON", @@ -1086,18 +1093,20 @@ def _get_geometry_argument( ".parquet": "Parquet", ".geoparquet": "Parquet", }.get(suffix, suffix.split(".")[-1]) - return self.connection.load_url(url=geometry, format=format) - - if isinstance(geometry, (str, pathlib.Path)): - # Assumption: `geometry` is path to polygon is a path to vector file at backend. - # TODO #104: `read_vector` is non-standard process. - # TODO: If path exists client side: load it client side? - return PGNode(process_id="read_vector", arguments={"filename": str(geometry)}) + return self.connection.load_url(url=argument, format=format) - if isinstance(geometry, shapely.geometry.base.BaseGeometry): - geometry = mapping(geometry) - if not isinstance(geometry, dict): - raise OpenEoClientException("Invalid geometry argument: {g!r}".format(g=geometry)) + if ( + isinstance(argument, (str, pathlib.Path)) + and pathlib.Path(argument).is_file() + and pathlib.Path(argument).suffix.lower() in [".json", ".geojson"] + ): + geometry = load_json(argument) + elif isinstance(argument, shapely.geometry.base.BaseGeometry): + geometry = mapping(argument) + elif isinstance(argument, dict): + geometry = argument + else: + raise OpenEoClientException(f"Invalid geometry argument: {argument!r}") if geometry.get("type") not in valid_geojson_types: raise OpenEoClientException("Invalid geometry type {t!r}, must be one of {s}".format( @@ -1147,7 +1156,8 @@ def aggregate_spatial( (also see :py:func:`Connection.list_file_formats() `), e.g. GeoJSON, GeoParquet, etc. A ``load_url`` process will automatically be added to the process graph. - - a path (that is valid for the back-end) to a GeoJSON file. + - a path (:py:class:`str` or :py:class:`~pathlib.Path`) to a local, client-side GeoJSON file, + which will be loaded automatically to get the geometries as GeoJSON construct. - a :py:class:`~openeo.rest.vectorcube.VectorCube` instance. - a :py:class:`~openeo.api.process.Parameter` instance. @@ -1177,6 +1187,12 @@ def aggregate_spatial( .. versionchanged:: 0.36.0 Support passing a URL as ``geometries`` argument, which will be loaded with the ``load_url`` process. + + .. versionchanged:: 0.36.0 + Support for passing a backend-side path as ``geometries`` argument was removed + (also see :ref:`legacy_read_vector`). + Instead, it's possible to provide a client-side path to a GeoJSON file + (which will be loaded client-side to get the geometries as GeoJSON construct). """ valid_geojson_types = [ "Point", "MultiPoint", "LineString", "MultiLineString", @@ -1502,7 +1518,8 @@ def apply_polygon( (also see :py:func:`Connection.list_file_formats() `), e.g. GeoJSON, GeoParquet, etc. A ``load_url`` process will automatically be added to the process graph. - - a path (that is valid for the back-end) to a GeoJSON file. + - a path (:py:class:`str` or :py:class:`~pathlib.Path`) to a local, client-side GeoJSON file, + which will be loaded automatically to get the geometries as GeoJSON construct. - a :py:class:`~openeo.rest.vectorcube.VectorCube` instance. - a :py:class:`~openeo.api.process.Parameter` instance. @@ -1519,6 +1536,12 @@ def apply_polygon( .. versionchanged:: 0.36.0 Support passing a URL as ``geometries`` argument, which will be loaded with the ``load_url`` process. + + .. versionchanged:: 0.36.0 + Support for passing a backend-side path as ``geometries`` argument was removed + (also see :ref:`legacy_read_vector`). + Instead, it's possible to provide a client-side path to a GeoJSON file + (which will be loaded client-side to get the geometries as GeoJSON construct). """ # TODO drop support for legacy `polygons` argument: # remove `kwargs, remove default `None` value for `geometries` and `process` @@ -2011,7 +2034,8 @@ def mask_polygon( (also see :py:func:`Connection.list_file_formats() `), e.g. GeoJSON, GeoParquet, etc. A ``load_url`` process will automatically be added to the process graph. - - a path (that is valid for the back-end) to a GeoJSON file. + - a path (:py:class:`str` or :py:class:`~pathlib.Path`) to a local, client-side GeoJSON file, + which will be loaded automatically to get the geometries as GeoJSON construct. - a :py:class:`~openeo.rest.vectorcube.VectorCube` instance. - a :py:class:`~openeo.api.process.Parameter` instance. @@ -2024,6 +2048,12 @@ def mask_polygon( .. versionchanged:: 0.36.0 Support passing a URL as ``geometries`` argument, which will be loaded with the ``load_url`` process. + + .. versionchanged:: 0.36.0 + Support for passing a backend-side path as ``geometries`` argument was removed + (also see :ref:`legacy_read_vector`). + Instead, it's possible to provide a client-side path to a GeoJSON file + (which will be loaded client-side to get the geometries as GeoJSON construct). """ valid_geojson_types = ["Polygon", "MultiPolygon", "GeometryCollection", "Feature", "FeatureCollection"] mask = self._get_geometry_argument(mask, valid_geojson_types=valid_geojson_types, crs=srs) diff --git a/tests/data/1.0.0/aggregate_zonal_path.json b/tests/data/1.0.0/aggregate_zonal_path.json index 29aaf784c..3c89eef6c 100644 --- a/tests/data/1.0.0/aggregate_zonal_path.json +++ b/tests/data/1.0.0/aggregate_zonal_path.json @@ -21,11 +21,11 @@ } } }, - "readvector1": { - "process_id": "read_vector", + "loadurl1": { + "process_id": "load_url", "arguments": { - "filename": "/some/path/to/GeometryCollection.geojson" - } + "url": "https://example.com/geometries.geojson", + "format": "GeoJSON"} }, "aggregatespatial1": { "process_id": "aggregate_spatial", @@ -34,7 +34,7 @@ "from_node": "filterbbox1" }, "geometries": { - "from_node": "readvector1" + "from_node": "loadurl1" }, "reducer": { "process_graph": { diff --git a/tests/data/geojson/polygon02.json b/tests/data/geojson/polygon02.json new file mode 100644 index 000000000..1c9103d13 --- /dev/null +++ b/tests/data/geojson/polygon02.json @@ -0,0 +1,23 @@ +{ + "type": "Polygon", + "coordinates": [ + [ + [ + 3, + 50 + ], + [ + 4, + 50 + ], + [ + 4, + 51 + ], + [ + 3, + 50 + ] + ] + ] +} diff --git a/tests/rest/datacube/test_datacube100.py b/tests/rest/datacube/test_datacube100.py index e0a2b289a..052ed78ba 100644 --- a/tests/rest/datacube/test_datacube100.py +++ b/tests/rest/datacube/test_datacube100.py @@ -365,6 +365,25 @@ def test_filter_spatial(con100: Connection): } +@pytest.mark.parametrize("path_factory", [str, pathlib.Path]) +def test_filter_spatial_local_path(con100: Connection, path_factory, test_data): + path = path_factory(test_data.get_path("geojson/polygon02.json")) + cube = con100.load_collection("S2") + masked = cube.filter_spatial(geometries=path) + assert get_download_graph(masked, drop_save_result=True, drop_load_collection=True) == { + "filterspatial1": { + "process_id": "filter_spatial", + "arguments": { + "data": {"from_node": "loadcollection1"}, + "geometries": { + "type": "Polygon", + "coordinates": [[[3, 50], [4, 50], [4, 51], [3, 50]]], + }, + }, + } + } + + @pytest.mark.parametrize( ["url", "expected_format"], [ @@ -659,6 +678,38 @@ def test_aggregate_spatial_geometry_url(con100: Connection, url, expected_format } +@pytest.mark.parametrize("path_factory", [str, pathlib.Path]) +def test_aggregate_spatial_geometry_local_path(con100: Connection, path_factory, test_data): + cube = con100.load_collection("S2") + path = path_factory(test_data.get_path("geojson/polygon02.json")) + result = cube.aggregate_spatial(geometries=path, reducer="mean") + assert get_download_graph(result, drop_save_result=True, drop_load_collection=True) == { + "aggregatespatial1": { + "process_id": "aggregate_spatial", + "arguments": { + "data": {"from_node": "loadcollection1"}, + "geometries": {"type": "Polygon", "coordinates": [[[3, 50], [4, 50], [4, 51], [3, 50]]]}, + "reducer": { + "process_graph": { + "mean1": { + "process_id": "mean", + "arguments": {"data": {"from_parameter": "data"}}, + "result": True, + } + } + }, + }, + }, + } + + +def test_aggregate_spatial_geometry_local_path_invalid(con100: Connection): + path = "nope/invalid:path%here.json" + cube = con100.load_collection("S2") + with pytest.raises(OpenEoClientException, match="Invalid geometry argument"): + _ = cube.aggregate_spatial(geometries=path, reducer="mean") + + def test_aggregate_spatial_window(con100: Connection): img = con100.load_collection("S2") size = [5, 3] @@ -827,24 +878,29 @@ def test_mask_polygon_parameter(con100: Connection): } -def test_mask_polygon_path(con100: Connection): - img = con100.load_collection("S2") - masked = img.mask_polygon(mask="path/to/polygon.json") - assert sorted(masked.flat_graph().keys()) == ["loadcollection1", "maskpolygon1", "readvector1"] - assert masked.flat_graph()["maskpolygon1"] == { - "process_id": "mask_polygon", - "arguments": { - "data": {"from_node": "loadcollection1"}, - "mask": {"from_node": "readvector1"}, +@pytest.mark.parametrize("path_factory", [str, pathlib.Path]) +def test_mask_polygon_geometry_local_path(con100: Connection, path_factory, test_data): + path = path_factory(test_data.get_path("geojson/polygon02.json")) + cube = con100.load_collection("S2") + masked = cube.mask_polygon(mask=path) + assert get_download_graph(masked, drop_save_result=True, drop_load_collection=True) == { + "maskpolygon1": { + "process_id": "mask_polygon", + "arguments": { + "data": {"from_node": "loadcollection1"}, + "mask": {"type": "Polygon", "coordinates": [[[3, 50], [4, 50], [4, 51], [3, 50]]]}, + }, }, - "result": True, - } - assert masked.flat_graph()["readvector1"] == { - "process_id": "read_vector", - "arguments": {"filename": "path/to/polygon.json"}, } +def test_mask_polygon_geometry_local_path_invalid(con100: Connection): + path = "nope/invalid:path%here.json" + cube = con100.load_collection("S2") + with pytest.raises(OpenEoClientException, match="Invalid geometry argument"): + _ = cube.mask_polygon(mask=path) + + @pytest.mark.parametrize("get_geometries", [ lambda c: PGNode("load_vector", url="https://geo.test/features.json"), lambda c: openeo.processes.process("load_vector", url="https://geo.test/features.json"), @@ -1583,18 +1639,19 @@ def test_chunk_polygon_parameter(con100: Connection): } -def test_chunk_polygon_path(con100: Connection): +@pytest.mark.parametrize("path_factory", [str, pathlib.Path]) +def test_chunk_polygon_path(con100: Connection, test_data, path_factory): + path = path_factory(test_data.get_path("geojson/polygon02.json")) cube = con100.load_collection("S2") process = lambda data: data.run_udf(udf="myfancycode", runtime="Python") with pytest.warns(UserDeprecationWarning, match="Use `apply_polygon`"): - result = cube.chunk_polygon(chunks="path/to/polygon.json", process=process) + result = cube.chunk_polygon(chunks=path, process=process) assert get_download_graph(result, drop_save_result=True, drop_load_collection=True) == { - "readvector1": {"process_id": "read_vector", "arguments": {"filename": "path/to/polygon.json"}}, "chunkpolygon1": { "process_id": "chunk_polygon", "arguments": { "data": {"from_node": "loadcollection1"}, - "chunks": {"from_node": "readvector1"}, + "chunks": {"type": "Polygon", "coordinates": [[[3, 50], [4, 50], [4, 51], [3, 50]]]}, "process": { "process_graph": { "runudf1": { @@ -1797,21 +1854,17 @@ def test_apply_polygon_parameter(con100: Connection, geometries_argument, geomet ("geometries", "geometries"), ], ) -def test_apply_polygon_path(con100: Connection, geometries_argument, geometries_parameter): +def test_apply_polygon_local_path(con100: Connection, geometries_argument, geometries_parameter, test_data): + path = test_data.get_path("geojson/polygon02.json") cube = con100.load_collection("S2") process = UDF(code="myfancycode", runtime="Python") - result = cube.apply_polygon(**{geometries_argument: "path/to/polygon.json"}, process=process) + result = cube.apply_polygon(**{geometries_argument: path}, process=process) assert get_download_graph(result, drop_save_result=True, drop_load_collection=True) == { - "readvector1": { - # TODO #104 #457 get rid of non-standard read_vector - "process_id": "read_vector", - "arguments": {"filename": "path/to/polygon.json"}, - }, "applypolygon1": { "process_id": "apply_polygon", "arguments": { "data": {"from_node": "loadcollection1"}, - geometries_parameter: {"from_node": "readvector1"}, + geometries_parameter: {"type": "Polygon", "coordinates": [[[3, 50], [4, 50], [4, 51], [3, 50]]]}, "process": { "process_graph": { "runudf1": { @@ -1830,6 +1883,14 @@ def test_apply_polygon_path(con100: Connection, geometries_argument, geometries_ } +def test_apply_polygon_local_path_invalid(con100: Connection): + path = "nope/invalid:path%here.json" + cube = con100.load_collection("S2") + process = UDF(code="myfancycode", runtime="Python") + with pytest.raises(OpenEoClientException, match="Invalid geometry argument"): + _ = cube.apply_polygon(geometries=path, process=process) + + @pytest.mark.parametrize( ["geometries_argument", "geometries_parameter"], [ diff --git a/tests/rest/datacube/test_zonal_stats.py b/tests/rest/datacube/test_zonal_stats.py index bac0ccac2..90ce88c99 100644 --- a/tests/rest/datacube/test_zonal_stats.py +++ b/tests/rest/datacube/test_zonal_stats.py @@ -24,21 +24,13 @@ def test_aggregate_spatial(connection, api_version, reducer, test_data): assert get_execute_graph(res) == test_data.load_json("%s/aggregate_zonal_polygon.json" % api_version) -def test_polygon_timeseries_path(connection, api_version, test_data): - res = ( - connection.load_collection('S2') - .filter_bbox(west=3, east=6, north=52, south=50) - .polygonal_mean_timeseries(polygon="/some/path/to/GeometryCollection.geojson") - ) - assert get_execute_graph(res) == test_data.load_json("%s/aggregate_zonal_path.json" % api_version) - @pytest.mark.parametrize("reducer", ["mean", openeo.processes.mean, lambda x: x.mean()]) -def test_aggregate_spatial_read_vector(connection, api_version, reducer, test_data): +def test_aggregate_spatial_with_geometry_url(connection, api_version, reducer, test_data): res = ( connection.load_collection("S2") - .filter_bbox(3, 6, 52, 50) - .aggregate_spatial(geometries="/some/path/to/GeometryCollection.geojson", reducer=reducer) + .filter_bbox(3, 6, 52, 50) + .aggregate_spatial(geometries="https://example.com/geometries.geojson", reducer=reducer) ) assert get_execute_graph(res) == test_data.load_json("%s/aggregate_zonal_path.json" % api_version)