Skip to content

Commit 21cc3ef

Browse files
sdp5martinfleis
and authored
filter bunch by geometry type (#11) (#12)
* filter bunch by geometry type (#11) * add method: filter_by_geometry and enum: GeometryType * use xyzservices filter solution --------- Co-authored-by: Martin Fleischmann <[email protected]>
1 parent f310f6d commit 21cc3ef

File tree

3 files changed

+159
-4
lines changed

3 files changed

+159
-4
lines changed

doc/source/api.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,4 +33,4 @@ The database of dataset metadata is handled via custom dict-based classes.
3333

3434
.. autoclass:: Bunch
3535
:exclude-members: clear, copy, fromkeys, get, items, keys, pop, popitem, setdefault, update, values
36-
:members: flatten, query_name
36+
:members: filter, flatten, query_name

geodatasets/lib.py

Lines changed: 135 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,9 @@
88

99
import json
1010
import uuid
11+
from typing import Callable
1112

13+
GEOMETRY_TYPES = ["POINT", "LINESTRING", "POLYGON", "MIXED"]
1214
QUERY_NAME_TRANSLATION = str.maketrans({x: "" for x in "., -_/"})
1315

1416

@@ -28,7 +30,6 @@ def __dir__(self):
2830
return self.keys()
2931

3032
def _repr_html_(self, inside=False):
31-
3233
children = ""
3334
for key in self.keys():
3435
if isinstance(self[key], Dataset):
@@ -118,6 +119,139 @@ def query_name(self, name: str) -> Dataset:
118119

119120
raise ValueError(f"No matching item found for the query '{name}'.")
120121

122+
def filter(
    self,
    keyword: str | None = None,
    name: str | None = None,
    geometry_type: str | None = None,
    function: Callable[[Dataset], bool] | None = None,
) -> Bunch:
    """Return a subset of the :class:`Bunch` matching the filter conditions.

    Each :class:`Dataset` within a :class:`Bunch` is checked against one or
    more specified conditions and kept if they are satisfied or removed if at
    least one condition is not met.

    Parameters
    ----------
    keyword : str (optional)
        Condition returns ``True`` if ``keyword`` string is present in any string
        value in a :class:`Dataset` object.
        The comparison is not case sensitive.
    name : str (optional)
        Condition returns ``True`` if ``name`` string is present in
        the name attribute of :class:`Dataset` object.
        The comparison is not case sensitive.
    geometry_type : str (optional)
        Condition returns ``True`` if :meth:`Dataset.geometry_type`
        matches the ``geometry_type``.
        Possible options are ``["Point", "LineString", "Polygon", "Mixed"]``.
        The comparison is not case sensitive.
    function : callable (optional)
        Custom function taking :class:`Dataset` as an argument and returns
        bool. If ``function`` is given, other parameters are ignored.

    Returns
    -------
    filtered : Bunch

    Examples
    --------
    >>> from geodatasets import data

    You can filter all Point datasets:

    >>> points = data.filter(geometry_type="Point")

    Or all datasets with ``chicago`` in the name:

    >>> chicago_datasets = data.filter(name="chicago")

    You can use keyword search to find all datasets in a CSV format:

    >>> csv_datasets = data.filter(keyword="csv")

    You can combine multiple conditions to find datasets based with ``chicago`` in
    name of Polygon geometry type:

    >>> chicago_polygons = data.filter(name="chicago", geometry_type="Polygon")

    You can also pass custom function that takes :class:`Dataset` and returns
    boolean value. You can then find all datasets with ``nrows`` smaller than
    100:

    >>> def small_data(dataset):
    ...     if hasattr(dataset, "nrows") and dataset.nrows < 100:
    ...         return True
    ...     return False
    >>> small = data.filter(function=small_data)
    """

    def _validate(dataset, keyword, name, geometry_type):
        # Collect one boolean per requested condition; the dataset is kept
        # only if every requested condition holds. With no conditions,
        # all([]) is True, so everything passes.
        cond = []

        if keyword is not None:
            # Case-insensitive substring search over all string values.
            keyword_match = False
            for v in dataset.values():
                if isinstance(v, str) and keyword.lower() in v.lower():
                    keyword_match = True
                    break
            cond.append(keyword_match)

        if name is not None:
            # Case-insensitive substring match on the dataset name.
            name_match = False
            if name.lower() in dataset.name.lower():
                name_match = True
            cond.append(name_match)

        if geometry_type is not None:
            # Normalize the query (drop separators via the shared
            # translation table, upper-case) before comparing with the
            # dataset's declared geometry type.
            geom_type_match = False
            if (
                dataset.geometry_type.upper()
                == geometry_type.translate(QUERY_NAME_TRANSLATION).upper()
            ):
                geom_type_match = True
            cond.append(geom_type_match)

        return all(cond)

    def _filter_bunch(bunch, keyword, name, geometry_type, function):
        # Recursively walk nested Bunch objects, keeping matching Dataset
        # leaves and only non-empty sub-bunches.
        new = Bunch()
        for key, value in bunch.items():
            if isinstance(value, Dataset):
                if function is None:
                    if _validate(
                        value,
                        keyword=keyword,
                        name=name,
                        geometry_type=geometry_type,
                    ):
                        new[key] = value
                else:
                    # A custom predicate overrides the built-in conditions.
                    if function(value):
                        new[key] = value

            else:
                filtered = _filter_bunch(
                    value,
                    keyword=keyword,
                    name=name,
                    geometry_type=geometry_type,
                    function=function,
                )
                if filtered:
                    new[key] = filtered

        return new

    return _filter_bunch(
        self,
        keyword=keyword,
        name=name,
        geometry_type=geometry_type,
        function=function,
    )
254+
121255

122256
class Dataset(Bunch):
123257
"""
@@ -178,7 +312,6 @@ def _repr_html_(self, inside=False):
178312

179313

180314
def _load_json(f):
181-
182315
data = json.loads(f)
183316

184317
items = Bunch()

geodatasets/tests/test_lib.py

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import pytest
22

33
from geodatasets import Bunch, Dataset, data
4+
from geodatasets.lib import GEOMETRY_TYPES
45

56

67
@pytest.fixture
@@ -10,6 +11,7 @@ def data1():
1011
attribution="(C) geodatasets",
1112
name="my_public_data",
1213
filename="data.zip",
14+
geometry_type="Polygon",
1315
hash="qwertyuiopasdfghjklzxcvbnm1234567890",
1416
)
1517

@@ -21,6 +23,7 @@ def data2():
2123
attribution="(C) geodatasets",
2224
name="my_public_data2",
2325
filename="data2.json",
26+
geometry_type="Point",
2427
hash="qwertyuiopasdfghjklzxcvbnm1234567890",
2528
)
2629

@@ -37,7 +40,9 @@ def test_bunch(
3740

3841

3942
def test_dir(data1):
40-
assert dir(data1) == sorted(["url", "attribution", "name", "filename", "hash"])
43+
assert dir(data1) == sorted(
44+
["url", "attribution", "name", "filename", "geometry_type", "hash"]
45+
)
4146

4247

4348
def test_expect_name_url_attribution():
@@ -134,3 +139,20 @@ def test_query_name():
134139

135140
with pytest.raises(ValueError, match="No matching item found"):
136141
data.query_name("i don't exist")
142+
143+
144+
def test_filter(test_bunch):
    """Filtering by keyword, name, geometry type and a custom callable."""

    def count(**conditions):
        # Number of datasets surviving the given filter conditions.
        return len(test_bunch.filter(**conditions).flatten())

    assert count(keyword="json") == 1
    assert count(name="data2") == 1
    assert count(geometry_type="Point") == 1
    assert count(keyword="json", geometry_type="Polygon") == 0
    assert count(name="nonsense") == 0

    def custom(provider):
        return getattr(provider, "filename", None) == "data.zip"

    assert count(function=custom) == 1

0 commit comments

Comments
 (0)