Skip to content

Commit 21cc3ef

Browse files
sdp5martinfleis
and authored
filter bunch by geometry type (#11) (#12)
* filter bunch by geometry type (#11) * add method: filter_by_geometry and enum: GeometryType * use xyzservices filter solution --------- Co-authored-by: Martin Fleischmann <[email protected]>
1 parent f310f6d commit 21cc3ef

File tree

3 files changed

+159
-4
lines changed

3 files changed

+159
-4
lines changed

doc/source/api.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,4 +33,4 @@ The database of dataset metadata is handled via custom dict-based classes.
3333

3434
.. autoclass:: Bunch
3535
:exclude-members: clear, copy, fromkeys, get, items, keys, pop, popitem, setdefault, update, values
36-
:members: flatten, query_name
36+
:members: filter, flatten, query_name

geodatasets/lib.py

Lines changed: 135 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,9 @@
88

99
import json
1010
import uuid
11+
from typing import Callable
1112

13+
GEOMETRY_TYPES = ["POINT", "LINESTRING", "POLYGON", "MIXED"]
1214
QUERY_NAME_TRANSLATION = str.maketrans({x: "" for x in "., -_/"})
1315

1416

@@ -28,7 +30,6 @@ def __dir__(self):
2830
return self.keys()
2931

3032
def _repr_html_(self, inside=False):
31-
3233
children = ""
3334
for key in self.keys():
3435
if isinstance(self[key], Dataset):
@@ -118,6 +119,139 @@ def query_name(self, name: str) -> Dataset:
118119

119120
raise ValueError(f"No matching item found for the query '{name}'.")
120121

122+
def filter(
    self,
    keyword: str | None = None,
    name: str | None = None,
    geometry_type: str | None = None,
    function: Callable[[Dataset], bool] | None = None,
) -> Bunch:
    """Return a subset of the :class:`Bunch` matching the filter conditions.

    Each :class:`Dataset` within a :class:`Bunch` is checked against one or
    more specified conditions and kept if they are satisfied or removed if at
    least one condition is not met.

    Parameters
    ----------
    keyword : str (optional)
        Condition returns ``True`` if ``keyword`` string is present in any string
        value in a :class:`Dataset` object.
        The comparison is not case sensitive.
    name : str (optional)
        Condition returns ``True`` if ``name`` string is present in
        the name attribute of :class:`Dataset` object.
        The comparison is not case sensitive.
    geometry_type : str (optional)
        Condition returns ``True`` if :meth:`Dataset.geometry_type`
        matches the ``geometry_type``.
        Possible options are ``["Point", "LineString", "Polygon", "Mixed"]``.
        The comparison is not case sensitive.
    function : callable (optional)
        Custom function taking :class:`Dataset` as an argument and returns
        bool. If ``function`` is given, other parameters are ignored.

    Returns
    -------
    filtered : Bunch

    Examples
    --------
    >>> from geodatasets import data

    You can filter all Point datasets:

    >>> points = data.filter(geometry_type="Point")

    Or all datasets with ``chicago`` in the name:

    >>> chicago_datasets = data.filter(name="chicago")

    You can use keyword search to find all datasets in a CSV format:

    >>> csv_datasets = data.filter(keyword="csv")

    You can combine multiple conditions to find datasets based with ``chicago`` in
    name of Polygon geometry type:

    >>> chicago_polygons = data.filter(name="chicago", geometry_type="Polygon")

    You can also pass custom function that takes :class:`Dataset` and returns
    boolean value. You can then find all datasets with ``nrows`` smaller than
    100:

    >>> def small_data(dataset):
    ...     if hasattr(dataset, "nrows") and dataset.nrows < 100:
    ...         return True
    ...     return False
    >>> small = data.filter(function=small_data)
    """

    def _validate(dataset, keyword, name, geometry_type):
        # Collect one boolean per requested condition; the dataset is kept
        # only if every requested condition holds. With no conditions,
        # all([]) is True, so everything passes.
        cond = []

        if keyword is not None:
            # Case-insensitive substring search over all string values.
            keyword_match = False
            for v in dataset.values():
                if isinstance(v, str) and keyword.lower() in v.lower():
                    keyword_match = True
                    break
            cond.append(keyword_match)

        if name is not None:
            # Case-insensitive substring match on the dataset name.
            name_match = False
            if name.lower() in dataset.name.lower():
                name_match = True
            cond.append(name_match)

        if geometry_type is not None:
            # Normalize the query (drop separators via the shared
            # translation table, upper-case) before comparing with the
            # dataset's declared geometry type.
            geom_type_match = False
            if (
                dataset.geometry_type.upper()
                == geometry_type.translate(QUERY_NAME_TRANSLATION).upper()
            ):
                geom_type_match = True
            cond.append(geom_type_match)

        return all(cond)

    def _filter_bunch(bunch, keyword, name, geometry_type, function):
        # Recursively walk nested Bunch objects, keeping matching Dataset
        # leaves and only non-empty sub-bunches.
        new = Bunch()
        for key, value in bunch.items():
            if isinstance(value, Dataset):
                if function is None:
                    if _validate(
                        value,
                        keyword=keyword,
                        name=name,
                        geometry_type=geometry_type,
                    ):
                        new[key] = value
                else:
                    # A custom predicate overrides the built-in conditions.
                    if function(value):
                        new[key] = value

            else:
                filtered = _filter_bunch(
                    value,
                    keyword=keyword,
                    name=name,
                    geometry_type=geometry_type,
                    function=function,
                )
                if filtered:
                    new[key] = filtered

        return new

    return _filter_bunch(
        self,
        keyword=keyword,
        name=name,
        geometry_type=geometry_type,
        function=function,
    )
254+
121255

122256
class Dataset(Bunch):
123257
"""
@@ -178,7 +312,6 @@ def _repr_html_(self, inside=False):
178312

179313

180314
def _load_json(f):
181-
182315
data = json.loads(f)
183316

184317
items = Bunch()

geodatasets/tests/test_lib.py

Lines changed: 23 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import pytest
22

33
from geodatasets import Bunch, Dataset, data
4+
from geodatasets.lib import GEOMETRY_TYPES
45

56

67
@pytest.fixture
@@ -10,6 +11,7 @@ def data1():
1011
attribution="(C) geodatasets",
1112
name="my_public_data",
1213
filename="data.zip",
14+
geometry_type="Polygon",
1315
hash="qwertyuiopasdfghjklzxcvbnm1234567890",
1416
)
1517

@@ -21,6 +23,7 @@ def data2():
2123
attribution="(C) geodatasets",
2224
name="my_public_data2",
2325
filename="data2.json",
26+
geometry_type="Point",
2427
hash="qwertyuiopasdfghjklzxcvbnm1234567890",
2528
)
2629

@@ -37,7 +40,9 @@ def test_bunch(
3740

3841

3942
def test_dir(data1):
40-
assert dir(data1) == sorted(["url", "attribution", "name", "filename", "hash"])
43+
assert dir(data1) == sorted(
44+
["url", "attribution", "name", "filename", "geometry_type", "hash"]
45+
)
4146

4247

4348
def test_expect_name_url_attribution():
@@ -134,3 +139,20 @@ def test_query_name():
134139

135140
with pytest.raises(ValueError, match="No matching item found"):
136141
data.query_name("i don't exist")
142+
143+
144+
def test_filter(test_bunch):
    """Filtering by keyword, name, geometry type and a custom callable."""

    def count(**conditions):
        # Number of datasets surviving the given filter conditions.
        return len(test_bunch.filter(**conditions).flatten())

    assert count(keyword="json") == 1
    assert count(name="data2") == 1
    assert count(geometry_type="Point") == 1
    assert count(keyword="json", geometry_type="Polygon") == 0
    assert count(name="nonsense") == 0

    def custom(provider):
        return getattr(provider, "filename", None) == "data.zip"

    assert count(function=custom) == 1

0 commit comments

Comments
 (0)