Skip to content

Commit 42ed5a6

Browse files
authored
Merge pull request #267 from NOAA-GSL/staging
extracting human-readable strings from heterogeneous API data
2 parents fe5ba5d + 6fb57a8 commit 42ed5a6

File tree

3 files changed

+183
-12
lines changed

3 files changed

+183
-12
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "zyra"
3-
version = "0.1.46"
3+
version = "0.1.47"
44
description = "A tool to ingest data from various sources and formats, create imagery or video based on that data, and send the results to various locations for dissemination."
55
authors = ["Eric Hackathorn <eric.j.hackathorn@noaa.gov>"]
66
include = [

src/zyra/connectors/discovery/api_search.py

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -299,22 +299,78 @@ def _parse_json_body(arg: str | None) -> Any | None:
299299
return None
300300

301301

302+
def _extract_name(value: Any) -> str | None:
303+
"""Extract a human-readable string from *value*.
304+
305+
Used to normalize heterogeneous fields (names, descriptions, links)
306+
into a single string representation.
307+
308+
- ``str`` → returned as-is.
309+
- ``dict`` (or any ``Mapping``) → drills into common label-like keys
310+
(``name``, ``title``, ``id``, ``path``, ``label``, ``url``,
311+
``href``, ``uri``, ``link``) and returns the first value that is
312+
not ``None`` and not the empty string, as a string. Nested dicts
313+
are handled recursively; if a nested dict yields no recognizable
314+
value, later keys are still tried.
315+
- Other types → converted via ``str()`` as a last resort.
316+
- ``None`` → returns ``None``.
317+
"""
318+
if value is None:
319+
return None
320+
if isinstance(value, str):
321+
return value
322+
if isinstance(value, Mapping):
323+
for key in (
324+
"name",
325+
"title",
326+
"id",
327+
"path",
328+
"label",
329+
"url",
330+
"href",
331+
"uri",
332+
"link",
333+
):
334+
v = value.get(key)
335+
if v is None or v == "":
336+
continue
337+
if isinstance(v, str):
338+
return v
339+
if isinstance(v, Mapping):
340+
nested = _extract_name(v)
341+
if nested not in (None, ""):
342+
return nested
343+
# Nested dict had no recognizable sub-key; keep searching
344+
continue
345+
return str(v)
346+
# No recognizable sub-key in this mapping
347+
return None
348+
return str(value)
349+
350+
351+
def _first_extractable(item: dict[str, Any], keys: tuple[str, ...]) -> str | None:
    """Return the first non-empty string extractable from *item* under *keys*.

    Candidate keys are tried in the order given. A key whose value is
    ``None`` (or missing) is skipped without extraction; a value that
    extracts to ``None`` or ``""`` falls through to the next key. Returns
    ``None`` when no candidate produces a usable string.
    """
    # Lazy pipeline: each value is looked up and extracted only when the
    # previous candidates failed, matching a plain early-return loop.
    extracted = (
        _extract_name(raw)
        for raw in map(item.get, keys)
        if raw is not None
    )
    return next((text for text in extracted if text not in (None, "")), None)
361+
362+
302363
def _normalize_item(item: dict[str, Any], source_host: str) -> dict[str, Any]:
303364
"""Map remote item to unified row schema.
304365
305366
Attempts common fields used by Zyra and similar APIs; uses best-effort
306-
heuristics for generic sources.
367+
heuristics for generic sources. Iterates candidate keys for each field
368+
so that an unresolvable value (e.g. a nested dict with no known sub-keys)
369+
falls through to the next candidate rather than producing an empty string.
307370
"""
308-
# Preferred keys (Zyra API shape)
309-
name = (
310-
item.get("name") or item.get("title") or item.get("dataset") or item.get("id")
311-
)
312-
desc = item.get("description") or item.get("abstract") or None
313-
link = item.get("uri") or item.get("link") or item.get("url") or None
314-
# Strings only
315-
name_s = str(name) if name is not None else None
316-
desc_s = str(desc) if desc is not None else None
317-
link_s = str(link) if link is not None else None
371+
name_s = _first_extractable(item, ("name", "title", "dataset", "id"))
372+
desc_s = _first_extractable(item, ("description", "abstract"))
373+
link_s = _first_extractable(item, ("uri", "link", "href", "url"))
318374
return {
319375
"source": source_host,
320376
"dataset": name_s or "",

tests/connectors/test_search_api.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,3 +392,118 @@ def test_cli_api_openapi_diagnostics(monkeypatch, capsys):
392392
assert rc == 0
393393
out = capsys.readouterr().out
394394
assert "suggest --param for: q,limit" in out
395+
396+
397+
def test_extract_name_string_passthrough():
    """Strings pass through, ``None`` stays ``None``, other scalars are stringified."""
    from zyra.connectors.discovery.api_search import _extract_name

    extract = _extract_name
    assert extract("hello") == "hello"
    assert extract(None) is None
    assert extract(42) == "42"
403+
404+
405+
def test_extract_name_drills_into_dict():
    """Dict inputs resolve through label-like keys, including nested dicts."""
    from zyra.connectors.discovery.api_search import _extract_name

    cases = [
        ({"name": "Synoptic-UAS", "path": "data/Synoptic-UAS"}, "Synoptic-UAS"),
        ({"path": "data/Synoptic-UAS"}, "data/Synoptic-UAS"),
        ({"title": "My Title"}, "My Title"),
        ({"id": "abc123"}, "abc123"),
        ({"label": "Atmospheric CO2"}, "Atmospheric CO2"),
        # Non-string values in name-like keys should be stringified
        ({"id": 123}, "123"),
        ({"name": 42}, "42"),
        # Nested dict in a name-like key is recursively drilled into
        ({"name": {"id": "x"}}, "x"),
        ({"name": {"title": "inner", "id": "y"}}, "inner"),
    ]
    for payload, expected in cases:
        assert _extract_name(payload) == expected
422+
423+
424+
def test_normalize_item_nested_dataset_dict():
    """A dict-valued ``dataset`` field resolves to its inner ``name``."""
    from zyra.connectors.discovery.api_search import _normalize_item

    nested_dataset = {"path": "data/Synoptic-UAS", "name": "Synoptic-UAS"}
    normalized = _normalize_item(
        {"dataset": nested_dataset, "score": 1},
        "example.com",
    )
    assert normalized["dataset"] == "Synoptic-UAS"
    assert normalized["source"] == "example.com"
434+
435+
436+
def test_normalize_item_nested_dataset_dict_no_name():
    """Without an inner ``name``, the nested ``path`` is used for the dataset."""
    from zyra.connectors.discovery.api_search import _normalize_item

    normalized = _normalize_item({"dataset": {"path": "data/foo"}}, "host")
    assert normalized["dataset"] == "data/foo"
442+
443+
444+
def test_normalize_item_nested_description_and_link():
    """Dict-valued ``description`` and ``uri`` fields are flattened to strings."""
    from zyra.connectors.discovery.api_search import _normalize_item

    payload = dict(
        name="DS1",
        description={"title": "A long description object"},
        uri={"path": "http://example.com/ds1"},
    )
    normalized = _normalize_item(payload, "host")
    assert normalized["dataset"] == "DS1"
    assert normalized["description"] == "A long description object"
    assert normalized["link"] == "http://example.com/ds1"
456+
457+
458+
def test_extract_name_url_like_dict_keys():
    """Each URL-like key on its own is enough to resolve a value."""
    from zyra.connectors.discovery.api_search import _extract_name

    url_like = (
        ("url", "http://x/data"),
        ("href", "http://x/ref"),
        ("uri", "http://x/uri"),
        ("link", "http://x/link"),
    )
    for key, target in url_like:
        assert _extract_name({key: target}) == target
465+
466+
467+
def test_normalize_item_href_link():
    """A top-level ``href`` is accepted as the row's link."""
    from zyra.connectors.discovery.api_search import _normalize_item

    normalized = _normalize_item(
        {"name": "DS", "href": "http://example.com/ds"},
        "host",
    )
    assert normalized["link"] == "http://example.com/ds"
473+
474+
475+
def test_extract_name_mapping_type():
    """``OrderedDict`` inputs are handled the same way as plain dicts."""
    from collections import OrderedDict

    from zyra.connectors.discovery.api_search import _extract_name

    by_name = OrderedDict(name="ordered")
    by_id = OrderedDict(id=99)
    assert _extract_name(by_name) == "ordered"
    assert _extract_name(by_id) == "99"
482+
483+
484+
def test_extract_name_skips_unresolvable_nested_dict():
    """An unresolvable nested dict is skipped in favour of later keys."""
    from zyra.connectors.discovery.api_search import _extract_name

    opaque = {"foo": "bar"}

    # name is a dict with no recognizable keys; should skip to title
    with_fallback = {"name": opaque, "title": "Fallback"}
    assert _extract_name(with_fallback) == "Fallback"

    # All keys are unresolvable nested dicts → returns None
    without_fallback = {"name": opaque}
    assert _extract_name(without_fallback) is None
491+
492+
493+
def test_normalize_item_falls_back_on_unresolvable_field():
    """Each row field falls back to its next candidate key when the first is unresolvable."""
    from zyra.connectors.discovery.api_search import _normalize_item

    scenarios = (
        # name is unresolvable dict → should fall back to title
        (
            {"name": {"foo": "bar"}, "title": "Fallback Title", "uri": "http://x"},
            "dataset",
            "Fallback Title",
        ),
        # uri is unresolvable dict → should fall back to url
        (
            {"name": "DS", "uri": {"foo": "bar"}, "url": "http://example.com"},
            "link",
            "http://example.com",
        ),
        # description is unresolvable dict → should fall back to abstract
        (
            {"name": "DS", "description": {"foo": "bar"}, "abstract": "A summary"},
            "description",
            "A summary",
        ),
    )
    for payload, field, expected in scenarios:
        normalized = _normalize_item(payload, "host")
        assert normalized[field] == expected

0 commit comments

Comments
 (0)