Merge pull request #267 from NOAA-GSL/staging

Hackshaven · web-flow · commit 42ed5a609123 · 2026-03-11T15:15:08.000-06:00
extracting human-readable strings from heterogeneous API data
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "zyra"
-version = "0.1.46"
+version = "0.1.47"
 description = "A tool to ingest data from various sources and formats, create imagery or video based on that data, and send the results to various locations for dissemination."
 authors = ["Eric Hackathorn <eric.j.hackathorn@noaa.gov>"]
 include = [
diff --git a/src/zyra/connectors/discovery/api_search.py b/src/zyra/connectors/discovery/api_search.py
@@ -299,22 +299,78 @@ def _parse_json_body(arg: str | None) -> Any | None:
         return None
 
 
+def _extract_name(value: Any) -> str | None:
+    """Extract a human-readable string from *value*.
+
+    Used to normalize heterogeneous fields (names, descriptions, links)
+    into a single string representation.
+
+    - ``str`` → returned as-is.
+    - ``dict`` (or any ``Mapping``) → drills into common label-like keys
+      (``name``, ``title``, ``id``, ``path``, ``label``, ``url``,
+      ``href``, ``uri``, ``link``) and returns the first value that is
+      not ``None`` and not the empty string, as a string.  Nested dicts
+      are handled recursively; if a nested dict yields no recognizable
+      value, later keys are still tried.
+    - Other types → converted via ``str()`` as a last resort.
+    - ``None`` → returns ``None``.
+    """
+    if value is None:
+        return None
+    if isinstance(value, str):
+        return value
+    if isinstance(value, Mapping):
+        for key in (
+            "name",
+            "title",
+            "id",
+            "path",
+            "label",
+            "url",
+            "href",
+            "uri",
+            "link",
+        ):
+            v = value.get(key)
+            if v is None or v == "":
+                continue
+            if isinstance(v, str):
+                return v
+            if isinstance(v, Mapping):
+                nested = _extract_name(v)
+                if nested not in (None, ""):
+                    return nested
+                # Nested dict had no recognizable sub-key; keep searching
+                continue
+            return str(v)
+        # No candidate key yielded a usable (non-None, non-empty) value
+        return None
+    return str(value)
+
+
+def _first_extractable(item: dict[str, Any], keys: tuple[str, ...]) -> str | None:
+    """Return the first non-empty string extracted from *item* via *keys*, or None."""
+    for key in keys:
+        raw = item.get(key)
+        if raw is None:
+            continue
+        result = _extract_name(raw)
+        if result not in (None, ""):
+            return result
+    return None
+
+
 def _normalize_item(item: dict[str, Any], source_host: str) -> dict[str, Any]:
     """Map remote item to unified row schema.
 
     Attempts common fields used by Zyra and similar APIs; uses best-effort
-    heuristics for generic sources.
+    heuristics for generic sources.  Iterates candidate keys for each field
+    so that an unresolvable value (e.g. a nested dict with no known sub-keys)
+    falls through to the next candidate rather than producing an empty string.
     """
-    # Preferred keys (Zyra API shape)
-    name = (
-        item.get("name") or item.get("title") or item.get("dataset") or item.get("id")
-    )
-    desc = item.get("description") or item.get("abstract") or None
-    link = item.get("uri") or item.get("link") or item.get("url") or None
-    # Strings only
-    name_s = str(name) if name is not None else None
-    desc_s = str(desc) if desc is not None else None
-    link_s = str(link) if link is not None else None
+    name_s = _first_extractable(item, ("name", "title", "dataset", "id"))
+    desc_s = _first_extractable(item, ("description", "abstract"))
+    link_s = _first_extractable(item, ("uri", "link", "href", "url"))
     return {
         "source": source_host,
         "dataset": name_s or "",
diff --git a/tests/connectors/test_search_api.py b/tests/connectors/test_search_api.py
@@ -392,3 +392,118 @@ def test_cli_api_openapi_diagnostics(monkeypatch, capsys):
     assert rc == 0
     out = capsys.readouterr().out
     assert "suggest --param for: q,limit" in out
+
+
+def test_extract_name_string_passthrough():
+    from zyra.connectors.discovery.api_search import _extract_name
+
+    assert _extract_name("hello") == "hello"
+    assert _extract_name(None) is None
+    assert _extract_name(42) == "42"
+
+
+def test_extract_name_drills_into_dict():
+    from zyra.connectors.discovery.api_search import _extract_name
+
+    assert (
+        _extract_name({"name": "Synoptic-UAS", "path": "data/Synoptic-UAS"})
+        == "Synoptic-UAS"
+    )
+    assert _extract_name({"path": "data/Synoptic-UAS"}) == "data/Synoptic-UAS"
+    assert _extract_name({"title": "My Title"}) == "My Title"
+    assert _extract_name({"id": "abc123"}) == "abc123"
+    assert _extract_name({"label": "Atmospheric CO2"}) == "Atmospheric CO2"
+    # Non-string values in name-like keys should be stringified
+    assert _extract_name({"id": 123}) == "123"
+    assert _extract_name({"name": 42}) == "42"
+    # Nested dict in a name-like key is recursively drilled into
+    assert _extract_name({"name": {"id": "x"}}) == "x"
+    assert _extract_name({"name": {"title": "inner", "id": "y"}}) == "inner"
+
+
+def test_normalize_item_nested_dataset_dict():
+    from zyra.connectors.discovery.api_search import _normalize_item
+
+    item = {
+        "dataset": {"path": "data/Synoptic-UAS", "name": "Synoptic-UAS"},
+        "score": 1,
+    }
+    row = _normalize_item(item, "example.com")
+    assert row["dataset"] == "Synoptic-UAS"
+    assert row["source"] == "example.com"
+
+
+def test_normalize_item_nested_dataset_dict_no_name():
+    from zyra.connectors.discovery.api_search import _normalize_item
+
+    item = {"dataset": {"path": "data/foo"}}
+    row = _normalize_item(item, "host")
+    assert row["dataset"] == "data/foo"
+
+
+def test_normalize_item_nested_description_and_link():
+    from zyra.connectors.discovery.api_search import _normalize_item
+
+    item = {
+        "name": "DS1",
+        "description": {"title": "A long description object"},
+        "uri": {"path": "http://example.com/ds1"},
+    }
+    row = _normalize_item(item, "host")
+    assert row["dataset"] == "DS1"
+    assert row["description"] == "A long description object"
+    assert row["link"] == "http://example.com/ds1"
+
+
+def test_extract_name_url_like_dict_keys():
+    from zyra.connectors.discovery.api_search import _extract_name
+
+    assert _extract_name({"url": "http://x/data"}) == "http://x/data"
+    assert _extract_name({"href": "http://x/ref"}) == "http://x/ref"
+    assert _extract_name({"uri": "http://x/uri"}) == "http://x/uri"
+    assert _extract_name({"link": "http://x/link"}) == "http://x/link"
+
+
+def test_normalize_item_href_link():
+    from zyra.connectors.discovery.api_search import _normalize_item
+
+    item = {"name": "DS", "href": "http://example.com/ds"}
+    row = _normalize_item(item, "host")
+    assert row["link"] == "http://example.com/ds"
+
+
+def test_extract_name_mapping_type():
+    from collections import OrderedDict
+
+    from zyra.connectors.discovery.api_search import _extract_name
+
+    assert _extract_name(OrderedDict([("name", "ordered")])) == "ordered"
+    assert _extract_name(OrderedDict([("id", 99)])) == "99"
+
+
+def test_extract_name_skips_unresolvable_nested_dict():
+    from zyra.connectors.discovery.api_search import _extract_name
+
+    # name is a dict with no recognizable keys; should skip to title
+    assert _extract_name({"name": {"foo": "bar"}, "title": "Fallback"}) == "Fallback"
+    # All keys are unresolvable nested dicts → returns None
+    assert _extract_name({"name": {"foo": "bar"}}) is None
+
+
+def test_normalize_item_falls_back_on_unresolvable_field():
+    from zyra.connectors.discovery.api_search import _normalize_item
+
+    # name is unresolvable dict → should fall back to title
+    item = {"name": {"foo": "bar"}, "title": "Fallback Title", "uri": "http://x"}
+    row = _normalize_item(item, "host")
+    assert row["dataset"] == "Fallback Title"
+
+    # uri is unresolvable dict → should fall back to url
+    item2 = {"name": "DS", "uri": {"foo": "bar"}, "url": "http://example.com"}
+    row2 = _normalize_item(item2, "host")
+    assert row2["link"] == "http://example.com"
+
+    # description is unresolvable dict → should fall back to abstract
+    item3 = {"name": "DS", "description": {"foo": "bar"}, "abstract": "A summary"}
+    row3 = _normalize_item(item3, "host")
+    assert row3["description"] == "A summary"