Batch CURIE validation requests and fix Ubergraph TLS (#46)

ubyndr · web-flow · commit d1f760325dec · 2025-11-28T11:30:52.000Z
* Improve CURIE validation efficiency

* Use certifi bundle for Ubergraph queries

* Update .gitignore

* Document utilities and tighten type hints

* Update formatting

* Add shared pre-commit hook for isort/black
diff --git a/.githooks/pre-commit b/.githooks/pre-commit
@@ -0,0 +1,17 @@
+#!/bin/sh
+# Run isort and Black before each commit to keep formatting consistent.
+# Plain `set -eu`: `pipefail` is not supported by every /bin/sh (e.g. dash) and no pipelines are used.
+set -eu
+
+echo "[pre-commit] Running isort..."
+poetry run isort pandasaurus test
+
+echo "[pre-commit] Running black..."
+poetry run black pandasaurus test
+# Check only the formatted paths so unrelated unstaged edits do not block the commit.
+if ! git diff --quiet --exit-code -- pandasaurus test; then
+  echo "[pre-commit] Formatting changes were applied. Please review and stage them."
+  exit 1
+fi
+
+echo "[pre-commit] Formatting OK."
diff --git a/.gitignore b/.gitignore
@@ -129,3 +129,10 @@ dmypy.json
 .pyre/
 
 src/pandasaurus/main.py
+main.py
+.DS_Store
+.idea/
+docs/.DS_Store
+pandasaurus/.DS_Store
+test/.DS_Store
+test/data/.DS_Store
diff --git a/pandasaurus/curie_validator.py b/pandasaurus/curie_validator.py
@@ -1,9 +1,9 @@
 from abc import abstractmethod
-from typing import Dict, List
+from typing import Any, Dict, List, Optional, Union
 
 from pandasaurus.resources.term import Term
 from pandasaurus.utils.pandasaurus_exceptions import InvalidTerm, ObsoletedTerm
-from pandasaurus.utils.query_utils import run_sparql_query
+from pandasaurus.utils.query_utils import chunks, run_sparql_query
 from pandasaurus.utils.sparql_queries import get_label_query, get_replaced_by_query
 
 
@@ -12,6 +12,8 @@ class CurieValidator:
     replacements for obsoleted slim terms.
     """
 
+    _CURIE_CHUNK_SIZE = 90
+
     @staticmethod
     @abstractmethod
     def validate_curie_prefixes(curie_list: List[str]) -> Dict[str, bool]:
@@ -30,7 +32,7 @@ def validate_curie_prefixes(curie_list: List[str]) -> Dict[str, bool]:
         raise NotImplementedError
 
     @staticmethod
-    def validate_curie_list(curie_list: List[str]) -> Dict[str, bool]:
+    def validate_curie_list(curie_list: List[str]) -> dict[str, dict[str, Union[Optional[bool], Any]]]:
         """Reports whether the CURIEs are valid or not.
 
         Args:
@@ -46,8 +48,14 @@ def validate_curie_list(curie_list: List[str]) -> Dict[str, bool]:
             True or False status of the CURIE validation for each term
 
         """
-        query_string = get_label_query(curie_list)
-        result_dict = dict([(r.get("term"), r.get("label")) for r in run_sparql_query(query_string)])
+        result_dict: Dict[str, Optional[str]] = {}
+        for chunk in chunks(curie_list, CurieValidator._CURIE_CHUNK_SIZE):
+            # Large lists are split up to avoid massive VALUES blocks in SPARQL queries.
+            query_string = get_label_query(chunk)
+            for res in run_sparql_query(query_string):
+                term = res.get("term")
+                if term:
+                    result_dict[term] = res.get("label")
         return {
             curie: {
                 "label": result_dict.get(curie) if curie in result_dict else None,
@@ -73,8 +81,13 @@ def find_obsolete_terms(curie_list: List[str]) -> Dict:
             True or False status of the term for each term
 
         """
-        query_string = get_replaced_by_query(curie_list)
-        result_dict = dict([(r.get("term"), r) for r in run_sparql_query(query_string)])
+        result_dict: Dict[str, Dict[str, Any]] = {}
+        for chunk in chunks(curie_list, CurieValidator._CURIE_CHUNK_SIZE):
+            query_string = get_replaced_by_query(chunk)
+            for res in run_sparql_query(query_string):
+                term = res.get("term")
+                if term:
+                    result_dict[term] = res
         return result_dict
 
     @staticmethod
diff --git a/pandasaurus/query.py b/pandasaurus/query.py
@@ -256,7 +256,7 @@ def parent_enrichment(self):
         self.ancestor_enrichment(1)
 
     def synonym_lookup(self) -> pd.DataFrame:
-        """
+        """Return labels plus synonym rows for every seed term.
 
         Returns:
             A DataFrame containing labels and synonyms of the terms from the seed list.
@@ -284,7 +284,7 @@ def synonym_lookup(self) -> pd.DataFrame:
         return result_df
 
     def get_most_specific_objects(self, predicate: str, ontology: str):
-        """
+        """Return the most specific objects associated with the given predicate.
 
         Args:
             predicate: Relationship that wanted to be explored
@@ -296,6 +296,7 @@ def get_most_specific_objects(self, predicate: str, ontology: str):
                 - http://purl.obolibrary.org/obo/uberon.owl
 
         Returns:
+            DataFrame capturing subject, predicate, and object labels.
 
         """
         subject_list = [term.get_iri() for term in self._term_list]
@@ -310,7 +311,7 @@ def get_most_specific_objects(self, predicate: str, ontology: str):
         )
 
     def get_most_specific_subjects(self, predicate: str, ontology: str):
-        """
+        """Return the most specific subjects associated with the given predicate.
 
         Args:
             predicate: Relationship that wanted to be explored
@@ -322,6 +323,7 @@ def get_most_specific_subjects(self, predicate: str, ontology: str):
                 - http://purl.obolibrary.org/obo/uberon.owl
 
         Returns:
+            DataFrame capturing subject, predicate, and object labels.
 
         """
         object_list = [term.get_iri() for term in self._term_list]
@@ -354,7 +356,8 @@ def update_obsoleted_terms(self):
         """Replaces all obsoleted terms in the term list with the new term that obsoletes them."""
         [getattr(term, "update_obsoleted_term")() for term in self._term_list]
 
-    def mirror_enrichment_for_graph_generation(self, term_list: List[str]):
+    def mirror_enrichment_for_graph_generation(self, term_list: List[str]) -> None:
+        """Populate `graph_df` with all pairwise enrichment edges for graph output."""
         # TODO definitely need a refactoring later on
         s_result = []
         for s_chunk in chunks(term_list, 45):
@@ -373,7 +376,8 @@ def mirror_enrichment_for_graph_generation(self, term_list: List[str]):
             .reset_index(drop=True)
         )
 
-    def _generate_enrichment_graph(self, object_list):
+    def _generate_enrichment_graph(self, object_list: List[str]) -> None:
+        """Build the Graph representation backing the enrichment results."""
         self.mirror_enrichment_for_graph_generation(object_list)
         self.graph = GraphGenerator.generate_enrichment_graph(self.graph_df)
         self.graph = GraphGenerator.apply_transitive_reduction(self.graph, self.enriched_df["p"].unique().tolist())
diff --git a/pandasaurus/resources/term.py b/pandasaurus/resources/term.py
@@ -6,20 +6,29 @@ class Term:
 
     def __init__(
         self,
-        label: str,
+        label: Optional[str],
         iri: str,
         is_valid: bool,
         new_label: Optional[str] = None,
         new_iri: Optional[str] = None,
     ):
+        """Initialize a term instance.
+
+        Args:
+            label: Human readable label; can be None for invalid CURIEs.
+            iri: The CURIE/IRI of the term.
+            is_valid: Whether the term exists in the ontology.
+            new_label: Replacement label for obsolete terms.
+            new_iri: Replacement IRI for obsolete terms.
+        """
         self.__label = label
         self.__iri = iri
         self.__is_valid = is_valid
         self.__new_label = new_label
         self.__new_iri = new_iri
         self.__is_obsolete: bool = True if new_label and new_iri else False
 
-    def get_label(self) -> str:
+    def get_label(self) -> Optional[str]:
         """Returns term label.
 
         Returns:
@@ -46,7 +55,7 @@ def get_is_valid(self) -> bool:
         """
         return self.__is_valid
 
-    def get_new_label(self) -> str:
+    def get_new_label(self) -> Optional[str]:
         """Returns new term label of obsoleted term.
 
         Returns:
@@ -55,7 +64,7 @@ def get_new_label(self) -> str:
         """
         return self.__new_label
 
-    def get_new_iri(self) -> str:
+    def get_new_iri(self) -> Optional[str]:
         """Returns new term IRI of obsoleted term.
 
         Returns:
@@ -64,7 +73,7 @@ def get_new_iri(self) -> str:
         """
         return self.__new_iri
 
-    def get_is_obsoleted(self) -> str:
+    def get_is_obsoleted(self) -> bool:
         """Returns term obsoletion status.
 
         Returns:
diff --git a/pandasaurus/slim_manager.py b/pandasaurus/slim_manager.py
@@ -52,5 +52,6 @@ def get_slim_members(slim_list: List[str]) -> List[str]:
         ]
 
     @staticmethod
-    def _get_ontology_list():
+    def _get_ontology_list() -> List[str]:
+        """Return ontology titles available in Ubergraph."""
         return [row.get("title") for row in run_sparql_query(get_ontology_list_query())]
diff --git a/pandasaurus/utils/logging_config.py b/pandasaurus/utils/logging_config.py
@@ -1,14 +1,18 @@
 import logging
 import sys
+from logging import Logger, LogRecord
 
 
-# Create a filter to exclude ERROR log records
 class NoErrorFilter(logging.Filter):
-    def filter(self, record):
+    """Filter that drops records whose level is exactly ERROR; every other level passes."""
+
+    def filter(self, record: LogRecord) -> bool:
+        """Return True when the log record is not an ERROR level entry."""
         return record.levelno != logging.ERROR
 
 
-def configure_logger():
+def configure_logger() -> Logger:
+    """Configure and return the shared pandasaurus logger."""
     logger = logging.getLogger(__name__)
     logger.setLevel(logging.INFO)
     # logger.propagate = False
diff --git a/pandasaurus/utils/query_utils.py b/pandasaurus/utils/query_utils.py
@@ -1,23 +1,28 @@
-from typing import Iterator
+import os
+from typing import Iterable, Iterator, List, Sequence, TypeVar
 
+import certifi
 from oaklib.implementations import UbergraphImplementation
 
+# Ensure HTTPS requests trust the certifi bundle; this avoids local certificate issues.
+os.environ.setdefault("SSL_CERT_FILE", certifi.where())
+os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
+
 oi = UbergraphImplementation()
+T = TypeVar("T")
 
 
 def run_sparql_query(query: str) -> Iterator:
+    """Execute `query` on the shared Ubergraph client, passing only the prefixes that appear in the query."""
     return oi.query(query=query, prefixes=get_prefixes(query, oi.prefix_map().keys()))
 
 
-def chunks(lst, n):
-    for i in range(0, len(lst), n):
-        yield lst[i : i + n]
-
+def chunks(items: Sequence[T], size: int) -> Iterator[Sequence[T]]:
+    """Yield consecutive slices of `items`, each with at most `size` entries (the last may be shorter)."""
+    for i in range(0, len(items), size):
+        yield items[i : i + size]
 
-def get_prefixes(text, prefix_map):
-    _prefixes = []
-    for prefix in prefix_map:
-        if prefix + ":" in text:
-            _prefixes.append(prefix)
 
-    return _prefixes
+def get_prefixes(text: str, prefix_map: Iterable[str]) -> List[str]:
+    """Return the prefixes from `prefix_map` whose `<prefix>:` form occurs as a substring of `text`."""
+    return [prefix for prefix in prefix_map if f"{prefix}:" in text]
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,6 +12,7 @@ python = "^3.9"
 oaklib = "^0.6.23"
 pandas = "^2.0.1"
 rdflib = "^6.3.2"
+certifi = ">=2024.2.2"  # CalVer package: a caret (^2024.x) would cap at <2025 and block newer CA bundles
 sphinx = { version = "^7.2.6", optional = true }
 sphinx-rtd-theme = { version = "^1.3.0", optional = true }
 sphinx-copybutton = { version = "^0.5.2", optional = true }
@@ -37,4 +38,4 @@ docs = ["sphinx", "sphinx-rtd-theme", "sphinx-copybutton"]
 line-length = 120
 
 [tool.isort]
-profile = "black"
+profile = "black"
diff --git a/test/test_curie_validator.py b/test/test_curie_validator.py
@@ -30,6 +30,33 @@ def test_validate_curie_list(mocker):
     assert CurieValidator.validate_curie_list(get_validate_curie_list_data()) == get_expected_validate_curie_list()
 
 
+def test_validate_curie_list_batches_requests(mocker):
+    mocker.patch.object(CurieValidator, "_CURIE_CHUNK_SIZE", 2)
+    run_query_mock = mocker.patch(
+        "pandasaurus.curie_validator.run_sparql_query",
+        side_effect=[
+            iter(
+                [
+                    {"label": "kidney epithelial cell", "term": "CL:0002518"},
+                    {"label": "kidney cortical cell", "term": "CL:0002681"},
+                ]
+            ),
+            iter([{"label": "kidney interstitial cell", "term": "CL:1000500"}]),
+        ],
+    )
+    curie_list = ["CL:0002518", "CL:0002681", "CL:1000500", "CL:1234567"]
+
+    result = CurieValidator.validate_curie_list(curie_list)
+
+    assert run_query_mock.call_count == 2
+    assert result == {
+        "CL:0002518": {"label": "kidney epithelial cell", "valid": True},
+        "CL:0002681": {"label": "kidney cortical cell", "valid": True},
+        "CL:1000500": {"label": "kidney interstitial cell", "valid": True},
+        "CL:1234567": {"label": None, "valid": False},
+    }
+
+
 def test_find_obsolete_terms(mocker):
     mocker.patch(
         "pandasaurus.curie_validator.run_sparql_query",
@@ -41,6 +68,41 @@ def test_find_obsolete_terms(mocker):
     assert CurieValidator.find_obsolete_terms(get_find_obsolete_terms_data()) == get_expected_find_obsolete_terms()
 
 
+def test_find_obsolete_terms_batches_requests(mocker):
+    mocker.patch.object(CurieValidator, "_CURIE_CHUNK_SIZE", 2)
+    run_query_mock = mocker.patch(
+        "pandasaurus.curie_validator.run_sparql_query",
+        side_effect=[
+            iter(
+                [
+                    {
+                        "depr_status": "true",
+                        "label": "obsolete Muller cell",
+                        "new_term": "CL:0000636",
+                        "new_term_label": "Mueller cell",
+                        "term": "CL:0011107",
+                    }
+                ]
+            ),
+            iter([]),
+        ],
+    )
+    curie_list = ["CL:0011107", "CL:0000337", "CL:0002371"]
+
+    result = CurieValidator.find_obsolete_terms(curie_list)
+
+    assert run_query_mock.call_count == 2
+    assert result == {
+        "CL:0011107": {
+            "term": "CL:0011107",
+            "depr_status": "true",
+            "new_term": "CL:0000636",
+            "label": "obsolete Muller cell",
+            "new_term_label": "Mueller cell",
+        }
+    }
+
+
 def test_find_obsolete_term_replacement():
     pass
 

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -52,5 +52,6 @@ def get_slim_members(slim_list: List[str]) -> List[str]:` |
| 52 | 52 | `]` |
| 53 | 53 | |
| 54 | 54 | `@staticmethod` |
| 55 | | `- def _get_ontology_list():` |
| | 55 | `+ def _get_ontology_list() -> List[str]:` |
| | 56 | `+ """Return ontology titles available in Ubergraph."""` |
| 56 | 57 | `return [row.get("title") for row in run_sparql_query(get_ontology_list_query())]` |