Skip to content

Commit d1f7603

Browse files
authored
Batch CURIE validation requests and fix Ubergraph TLS (#46)
* Improve CURIE validation efficiency
* Use certifi bundle for Ubergraph queries
* Update .gitignore
* Document utilities and tighten type hints
* Update formatting
* Add shared pre-commit hook for isort/black
1 parent 3a293fc commit d1f7603

File tree

11 files changed

+157
-34
lines changed

11 files changed

+157
-34
lines changed

.githooks/pre-commit

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/sh
2+
# Run Black and isort before each commit to keep formatting consistent.
3+
4+
set -eu  # no 'pipefail': not supported by every /bin/sh (e.g. dash), and this hook has no pipelines
5+
6+
echo "[pre-commit] Running isort..."
7+
poetry run isort pandasaurus test
8+
9+
echo "[pre-commit] Running black..."
10+
poetry run black pandasaurus test
11+
12+
if ! git diff --quiet --exit-code; then
13+
echo "[pre-commit] Formatting changes were applied. Please review and stage them."
14+
exit 1
15+
fi
16+
17+
echo "[pre-commit] Formatting OK."

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,10 @@ dmypy.json
129129
.pyre/
130130

131131
src/pandasaurus/main.py
132+
main.py
133+
.DS_Store
134+
.idea/
135+
docs/.DS_Store
136+
pandasaurus/.DS_Store
137+
test/.DS_Store
138+
test/data/.DS_Store

pandasaurus/curie_validator.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
from abc import abstractmethod
2-
from typing import Dict, List
2+
from typing import Any, Dict, List, Optional, Union
33

44
from pandasaurus.resources.term import Term
55
from pandasaurus.utils.pandasaurus_exceptions import InvalidTerm, ObsoletedTerm
6-
from pandasaurus.utils.query_utils import run_sparql_query
6+
from pandasaurus.utils.query_utils import chunks, run_sparql_query
77
from pandasaurus.utils.sparql_queries import get_label_query, get_replaced_by_query
88

99

@@ -12,6 +12,8 @@ class CurieValidator:
1212
replacements for obsoleted slim terms.
1313
"""
1414

15+
_CURIE_CHUNK_SIZE = 90
16+
1517
@staticmethod
1618
@abstractmethod
1719
def validate_curie_prefixes(curie_list: List[str]) -> Dict[str, bool]:
@@ -30,7 +32,7 @@ def validate_curie_prefixes(curie_list: List[str]) -> Dict[str, bool]:
3032
raise NotImplementedError
3133

3234
@staticmethod
33-
def validate_curie_list(curie_list: List[str]) -> Dict[str, bool]:
35+
def validate_curie_list(curie_list: List[str]) -> dict[str, dict[str, Union[Optional[bool], Any]]]:
3436
"""Reports whether the CURIEs are valid or not.
3537
3638
Args:
@@ -46,8 +48,14 @@ def validate_curie_list(curie_list: List[str]) -> Dict[str, bool]:
4648
True or False status of the CURIE validation for each term
4749
4850
"""
49-
query_string = get_label_query(curie_list)
50-
result_dict = dict([(r.get("term"), r.get("label")) for r in run_sparql_query(query_string)])
51+
result_dict: Dict[str, Optional[str]] = {}
52+
for chunk in chunks(curie_list, CurieValidator._CURIE_CHUNK_SIZE):
53+
# Large lists are split up to avoid massive VALUES blocks in SPARQL queries.
54+
query_string = get_label_query(chunk)
55+
for res in run_sparql_query(query_string):
56+
term = res.get("term")
57+
if term:
58+
result_dict[term] = res.get("label")
5159
return {
5260
curie: {
5361
"label": result_dict.get(curie) if curie in result_dict else None,
@@ -73,8 +81,13 @@ def find_obsolete_terms(curie_list: List[str]) -> Dict:
7381
True or False status of the term for each term
7482
7583
"""
76-
query_string = get_replaced_by_query(curie_list)
77-
result_dict = dict([(r.get("term"), r) for r in run_sparql_query(query_string)])
84+
result_dict: Dict[str, Dict[str, Any]] = {}
85+
for chunk in chunks(curie_list, CurieValidator._CURIE_CHUNK_SIZE):
86+
query_string = get_replaced_by_query(chunk)
87+
for res in run_sparql_query(query_string):
88+
term = res.get("term")
89+
if term:
90+
result_dict[term] = res
7891
return result_dict
7992

8093
@staticmethod

pandasaurus/query.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ def parent_enrichment(self):
256256
self.ancestor_enrichment(1)
257257

258258
def synonym_lookup(self) -> pd.DataFrame:
259-
"""
259+
"""Return labels plus synonym rows for every seed term.
260260
261261
Returns:
262262
A DataFrame containing labels and synonyms of the terms from the seed list.
@@ -284,7 +284,7 @@ def synonym_lookup(self) -> pd.DataFrame:
284284
return result_df
285285

286286
def get_most_specific_objects(self, predicate: str, ontology: str):
287-
"""
287+
"""Return the most specific objects associated with the given predicate.
288288
289289
Args:
290290
predicate: Relationship to explore
@@ -296,6 +296,7 @@ def get_most_specific_objects(self, predicate: str, ontology: str):
296296
- http://purl.obolibrary.org/obo/uberon.owl
297297
298298
Returns:
299+
DataFrame capturing subject, predicate, and object labels.
299300
300301
"""
301302
subject_list = [term.get_iri() for term in self._term_list]
@@ -310,7 +311,7 @@ def get_most_specific_objects(self, predicate: str, ontology: str):
310311
)
311312

312313
def get_most_specific_subjects(self, predicate: str, ontology: str):
313-
"""
314+
"""Return the most specific subjects associated with the given predicate.
314315
315316
Args:
316317
predicate: Relationship to explore
@@ -322,6 +323,7 @@ def get_most_specific_subjects(self, predicate: str, ontology: str):
322323
- http://purl.obolibrary.org/obo/uberon.owl
323324
324325
Returns:
326+
DataFrame capturing subject, predicate, and object labels.
325327
326328
"""
327329
object_list = [term.get_iri() for term in self._term_list]
@@ -354,7 +356,8 @@ def update_obsoleted_terms(self):
354356
"""Replaces all obsoleted terms in the term list with the new term that obsoletes them."""
355357
[getattr(term, "update_obsoleted_term")() for term in self._term_list]
356358

357-
def mirror_enrichment_for_graph_generation(self, term_list: List[str]):
359+
def mirror_enrichment_for_graph_generation(self, term_list: List[str]) -> None:
360+
"""Populate `graph_df` with all pairwise enrichment edges for graph output."""
358361
# TODO definitely need a refactoring later on
359362
s_result = []
360363
for s_chunk in chunks(term_list, 45):
@@ -373,7 +376,8 @@ def mirror_enrichment_for_graph_generation(self, term_list: List[str]):
373376
.reset_index(drop=True)
374377
)
375378

376-
def _generate_enrichment_graph(self, object_list):
379+
def _generate_enrichment_graph(self, object_list: List[str]) -> None:
380+
"""Build the Graph representation backing the enrichment results."""
377381
self.mirror_enrichment_for_graph_generation(object_list)
378382
self.graph = GraphGenerator.generate_enrichment_graph(self.graph_df)
379383
self.graph = GraphGenerator.apply_transitive_reduction(self.graph, self.enriched_df["p"].unique().tolist())

pandasaurus/resources/term.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,29 @@ class Term:
66

77
def __init__(
88
self,
9-
label: str,
9+
label: Optional[str],
1010
iri: str,
1111
is_valid: bool,
1212
new_label: Optional[str] = None,
1313
new_iri: Optional[str] = None,
1414
):
15+
"""Initialize a term instance.
16+
17+
Args:
18+
label: Human readable label; can be None for invalid CURIEs.
19+
iri: The CURIE/IRI of the term.
20+
is_valid: Whether the term exists in the ontology.
21+
new_label: Replacement label for obsolete terms.
22+
new_iri: Replacement IRI for obsolete terms.
23+
"""
1524
self.__label = label
1625
self.__iri = iri
1726
self.__is_valid = is_valid
1827
self.__new_label = new_label
1928
self.__new_iri = new_iri
2029
self.__is_obsolete: bool = True if new_label and new_iri else False
2130

22-
def get_label(self) -> str:
31+
def get_label(self) -> Optional[str]:
2332
"""Returns term label.
2433
2534
Returns:
@@ -46,7 +55,7 @@ def get_is_valid(self) -> bool:
4655
"""
4756
return self.__is_valid
4857

49-
def get_new_label(self) -> str:
58+
def get_new_label(self) -> Optional[str]:
5059
"""Returns new term label of obsoleted term.
5160
5261
Returns:
@@ -55,7 +64,7 @@ def get_new_label(self) -> str:
5564
"""
5665
return self.__new_label
5766

58-
def get_new_iri(self) -> str:
67+
def get_new_iri(self) -> Optional[str]:
5968
"""Returns new term IRI of obsoleted term.
6069
6170
Returns:
@@ -64,7 +73,7 @@ def get_new_iri(self) -> str:
6473
"""
6574
return self.__new_iri
6675

67-
def get_is_obsoleted(self) -> str:
76+
def get_is_obsoleted(self) -> bool:
6877
"""Returns term obsoletion status.
6978
7079
Returns:

pandasaurus/slim_manager.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,6 @@ def get_slim_members(slim_list: List[str]) -> List[str]:
5252
]
5353

5454
@staticmethod
55-
def _get_ontology_list():
55+
def _get_ontology_list() -> List[str]:
56+
"""Return ontology titles available in Ubergraph."""
5657
return [row.get("title") for row in run_sparql_query(get_ontology_list_query())]

pandasaurus/utils/logging_config.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
import logging
22
import sys
3+
from logging import Logger, LogRecord
34

45

5-
# Create a filter to exclude ERROR log records
66
class NoErrorFilter(logging.Filter):
7-
def filter(self, record):
7+
"""Filter that drops records at exactly ERROR level; every other level passes through."""
8+
9+
def filter(self, record: LogRecord) -> bool:
10+
"""Return True when the log record is not an ERROR level entry."""
811
return record.levelno != logging.ERROR
912

1013

11-
def configure_logger():
14+
def configure_logger() -> Logger:
15+
"""Configure and return the shared pandasaurus logger."""
1216
logger = logging.getLogger(__name__)
1317
logger.setLevel(logging.INFO)
1418
# logger.propagate = False

pandasaurus/utils/query_utils.py

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,28 @@
1-
from typing import Iterator
1+
import os
2+
from typing import Iterable, Iterator, List, Sequence, TypeVar
23

4+
import certifi
35
from oaklib.implementations import UbergraphImplementation
46

7+
# Ensure HTTPS requests trust the certifi bundle; this avoids local certificate issues.
8+
os.environ.setdefault("SSL_CERT_FILE", certifi.where())
9+
os.environ.setdefault("REQUESTS_CA_BUNDLE", certifi.where())
10+
511
oi = UbergraphImplementation()
12+
T = TypeVar("T")
613

714

815
def run_sparql_query(query: str) -> Iterator:
16+
"""Execute a SPARQL query against Ubergraph."""
917
return oi.query(query=query, prefixes=get_prefixes(query, oi.prefix_map().keys()))
1018

1119

12-
def chunks(lst, n):
13-
for i in range(0, len(lst), n):
14-
yield lst[i : i + n]
15-
20+
def chunks(items: Sequence[T], size: int) -> Iterator[Sequence[T]]:
    """Yield consecutive slices of `items` holding at most `size` entries each.

    Args:
        items: Sequence to split; it is sliced, never modified.
        size: Maximum length of each slice; must be at least 1.

    Yields:
        Slices of `items` in order; only the last one may be shorter than `size`.

    Raises:
        ValueError: If `size` is less than 1. Because this is a generator, the
            error surfaces when iteration starts, not when `chunks` is called.
    """
    if size < 1:
        # A negative step makes range() empty and would silently drop every item;
        # zero already raised ValueError inside range(). Fail loudly for both.
        raise ValueError(f"size must be a positive integer, got {size!r}")
    for start in range(0, len(items), size):
        yield items[start : start + size]
1624

17-
def get_prefixes(text, prefix_map):
18-
_prefixes = []
19-
for prefix in prefix_map:
20-
if prefix + ":" in text:
21-
_prefixes.append(prefix)
2225

23-
return _prefixes
26+
def get_prefixes(text: str, prefix_map: Iterable[str]) -> List[str]:
27+
"""Return CURIE prefixes referenced in `text`."""
28+
return [prefix for prefix in prefix_map if f"{prefix}:" in text]

poetry.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ python = "^3.9"
1212
oaklib = "^0.6.23"
1313
pandas = "^2.0.1"
1414
rdflib = "^6.3.2"
15+
certifi = "^2024.2.2"
1516
sphinx = { version = "^7.2.6", optional = true }
1617
sphinx-rtd-theme = { version = "^1.3.0", optional = true }
1718
sphinx-copybutton = { version = "^0.5.2", optional = true }
@@ -37,4 +38,4 @@ docs = ["sphinx", "sphinx-rtd-theme", "sphinx-copybutton"]
3738
line-length = 120
3839

3940
[tool.isort]
40-
profile = "black"
41+
profile = "black"

0 commit comments

Comments
 (0)