From 1fdf4077a57c2e6217e66a4c9ee86658baac201a Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Fri, 13 Feb 2026 13:58:44 -0500 Subject: [PATCH 1/4] Standardize SSSOM object_id normalization and curie_map-based prefix parsing (#351) --- metpo/analysis/analyze_ontology_value.py | 47 ++++++----- .../extract_definitions_from_mappings.py | 46 ++++++----- metpo/pipeline/chromadb_semantic_mapper.py | 78 +++++++++++++++++-- .../presentations/analyze_primary_sources.py | 26 ++----- metpo/utils/__init__.py | 2 + metpo/utils/sssom_utils.py | 77 ++++++++++++++++++ 6 files changed, 216 insertions(+), 60 deletions(-) create mode 100644 metpo/utils/__init__.py create mode 100644 metpo/utils/sssom_utils.py diff --git a/metpo/analysis/analyze_ontology_value.py b/metpo/analysis/analyze_ontology_value.py index cfc08810..d4d61f44 100644 --- a/metpo/analysis/analyze_ontology_value.py +++ b/metpo/analysis/analyze_ontology_value.py @@ -17,8 +17,10 @@ import click import pandas as pd +from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map -def extract_term_source(iri: str) -> str: + +def extract_term_source(iri: str, curie_map: dict[str, str] | None = None) -> str: """ Extract the defining ontology from an IRI. 
@@ -27,23 +29,29 @@ def extract_term_source(iri: str) -> str: - http://purl.obolibrary.org/obo/GO_0008150 → GO - https://w3id.org/biolink/vocab/phenotype → biolink """ - # OBO pattern: .../obo/PREFIX_ID - obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", iri) - if obo_match: - return obo_match.group(1).upper() - - # Biolink pattern - if "biolink" in iri: - return "biolink" - - # DOI pattern - if "doi.org" in iri: - doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", iri) - if doi_match: - return doi_match.group(1) - - # Default: return as-is - return "unknown" + text = str(iri).strip() + source = "unknown" + if not text: + return source + + extracted = extract_prefix(text, curie_map) + if extracted: + source = extracted.upper() + else: + # OBO pattern: .../obo/PREFIX_ID + obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", text) + if obo_match: + source = obo_match.group(1).upper() + elif "biolink" in text.lower(): + source = "biolink" + elif "doi.org" in text.lower(): + doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", text, flags=re.IGNORECASE) + if doi_match: + source = doi_match.group(1) + elif "purl.dsmz.de" in text.lower(): + source = "D3O" + + return source @click.command() @@ -53,9 +61,10 @@ def main(input): print(f"Loading mappings from: {input}") df = pd.read_csv(input, sep="\t", comment="#") + curie_map = parse_sssom_curie_map(input) # Extract term source from IRI - df["term_source"] = df["object_id"].apply(extract_term_source) + df["term_source"] = df["object_id"].apply(lambda x: extract_term_source(x, curie_map)) print(f"\nLoaded {len(df)} mappings") print(f"Unique ontology files (object_source): {df['object_source'].nunique()}") diff --git a/metpo/analysis/extract_definitions_from_mappings.py b/metpo/analysis/extract_definitions_from_mappings.py index 13e4849f..00476588 100644 --- a/metpo/analysis/extract_definitions_from_mappings.py +++ b/metpo/analysis/extract_definitions_from_mappings.py @@ -16,6 +16,8 @@ import pandas as pd +from 
metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map + # Configuration SSSOM_FILE = "../../data/mappings/metpo_mappings_combined_relaxed.sssom.tsv" METPO_SHEET = "../../src/templates/metpo_sheet.tsv" @@ -48,27 +50,34 @@ def parse_object_label(label: str) -> tuple[str, str | None]: return label.strip(), None -def extract_ontology_prefix(iri: str) -> str: +def extract_ontology_prefix( + iri: str, + curie_map: dict[str, str] | None = None, +) -> str: """Extract ontology prefix from IRI.""" - # Handle obo format - if "obo/" in iri: - match = re.search(r"/obo/([A-Z]+)_", iri) - if match: - return match.group(1) + text = str(iri).strip().strip("<>").strip() + prefix = "UNKNOWN" + if not text: + return prefix - # Handle DOI format - if "doi.org" in iri: - return "DOI" + extracted = extract_prefix(text, curie_map) + if extracted: + return extracted.upper() - # Handle other formats - if "biolink" in iri.lower(): - return "BIOLINK" - if "dsmz" in iri.lower(): - return "D3O" - if "mdatahub" in iri.lower(): - return "MEO" + if "obo/" in text: + match = re.search(r"/obo/([A-Z]+)_", text) + if match: + prefix = match.group(1) + elif "doi.org" in text.lower(): + prefix = "DOI" + elif "biolink" in text.lower(): + prefix = "BIOLINK" + elif "dsmz" in text.lower(): + prefix = "D3O" + elif "mdatahub" in text.lower(): + prefix = "MEO" - return "UNKNOWN" + return prefix def load_metpo_sheet() -> pd.DataFrame: @@ -114,6 +123,7 @@ def load_sssom_mappings() -> pd.DataFrame: # Read from data start df = pd.read_csv(SSSOM_FILE, sep="\t", skiprows=data_start) + curie_map = parse_sssom_curie_map(SSSOM_FILE) # Extract distance from comment df["distance"] = df["comment"].apply(extract_distance_from_comment) @@ -124,7 +134,7 @@ def load_sssom_mappings() -> pd.DataFrame: ) # Extract ontology prefix - df["ontology"] = df["object_id"].apply(extract_ontology_prefix) + df["ontology"] = df["object_id"].apply(lambda x: extract_ontology_prefix(x, curie_map)) return df diff --git 
a/metpo/pipeline/chromadb_semantic_mapper.py b/metpo/pipeline/chromadb_semantic_mapper.py index a9cedfaf..22d9afb3 100644 --- a/metpo/pipeline/chromadb_semantic_mapper.py +++ b/metpo/pipeline/chromadb_semantic_mapper.py @@ -42,6 +42,68 @@ from dotenv import load_dotenv from tqdm import tqdm +CORE_CURIE_MAP = { + "METPO": "http://purl.obolibrary.org/obo/METPO_", + "skos": "http://www.w3.org/2004/02/skos/core#", + "semapv": "https://w3id.org/semapv/vocab/", +} + +KNOWN_IRI_PREFIXES = { + "biolink": "https://w3id.org/biolink/vocab/", + "d3o": "https://purl.dsmz.de/schema/", + "doi": "http://doi.org/", + "bipon": "http://www.semanticweb.org/BiPON/", +} + + +def strip_angle_brackets(identifier: str) -> str: + text = identifier.strip() + if text.startswith("<") and text.endswith(">"): + return text[1:-1] + return text + + +def iri_to_curie(iri: str) -> tuple[str, str] | None: + """Convert common IRI forms to CURIEs when safely possible.""" + # OBO-style compact form, e.g. http://purl.obolibrary.org/obo/GO_0008152 -> GO:0008152 + if iri.startswith("http://purl.obolibrary.org/obo/"): + local = iri.removeprefix("http://purl.obolibrary.org/obo/") + if "_" in local: + prefix, suffix = local.split("_", 1) + if prefix and suffix: + return f"{prefix}:{suffix}", f"http://purl.obolibrary.org/obo/{prefix}_" + return None + + for prefix, base in KNOWN_IRI_PREFIXES.items(): + if iri.startswith(base): + local = iri.removeprefix(base) + if local: + return f"{prefix}:{local}", base + + # Accept https variant for doi + if iri.startswith("https://doi.org/"): + local = iri.removeprefix("https://doi.org/") + if local: + return f"doi:{local}", KNOWN_IRI_PREFIXES["doi"] + + return None + + +def normalize_object_id(raw_identifier: str) -> tuple[str, dict[str, str]]: + """Normalize mapping object id to CURIE where possible, else plain IRI (no <>).""" + clean = strip_angle_brackets(raw_identifier) + if not clean: + return clean, {} + + converted = iri_to_curie(clean) + if converted is not None: 
+ curie, expansion = converted + prefix = curie.split(":", 1)[0] + return curie, {prefix: expansion} + + # Keep plain IRI if no safe CURIE normalization is available. + return clean, {} + def load_metpo_terms( tsv_path: str, @@ -178,12 +240,18 @@ def write_sssom_output( else: filtered = [m for m in matches if m["distance"] <= max_distance] + normalized_rows: list[dict] = [] + curie_map = dict(CORE_CURIE_MAP) + for m in filtered: + normalized_object_id, object_prefixes = normalize_object_id(str(m["match_iri"])) + curie_map.update(object_prefixes) + normalized_rows.append({**m, "normalized_object_id": normalized_object_id}) + with Path(output_path).open("w", encoding="utf-8", newline="") as f: # Write metadata block f.write("# curie_map:\n") - f.write("# METPO: http://purl.obolibrary.org/obo/METPO_\n") - f.write("# skos: http://www.w3.org/2004/02/skos/core#\n") - f.write("# semapv: https://w3id.org/semapv/vocab/\n") + for prefix in sorted(curie_map): + f.write(f"# {prefix}: {curie_map[prefix]}\n") f.write( f"# mapping_set_id: metpo-ontology-mappings-{datetime.now(UTC).date().isoformat()}\n" ) @@ -264,14 +332,14 @@ def write_sssom_output( ] ) - for m in filtered: + for m in normalized_rows: similarity = 1.0 - (m["distance"] / 2.0) writer.writerow( [ m["metpo_id"], m["metpo_label"], similarity_to_predicate(similarity), - m["match_iri"], + m["normalized_object_id"], m["match_document"], "semapv:SemanticSimilarityThresholdMatching", f"{similarity:.6f}", diff --git a/metpo/presentations/analyze_primary_sources.py b/metpo/presentations/analyze_primary_sources.py index d02462db..60f35917 100644 --- a/metpo/presentations/analyze_primary_sources.py +++ b/metpo/presentations/analyze_primary_sources.py @@ -12,6 +12,8 @@ from collections import Counter from pathlib import Path +from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map + # PRIMARY SOURCE 1: ChromaDB databases CHROMA_COMBINED = 
Path("/home/mark/gitrepos/metpo/notebooks/chroma_combined/chroma.sqlite3") CHROMA_OLS_20 = Path("/home/mark/gitrepos/metpo/notebooks/chroma_ols_20/chroma.sqlite3") @@ -65,30 +67,18 @@ def analyze_sssom(sssom_path): target_prefixes = Counter() predicates = Counter() total_mappings = 0 + curie_map = parse_sssom_curie_map(sssom_path) with Path(sssom_path).open() as f: - reader = csv.DictReader(f, delimiter="\t") + data_lines = [line for line in f if not line.startswith("#")] + reader = csv.DictReader(data_lines, delimiter="\t") for row in reader: total_mappings += 1 # Extract prefix from object_id - obj_id = row.get("object_id", "") - if "/obo/" in obj_id: - # e.g., http://purl.obolibrary.org/obo/PATO_0000001 - prefix = obj_id.split("/obo/")[1].split("_")[0] if "_" in obj_id else "" - if prefix: - target_prefixes[prefix] += 1 - elif "doi.org/10.1601" in obj_id: - target_prefixes["N4L"] += 1 - elif "purl.dsmz.de" in obj_id: - if "d3o" in obj_id.lower(): - target_prefixes["D3O"] += 1 - elif "miso" in obj_id.lower(): - target_prefixes["MISO"] += 1 - elif "mdatahub.org" in obj_id.lower(): - target_prefixes["MEO"] += 1 - elif "biolink" in obj_id.lower(): - target_prefixes["BIOLINK"] += 1 + prefix = extract_prefix(row.get("object_id", ""), curie_map) + if prefix: + target_prefixes[prefix.upper()] += 1 pred = row.get("predicate_id", "") predicates[pred] += 1 diff --git a/metpo/utils/__init__.py b/metpo/utils/__init__.py new file mode 100644 index 00000000..a0e8d494 --- /dev/null +++ b/metpo/utils/__init__.py @@ -0,0 +1,2 @@ +"""Shared utilities for METPO scripts.""" + diff --git a/metpo/utils/sssom_utils.py b/metpo/utils/sssom_utils.py new file mode 100644 index 00000000..54821c0f --- /dev/null +++ b/metpo/utils/sssom_utils.py @@ -0,0 +1,77 @@ +"""Helpers for parsing SSSOM metadata and identifiers.""" + +import re +from pathlib import Path + +CURIE_MAP_LINE = re.compile(r"^#\s{0,4}([A-Za-z][\w.-]*):\s*(\S+)\s*$") + + +def strip_angle_brackets(identifier: str) -> str: + 
"""Return identifier without surrounding angle brackets.""" + text = str(identifier).strip() + if text.startswith("<") and text.endswith(">"): + return text[1:-1].strip() + return text + + +def parse_sssom_curie_map(sssom_path: str | Path) -> dict[str, str]: + """Parse ``# curie_map`` prefixes from an SSSOM TSV header.""" + curie_map: dict[str, str] = {} + in_curie_map = False + + with Path(sssom_path).open(encoding="utf-8") as handle: + for line in handle: + if not line.startswith("#"): + break + + text = line.rstrip("\n") + if text.strip() == "# curie_map:": + in_curie_map = True + continue + + if in_curie_map: + match = CURIE_MAP_LINE.match(text) + if match: + prefix, expansion = match.groups() + curie_map[prefix] = expansion + continue + + # End curie_map block at the first non-entry comment line. + if text.startswith("# ") and ":" in text: + in_curie_map = False + + return curie_map + + +def extract_prefix(identifier: str, curie_map: dict[str, str] | None = None) -> str | None: + """ + Extract prefix from CURIE/IRI identifier. + + Preference order: + 1. CURIE prefix when object is already CURIE-like. + 2. curie_map expansion match when object is an IRI. + 3. conservative structural fallbacks (OBO IRIs). + """ + text = strip_angle_brackets(identifier) + if not text: + return None + + lowered = text.lower() + is_iri = lowered.startswith(("http://", "https://")) + if ":" in text and not is_iri: + return text.split(":", 1)[0] + + if curie_map: + best_prefix: str | None = None + best_len = -1 + for prefix, expansion in curie_map.items(): + if text.startswith(expansion) and len(expansion) > best_len: + best_prefix = prefix + best_len = len(expansion) + if best_prefix is not None: + return best_prefix + + if "/obo/" in text and "_" in text: + return text.split("/obo/")[1].split("_")[0] + + return None From 285c0bdf0a913bd2def7fcad847f701b5a3a5a68 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Fri, 13 Feb 2026 14:01:46 -0500 Subject: [PATCH 2/4] Format metpo utils package init for ruff format check --- metpo/utils/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/metpo/utils/__init__.py b/metpo/utils/__init__.py index a0e8d494..e08d5cc7 100644 --- a/metpo/utils/__init__.py +++ b/metpo/utils/__init__.py @@ -1,2 +1 @@ """Shared utilities for METPO scripts.""" - From c4ae38a1605d273d2e646b3cb50130865b3897a4 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Fri, 13 Feb 2026 14:13:38 -0500 Subject: [PATCH 3/4] Address Copilot review: centralize SSSOM parsing fallbacks and tighten curie_map parsing --- metpo/analysis/analyze_ontology_value.py | 26 +---- .../extract_definitions_from_mappings.py | 25 +---- metpo/pipeline/chromadb_semantic_mapper.py | 58 +--------- .../presentations/analyze_primary_sources.py | 2 +- metpo/utils/sssom_utils.py | 100 +++++++++++++++--- 5 files changed, 94 insertions(+), 117 deletions(-) diff --git a/metpo/analysis/analyze_ontology_value.py b/metpo/analysis/analyze_ontology_value.py index d4d61f44..6652d7c0 100644 --- a/metpo/analysis/analyze_ontology_value.py +++ b/metpo/analysis/analyze_ontology_value.py @@ -11,7 +11,6 @@ 3. 
Redundant sources that could be removed """ -import re from collections import defaultdict import click @@ -29,29 +28,10 @@ def extract_term_source(iri: str, curie_map: dict[str, str] | None = None) -> st - http://purl.obolibrary.org/obo/GO_0008150 → GO - https://w3id.org/biolink/vocab/phenotype → biolink """ - text = str(iri).strip() - source = "unknown" - if not text: - return source - - extracted = extract_prefix(text, curie_map) + extracted = extract_prefix(iri, curie_map) if extracted: - source = extracted.upper() - else: - # OBO pattern: .../obo/PREFIX_ID - obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", text) - if obo_match: - source = obo_match.group(1).upper() - elif "biolink" in text.lower(): - source = "biolink" - elif "doi.org" in text.lower(): - doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", text, flags=re.IGNORECASE) - if doi_match: - source = doi_match.group(1) - elif "purl.dsmz.de" in text.lower(): - source = "D3O" - - return source + return extracted.upper() + return "unknown" @click.command() diff --git a/metpo/analysis/extract_definitions_from_mappings.py b/metpo/analysis/extract_definitions_from_mappings.py index 00476588..e8eb4cc9 100644 --- a/metpo/analysis/extract_definitions_from_mappings.py +++ b/metpo/analysis/extract_definitions_from_mappings.py @@ -54,30 +54,11 @@ def extract_ontology_prefix( iri: str, curie_map: dict[str, str] | None = None, ) -> str: - """Extract ontology prefix from IRI.""" - text = str(iri).strip().strip("<>").strip() - prefix = "UNKNOWN" - if not text: - return prefix - - extracted = extract_prefix(text, curie_map) + """Extract ontology prefix from CURIE/IRI with SSSOM-aware logic.""" + extracted = extract_prefix(iri, curie_map) if extracted: return extracted.upper() - - if "obo/" in text: - match = re.search(r"/obo/([A-Z]+)_", text) - if match: - prefix = match.group(1) - elif "doi.org" in text.lower(): - prefix = "DOI" - elif "biolink" in text.lower(): - prefix = "BIOLINK" - elif "dsmz" in text.lower(): - 
prefix = "D3O" - elif "mdatahub" in text.lower(): - prefix = "MEO" - - return prefix + return "UNKNOWN" def load_metpo_sheet() -> pd.DataFrame: diff --git a/metpo/pipeline/chromadb_semantic_mapper.py b/metpo/pipeline/chromadb_semantic_mapper.py index 22d9afb3..67ce519e 100644 --- a/metpo/pipeline/chromadb_semantic_mapper.py +++ b/metpo/pipeline/chromadb_semantic_mapper.py @@ -42,68 +42,14 @@ from dotenv import load_dotenv from tqdm import tqdm +from metpo.utils.sssom_utils import normalize_object_id + CORE_CURIE_MAP = { "METPO": "http://purl.obolibrary.org/obo/METPO_", "skos": "http://www.w3.org/2004/02/skos/core#", "semapv": "https://w3id.org/semapv/vocab/", } -KNOWN_IRI_PREFIXES = { - "biolink": "https://w3id.org/biolink/vocab/", - "d3o": "https://purl.dsmz.de/schema/", - "doi": "http://doi.org/", - "bipon": "http://www.semanticweb.org/BiPON/", -} - - -def strip_angle_brackets(identifier: str) -> str: - text = identifier.strip() - if text.startswith("<") and text.endswith(">"): - return text[1:-1] - return text - - -def iri_to_curie(iri: str) -> tuple[str, str] | None: - """Convert common IRI forms to CURIEs when safely possible.""" - # OBO-style compact form, e.g. 
http://purl.obolibrary.org/obo/GO_0008152 -> GO:0008152 - if iri.startswith("http://purl.obolibrary.org/obo/"): - local = iri.removeprefix("http://purl.obolibrary.org/obo/") - if "_" in local: - prefix, suffix = local.split("_", 1) - if prefix and suffix: - return f"{prefix}:{suffix}", f"http://purl.obolibrary.org/obo/{prefix}_" - return None - - for prefix, base in KNOWN_IRI_PREFIXES.items(): - if iri.startswith(base): - local = iri.removeprefix(base) - if local: - return f"{prefix}:{local}", base - - # Accept https variant for doi - if iri.startswith("https://doi.org/"): - local = iri.removeprefix("https://doi.org/") - if local: - return f"doi:{local}", KNOWN_IRI_PREFIXES["doi"] - - return None - - -def normalize_object_id(raw_identifier: str) -> tuple[str, dict[str, str]]: - """Normalize mapping object id to CURIE where possible, else plain IRI (no <>).""" - clean = strip_angle_brackets(raw_identifier) - if not clean: - return clean, {} - - converted = iri_to_curie(clean) - if converted is not None: - curie, expansion = converted - prefix = curie.split(":", 1)[0] - return curie, {prefix: expansion} - - # Keep plain IRI if no safe CURIE normalization is available. 
- return clean, {} diff --git a/metpo/presentations/analyze_primary_sources.py b/metpo/presentations/analyze_primary_sources.py index 60f35917..7aa9825e 100644 --- a/metpo/presentations/analyze_primary_sources.py +++ b/metpo/presentations/analyze_primary_sources.py @@ -70,7 +70,7 @@ def analyze_sssom(sssom_path): curie_map = parse_sssom_curie_map(sssom_path) with Path(sssom_path).open() as f: - data_lines = [line for line in f if not line.startswith("#")] + data_lines = (line for line in f if not line.startswith("#")) reader = csv.DictReader(data_lines, delimiter="\t") for row in reader: total_mappings += 1 diff --git a/metpo/utils/sssom_utils.py b/metpo/utils/sssom_utils.py index 54821c0f..917133cb 100644 --- a/metpo/utils/sssom_utils.py +++ b/metpo/utils/sssom_utils.py @@ -3,7 +3,14 @@ import re from pathlib import Path -CURIE_MAP_LINE = re.compile(r"^#\s{0,4}([A-Za-z][\w.-]*):\s*(\S+)\s*$") +CURIE_MAP_LINE = re.compile(r"^#\s{2,}([A-Za-z][\w.-]*):\s*(\S+)\s*$") + +KNOWN_IRI_PREFIXES = { + "biolink": "https://w3id.org/biolink/vocab/", + "d3o": "https://purl.dsmz.de/schema/", + "doi": "http://doi.org/", + "bipon": "http://www.semanticweb.org/BiPON/", +} def strip_angle_brackets(identifier: str) -> str: @@ -30,19 +37,72 @@ def parse_sssom_curie_map(sssom_path: str | Path) -> dict[str, str]: continue if in_curie_map: + # End curie_map block at first normal metadata line like: + # "# mapping_set_id: ...". + if text.startswith("# ") and not text.startswith("#  "): + in_curie_map = False + continue + match = CURIE_MAP_LINE.match(text) if match: prefix, expansion = match.groups() curie_map[prefix] = expansion continue - # End curie_map block at the first non-entry comment line. - if text.startswith("# ") and ":" in text: - in_curie_map = False + # Any non-matching comment line ends the curie_map block.
+ in_curie_map = False return curie_map +def iri_to_curie(iri: str, known_iri_prefixes: dict[str, str] | None = None) -> tuple[str, str] | None: + """Convert common IRI forms to CURIEs when safely possible.""" + prefixes = known_iri_prefixes or KNOWN_IRI_PREFIXES + + # OBO-style compact form, e.g. http://purl.obolibrary.org/obo/GO_0008152 -> GO:0008152 + if iri.startswith("http://purl.obolibrary.org/obo/"): + local = iri.removeprefix("http://purl.obolibrary.org/obo/") + if "_" in local: + prefix, suffix = local.split("_", 1) + if prefix and suffix: + return f"{prefix}:{suffix}", f"http://purl.obolibrary.org/obo/{prefix}_" + return None + + for prefix, base in prefixes.items(): + if iri.startswith(base): + local = iri.removeprefix(base) + if local: + return f"{prefix}:{local}", base + + # Accept https variant for doi + if iri.startswith("https://doi.org/"): + local = iri.removeprefix("https://doi.org/") + doi_base = prefixes.get("doi", "http://doi.org/") + if local: + return f"doi:{local}", doi_base + + return None + + +def normalize_object_id( + raw_identifier: str, + known_iri_prefixes: dict[str, str] | None = None, +) -> tuple[str, dict[str, str]]: + """Normalize mapping object id to CURIE where possible, else plain IRI (no <>).""" + clean = strip_angle_brackets(raw_identifier) + if not clean: + return clean, {} + + converted = iri_to_curie(clean, known_iri_prefixes=known_iri_prefixes) + if converted is not None: + curie, expansion = converted + prefix = curie.split(":", 1)[0] + return curie, {prefix: expansion} + + # Keep plain IRI if no safe CURIE normalization is available. + return clean, {} + + def extract_prefix(identifier: str, curie_map: dict[str, str] | None = None) -> str | None: """ Extract prefix from CURIE/IRI identifier. @@ -53,25 +113,35 @@ def extract_prefix(identifier: str, curie_map: dict[str, str] | None = None) -> 3. conservative structural fallbacks (OBO IRIs). 
""" text = strip_angle_brackets(identifier) + detected: str | None = None if not text: - return None + return detected lowered = text.lower() is_iri = lowered.startswith(("http://", "https://")) if ":" in text and not is_iri: - return text.split(":", 1)[0] - - if curie_map: + detected = text.split(":", 1)[0] + elif curie_map: best_prefix: str | None = None best_len = -1 for prefix, expansion in curie_map.items(): if text.startswith(expansion) and len(expansion) > best_len: best_prefix = prefix best_len = len(expansion) - if best_prefix is not None: - return best_prefix - - if "/obo/" in text and "_" in text: - return text.split("/obo/")[1].split("_")[0] - - return None + detected = best_prefix + + if detected is None: + if "/obo/" in text and "_" in text: + detected = text.split("/obo/")[1].split("_")[0] + elif "doi.org" in lowered: + detected = "doi" + elif "biolink" in lowered: + detected = "biolink" + elif "purl.dsmz.de" in lowered: + detected = "d3o" + elif "mdatahub.org" in lowered: + detected = "meo" + elif "semanticweb.org/bipon/" in lowered: + detected = "bipon" + + return detected From ce1ac84754a7ba4beb016ab9bf1e331e5a5c29a8 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Fri, 13 Feb 2026 14:57:38 -0500 Subject: [PATCH 4/4] Format sssom_utils for CI ruff formatter check --- metpo/utils/sssom_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metpo/utils/sssom_utils.py b/metpo/utils/sssom_utils.py index 917133cb..3a318824 100644 --- a/metpo/utils/sssom_utils.py +++ b/metpo/utils/sssom_utils.py @@ -55,7 +55,9 @@ def parse_sssom_curie_map(sssom_path: str | Path) -> dict[str, str]: return curie_map -def iri_to_curie(iri: str, known_iri_prefixes: dict[str, str] | None = None) -> tuple[str, str] | None: +def iri_to_curie( + iri: str, known_iri_prefixes: dict[str, str] | None = None +) -> tuple[str, str] | None: """Convert common IRI forms to CURIEs when safely possible.""" prefixes = known_iri_prefixes or KNOWN_IRI_PREFIXES