Skip to content

Commit 1fdf407

Browse files
committed
Standardize SSSOM object_id normalization and curie_map-based prefix parsing (#351)
1 parent 4bf2de6 commit 1fdf407

File tree

6 files changed

+216
-60
lines changed

6 files changed

+216
-60
lines changed

metpo/analysis/analyze_ontology_value.py

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@
1717
import click
1818
import pandas as pd
1919

20+
from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map
2021

21-
def extract_term_source(iri: str) -> str:
22+
23+
def extract_term_source(iri: str, curie_map: dict[str, str] | None = None) -> str:
    """
    Extract the defining ontology from an IRI or CURIE.

    Resolution prefers a curie_map-driven prefix lookup (``extract_prefix``)
    and falls back to structural pattern matching on the raw identifier.

    Examples:
    - http://purl.obolibrary.org/obo/GO_0008150 → GO
    - https://w3id.org/biolink/vocab/phenotype → BIOLINK

    Every recognized source is reported in upper case (GO, BIOLINK, D3O, …)
    so downstream value counts do not split across case variants; anything
    unrecognized yields the ``"unknown"`` sentinel.
    """
    text = str(iri).strip()
    if not text:
        return "unknown"

    # Preferred path: prefix resolution against the SSSOM curie_map.
    extracted = extract_prefix(text, curie_map)
    if extracted:
        return extracted.upper()

    # OBO pattern: .../obo/PREFIX_ID
    obo_match = re.search(r"/obo/([A-Za-z]+)_\d+", text)
    if obo_match:
        return obo_match.group(1).upper()

    lowered = text.lower()
    if "biolink" in lowered:
        # Upper-cased for consistency with the curie_map path, which reports
        # the same IRIs as BIOLINK (extracted.upper()).
        return "BIOLINK"
    if "doi.org" in lowered:
        doi_match = re.search(r"doi\.org/[^/]+/([A-Za-z]+)", text, flags=re.IGNORECASE)
        if doi_match:
            return doi_match.group(1).upper()
        # doi.org identifier with no extractable registrant token.
        return "unknown"
    if "purl.dsmz.de" in lowered:
        return "D3O"

    return "unknown"
4755

4856

4957
@click.command()
@@ -53,9 +61,10 @@ def main(input):
5361

5462
print(f"Loading mappings from: {input}")
5563
df = pd.read_csv(input, sep="\t", comment="#")
64+
curie_map = parse_sssom_curie_map(input)
5665

5766
# Extract term source from IRI
58-
df["term_source"] = df["object_id"].apply(extract_term_source)
67+
df["term_source"] = df["object_id"].apply(lambda x: extract_term_source(x, curie_map))
5968

6069
print(f"\nLoaded {len(df)} mappings")
6170
print(f"Unique ontology files (object_source): {df['object_source'].nunique()}")

metpo/analysis/extract_definitions_from_mappings.py

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
import pandas as pd
1818

19+
from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map
20+
1921
# Configuration
2022
SSSOM_FILE = "../../data/mappings/metpo_mappings_combined_relaxed.sssom.tsv"
2123
METPO_SHEET = "../../src/templates/metpo_sheet.tsv"
@@ -48,27 +50,34 @@ def parse_object_label(label: str) -> tuple[str, str | None]:
4850
return label.strip(), None
4951

5052

51-
def extract_ontology_prefix(iri: str) -> str:
53+
def extract_ontology_prefix(
    iri: str,
    curie_map: dict[str, str] | None = None,
) -> str:
    """Extract ontology prefix from IRI.

    Resolves via the SSSOM curie_map first (``extract_prefix``), then falls
    back to structural pattern matching.  Recognized prefixes are returned in
    upper case; unrecognized identifiers yield ``"UNKNOWN"``.
    """
    # NOTE(review): str.strip("<>") removes any run of angle brackets at the
    # ends, not just one balanced pair — acceptable for SSSOM object_ids.
    text = str(iri).strip().strip("<>").strip()
    if not text:
        return "UNKNOWN"

    # Preferred path: curie_map-driven prefix resolution.
    extracted = extract_prefix(text, curie_map)
    if extracted:
        return extracted.upper()

    lowered = text.lower()
    if "obo/" in text:
        # Mixed-case OBO prefixes (e.g. NCBITaxon) must match too, hence
        # [A-Za-z]+ rather than [A-Z]+; result upper-cased for consistency
        # with the curie_map path and the sibling analysis script.
        match = re.search(r"/obo/([A-Za-z]+)_", text)
        if match:
            return match.group(1).upper()
        return "UNKNOWN"
    if "doi.org" in lowered:
        return "DOI"
    if "biolink" in lowered:
        return "BIOLINK"
    if "dsmz" in lowered:
        return "D3O"
    if "mdatahub" in lowered:
        return "MEO"

    return "UNKNOWN"
7281

7382

7483
def load_metpo_sheet() -> pd.DataFrame:
@@ -114,6 +123,7 @@ def load_sssom_mappings() -> pd.DataFrame:
114123

115124
# Read from data start
116125
df = pd.read_csv(SSSOM_FILE, sep="\t", skiprows=data_start)
126+
curie_map = parse_sssom_curie_map(SSSOM_FILE)
117127

118128
# Extract distance from comment
119129
df["distance"] = df["comment"].apply(extract_distance_from_comment)
@@ -124,7 +134,7 @@ def load_sssom_mappings() -> pd.DataFrame:
124134
)
125135

126136
# Extract ontology prefix
127-
df["ontology"] = df["object_id"].apply(extract_ontology_prefix)
137+
df["ontology"] = df["object_id"].apply(lambda x: extract_ontology_prefix(x, curie_map))
128138

129139
return df
130140

metpo/pipeline/chromadb_semantic_mapper.py

Lines changed: 73 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,68 @@
4242
from dotenv import load_dotenv
4343
from tqdm import tqdm
4444

45+
# Prefixes always written to the SSSOM "# curie_map:" header, independent of
# which object prefixes actually occur in the mapping rows.
CORE_CURIE_MAP = {
    "METPO": "http://purl.obolibrary.org/obo/METPO_",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "semapv": "https://w3id.org/semapv/vocab/",
}

# Non-OBO IRI bases that iri_to_curie() may safely compact to CURIEs.
# Keys become CURIE prefixes; values are the exact IRI prefixes matched.
KNOWN_IRI_PREFIXES = {
    "biolink": "https://w3id.org/biolink/vocab/",
    "d3o": "https://purl.dsmz.de/schema/",
    "doi": "http://doi.org/",
    "bipon": "http://www.semanticweb.org/BiPON/",
}
57+
58+
59+
def strip_angle_brackets(identifier: str) -> str:
    """Drop one surrounding ``<...>`` pair from *identifier*, if present."""
    trimmed = identifier.strip()
    wrapped = trimmed.startswith("<") and trimmed.endswith(">")
    return trimmed[1:-1] if wrapped else trimmed
64+
65+
66+
def iri_to_curie(iri: str) -> tuple[str, str] | None:
    """Convert common IRI forms to CURIEs when safely possible.

    Returns ``(curie, expansion)`` for a safe compaction, or ``None`` when the
    IRI should be kept as-is.
    """
    obo_base = "http://purl.obolibrary.org/obo/"
    if iri.startswith(obo_base):
        # OBO-style compact form, e.g. .../obo/GO_0008152 -> GO:0008152
        remainder = iri[len(obo_base):]
        obo_prefix, underscore, local_id = remainder.partition("_")
        if underscore and obo_prefix and local_id:
            return f"{obo_prefix}:{local_id}", f"{obo_base}{obo_prefix}_"
        return None

    for known_prefix, base in KNOWN_IRI_PREFIXES.items():
        if iri.startswith(base) and len(iri) > len(base):
            return f"{known_prefix}:{iri[len(base):]}", base

    # Accept the https variant of DOI; the emitted expansion stays the
    # canonical http form from KNOWN_IRI_PREFIXES.
    https_doi = "https://doi.org/"
    if iri.startswith(https_doi) and len(iri) > len(https_doi):
        return f"doi:{iri[len(https_doi):]}", KNOWN_IRI_PREFIXES["doi"]

    return None
90+
91+
92+
def normalize_object_id(raw_identifier: str) -> tuple[str, dict[str, str]]:
    """Normalize mapping object id to CURIE where possible, else plain IRI (no <>).

    Returns the normalized identifier together with any ``{prefix: expansion}``
    entry that the emitted curie_map must declare for it (empty when none).
    """
    candidate = strip_angle_brackets(raw_identifier)
    if not candidate:
        return candidate, {}

    compacted = iri_to_curie(candidate)
    if compacted is None:
        # No safe CURIE normalization available; keep the plain IRI.
        return candidate, {}

    curie, expansion = compacted
    return curie, {curie.partition(":")[0]: expansion}
106+
45107

46108
def load_metpo_terms(
47109
tsv_path: str,
@@ -178,12 +240,18 @@ def write_sssom_output(
178240
else:
179241
filtered = [m for m in matches if m["distance"] <= max_distance]
180242

243+
normalized_rows: list[dict] = []
244+
curie_map = dict(CORE_CURIE_MAP)
245+
for m in filtered:
246+
normalized_object_id, object_prefixes = normalize_object_id(str(m["match_iri"]))
247+
curie_map.update(object_prefixes)
248+
normalized_rows.append({**m, "normalized_object_id": normalized_object_id})
249+
181250
with Path(output_path).open("w", encoding="utf-8", newline="") as f:
182251
# Write metadata block
183252
f.write("# curie_map:\n")
184-
f.write("# METPO: http://purl.obolibrary.org/obo/METPO_\n")
185-
f.write("# skos: http://www.w3.org/2004/02/skos/core#\n")
186-
f.write("# semapv: https://w3id.org/semapv/vocab/\n")
253+
for prefix in sorted(curie_map):
254+
f.write(f"# {prefix}: {curie_map[prefix]}\n")
187255
f.write(
188256
f"# mapping_set_id: metpo-ontology-mappings-{datetime.now(UTC).date().isoformat()}\n"
189257
)
@@ -264,14 +332,14 @@ def write_sssom_output(
264332
]
265333
)
266334

267-
for m in filtered:
335+
for m in normalized_rows:
268336
similarity = 1.0 - (m["distance"] / 2.0)
269337
writer.writerow(
270338
[
271339
m["metpo_id"],
272340
m["metpo_label"],
273341
similarity_to_predicate(similarity),
274-
m["match_iri"],
342+
m["normalized_object_id"],
275343
m["match_document"],
276344
"semapv:SemanticSimilarityThresholdMatching",
277345
f"{similarity:.6f}",

metpo/presentations/analyze_primary_sources.py

Lines changed: 8 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
from collections import Counter
1313
from pathlib import Path
1414

15+
from metpo.utils.sssom_utils import extract_prefix, parse_sssom_curie_map
16+
1517
# PRIMARY SOURCE 1: ChromaDB databases
1618
CHROMA_COMBINED = Path("/home/mark/gitrepos/metpo/notebooks/chroma_combined/chroma.sqlite3")
1719
CHROMA_OLS_20 = Path("/home/mark/gitrepos/metpo/notebooks/chroma_ols_20/chroma.sqlite3")
@@ -65,30 +67,18 @@ def analyze_sssom(sssom_path):
6567
target_prefixes = Counter()
6668
predicates = Counter()
6769
total_mappings = 0
70+
curie_map = parse_sssom_curie_map(sssom_path)
6871

6972
with Path(sssom_path).open() as f:
70-
reader = csv.DictReader(f, delimiter="\t")
73+
data_lines = [line for line in f if not line.startswith("#")]
74+
reader = csv.DictReader(data_lines, delimiter="\t")
7175
for row in reader:
7276
total_mappings += 1
7377

7478
# Extract prefix from object_id
75-
obj_id = row.get("object_id", "")
76-
if "/obo/" in obj_id:
77-
# e.g., http://purl.obolibrary.org/obo/PATO_0000001
78-
prefix = obj_id.split("/obo/")[1].split("_")[0] if "_" in obj_id else ""
79-
if prefix:
80-
target_prefixes[prefix] += 1
81-
elif "doi.org/10.1601" in obj_id:
82-
target_prefixes["N4L"] += 1
83-
elif "purl.dsmz.de" in obj_id:
84-
if "d3o" in obj_id.lower():
85-
target_prefixes["D3O"] += 1
86-
elif "miso" in obj_id.lower():
87-
target_prefixes["MISO"] += 1
88-
elif "mdatahub.org" in obj_id.lower():
89-
target_prefixes["MEO"] += 1
90-
elif "biolink" in obj_id.lower():
91-
target_prefixes["BIOLINK"] += 1
79+
prefix = extract_prefix(row.get("object_id", ""), curie_map)
80+
if prefix:
81+
target_prefixes[prefix.upper()] += 1
9282

9383
pred = row.get("predicate_id", "")
9484
predicates[pred] += 1

metpo/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
"""Shared utilities for METPO scripts."""
2+

metpo/utils/sssom_utils.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""Helpers for parsing SSSOM metadata and identifiers."""
2+
3+
import re
4+
from pathlib import Path
5+
6+
# Matches one curie_map header entry, e.g. "#   GO: http://purl.obolibrary.org/obo/GO_".
# Group 1 = prefix, group 2 = expansion; up to four spaces of indent tolerated.
CURIE_MAP_LINE = re.compile(r"^#\s{0,4}([A-Za-z][\w.-]*):\s*(\S+)\s*$")
7+
8+
9+
def strip_angle_brackets(identifier: str) -> str:
    """Return identifier without surrounding angle brackets."""
    trimmed = str(identifier).strip()
    if trimmed.startswith("<") and trimmed.endswith(">"):
        # Strip again: "< x >" should yield "x", not " x ".
        trimmed = trimmed[1:-1].strip()
    return trimmed
15+
16+
17+
def parse_sssom_curie_map(sssom_path: str | Path) -> dict[str, str]:
18+
"""Parse ``# curie_map`` prefixes from an SSSOM TSV header."""
19+
curie_map: dict[str, str] = {}
20+
in_curie_map = False
21+
22+
with Path(sssom_path).open(encoding="utf-8") as handle:
23+
for line in handle:
24+
if not line.startswith("#"):
25+
break
26+
27+
text = line.rstrip("\n")
28+
if text.strip() == "# curie_map:":
29+
in_curie_map = True
30+
continue
31+
32+
if in_curie_map:
33+
match = CURIE_MAP_LINE.match(text)
34+
if match:
35+
prefix, expansion = match.groups()
36+
curie_map[prefix] = expansion
37+
continue
38+
39+
# End curie_map block at the first non-entry comment line.
40+
if text.startswith("# ") and ":" in text:
41+
in_curie_map = False
42+
43+
return curie_map
44+
45+
46+
def extract_prefix(identifier: str, curie_map: dict[str, str] | None = None) -> str | None:
    """
    Extract prefix from CURIE/IRI identifier.

    Resolution order:
    1. the CURIE prefix, when the identifier is already CURIE-like;
    2. the longest matching curie_map expansion, when it is an IRI;
    3. a conservative structural fallback for OBO-style IRIs.

    Returns ``None`` when no prefix can be determined.
    """
    candidate = strip_angle_brackets(identifier)
    if not candidate:
        return None

    is_iri = candidate.lower().startswith(("http://", "https://"))
    if ":" in candidate and not is_iri:
        # Already CURIE-like: everything before the first colon is the prefix.
        return candidate.split(":", 1)[0]

    if curie_map:
        # Longest expansion wins; ties keep the earliest map entry.
        hits = [
            (name, expansion)
            for name, expansion in curie_map.items()
            if candidate.startswith(expansion)
        ]
        if hits:
            return max(hits, key=lambda item: len(item[1]))[0]

    if "/obo/" in candidate and "_" in candidate:
        # OBO IRI fallback: .../obo/PREFIX_LOCALID
        return candidate.split("/obo/")[1].split("_")[0]

    return None

0 commit comments

Comments
 (0)