|
42 | 42 | from dotenv import load_dotenv |
43 | 43 | from tqdm import tqdm |
44 | 44 |
|
# Prefix -> IRI expansion pairs used as the starting curie_map for SSSOM output.
CORE_CURIE_MAP: dict[str, str] = {
    "METPO": "http://purl.obolibrary.org/obo/METPO_",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "semapv": "https://w3id.org/semapv/vocab/",
}
| 50 | + |
# Non-OBO namespaces that may be compacted to CURIEs: prefix -> IRI base.
# An IRI is matched against these bases with a plain "starts with" test.
KNOWN_IRI_PREFIXES: dict[str, str] = {
    "biolink": "https://w3id.org/biolink/vocab/",
    "d3o": "https://purl.dsmz.de/schema/",
    "doi": "http://doi.org/",
    "bipon": "http://www.semanticweb.org/BiPON/",
}
| 57 | + |
| 58 | + |
def strip_angle_brackets(identifier: str) -> str:
    """Return *identifier* trimmed of outer whitespace and one enclosing ``<...>`` pair.

    The brackets are removed only when both the leading ``<`` and the
    trailing ``>`` are present; otherwise the trimmed text is returned as is.
    """
    trimmed = identifier.strip()
    is_wrapped = trimmed[:1] == "<" and trimmed[-1:] == ">"
    return trimmed[1:-1] if is_wrapped else trimmed
| 64 | + |
| 65 | + |
| 66 | +def iri_to_curie(iri: str) -> tuple[str, str] | None: |
| 67 | + """Convert common IRI forms to CURIEs when safely possible.""" |
| 68 | + # OBO-style compact form, e.g. http://purl.obolibrary.org/obo/GO_0008152 -> GO:0008152 |
| 69 | + if iri.startswith("http://purl.obolibrary.org/obo/"): |
| 70 | + local = iri.removeprefix("http://purl.obolibrary.org/obo/") |
| 71 | + if "_" in local: |
| 72 | + prefix, suffix = local.split("_", 1) |
| 73 | + if prefix and suffix: |
| 74 | + return f"{prefix}:{suffix}", f"http://purl.obolibrary.org/obo/{prefix}_" |
| 75 | + return None |
| 76 | + |
| 77 | + for prefix, base in KNOWN_IRI_PREFIXES.items(): |
| 78 | + if iri.startswith(base): |
| 79 | + local = iri.removeprefix(base) |
| 80 | + if local: |
| 81 | + return f"{prefix}:{local}", base |
| 82 | + |
| 83 | + # Accept https variant for doi |
| 84 | + if iri.startswith("https://doi.org/"): |
| 85 | + local = iri.removeprefix("https://doi.org/") |
| 86 | + if local: |
| 87 | + return f"doi:{local}", KNOWN_IRI_PREFIXES["doi"] |
| 88 | + |
| 89 | + return None |
| 90 | + |
| 91 | + |
def normalize_object_id(raw_identifier: str) -> tuple[str, dict[str, str]]:
    """Normalize mapping object id to CURIE where possible, else plain IRI (no <>).

    Returns:
        A ``(identifier, prefixes)`` pair. ``prefixes`` holds the single
        prefix -> expansion entry needed to resolve a returned CURIE, and is
        empty when the identifier is blank or was left as a plain IRI.
    """
    bare = strip_angle_brackets(raw_identifier)
    # A blank identifier is never sent through CURIE conversion.
    result = iri_to_curie(bare) if bare else None
    if result is None:
        # Keep the plain IRI if no safe CURIE normalization is available.
        return bare, {}

    compact, base = result
    namespace, _, _ = compact.partition(":")
    return compact, {namespace: base}
| 106 | + |
45 | 107 |
|
46 | 108 | def load_metpo_terms( |
47 | 109 | tsv_path: str, |
@@ -178,12 +240,18 @@ def write_sssom_output( |
178 | 240 | else: |
179 | 241 | filtered = [m for m in matches if m["distance"] <= max_distance] |
180 | 242 |
|
| 243 | + normalized_rows: list[dict] = [] |
| 244 | + curie_map = dict(CORE_CURIE_MAP) |
| 245 | + for m in filtered: |
| 246 | + normalized_object_id, object_prefixes = normalize_object_id(str(m["match_iri"])) |
| 247 | + curie_map.update(object_prefixes) |
| 248 | + normalized_rows.append({**m, "normalized_object_id": normalized_object_id}) |
| 249 | + |
181 | 250 | with Path(output_path).open("w", encoding="utf-8", newline="") as f: |
182 | 251 | # Write metadata block |
183 | 252 | f.write("# curie_map:\n") |
184 | | - f.write("# METPO: http://purl.obolibrary.org/obo/METPO_\n") |
185 | | - f.write("# skos: http://www.w3.org/2004/02/skos/core#\n") |
186 | | - f.write("# semapv: https://w3id.org/semapv/vocab/\n") |
| 253 | + for prefix in sorted(curie_map): |
| 254 | + f.write(f"# {prefix}: {curie_map[prefix]}\n") |
187 | 255 | f.write( |
188 | 256 | f"# mapping_set_id: metpo-ontology-mappings-{datetime.now(UTC).date().isoformat()}\n" |
189 | 257 | ) |
@@ -264,14 +332,14 @@ def write_sssom_output( |
264 | 332 | ] |
265 | 333 | ) |
266 | 334 |
|
267 | | - for m in filtered: |
| 335 | + for m in normalized_rows: |
268 | 336 | similarity = 1.0 - (m["distance"] / 2.0) |
269 | 337 | writer.writerow( |
270 | 338 | [ |
271 | 339 | m["metpo_id"], |
272 | 340 | m["metpo_label"], |
273 | 341 | similarity_to_predicate(similarity), |
274 | | - m["match_iri"], |
| 342 | + m["normalized_object_id"], |
275 | 343 | m["match_document"], |
276 | 344 | "semapv:SemanticSimilarityThresholdMatching", |
277 | 345 | f"{similarity:.6f}", |
|
0 commit comments