Skip to content

Commit 59c58e0

Browse files
turbomam and claude authored
Fix METPO:2000045 assay outcome: + → - (#342) (#347)
* Fix METPO:2000045 assay outcome: + → - (#342)

  METPO:2000045 ("is not required for growth") was incorrectly marked with assay outcome "+" instead of "-". This caused kg-microbe's BacDive transform to overwrite the correct positive predicate (METPO:2000018) and produce 73 semantically inverted edges in the knowledge graph.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Add assay outcome pairing check to qc-metpo-sheets (#342)

  Validates that every synonym string shared by two properties in metpo-properties.tsv has exactly one '+' and one '-' assay outcome. This would have caught the METPO:2000045 bug before it reached kg-microbe's pipeline.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Fix false warnings for pipe-separated parent labels in QC script (#342)

  The parent-check was treating ROBOT SPLIT=| multi-parent expressions (e.g., "pH phenotype with numerical limits|delta phenotype with numerical limits") as a single label lookup. Now splits on pipe and checks each parent individually. Eliminates 9 false warnings.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Restrict assay outcome QC to sheets with assay outcome column

* Refactor assay outcome QC to reduce branch complexity

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a5ec703 commit 59c58e0

File tree

2 files changed

+156
-35
lines changed

2 files changed

+156
-35
lines changed

metpo/scripts/qc_metpo_sheets.py

Lines changed: 155 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66
- Label clashes within and across sheets
77
- Parent classes/properties referenced but not defined
88
- Self-referential parent definitions
9+
- Assay outcome pairing (synonym +/- pairs must be consistent)
910
- Structural issues (missing IDs, labels, malformed IDs)
1011
"""
1112

1213
import csv
14+
import re
1315
import sys
1416
import urllib.request
1517
from collections import defaultdict
@@ -268,50 +270,166 @@ def check_undefined_parents(sheets: list[SheetData]) -> list[QCIssue]:
268270
if "stub" in parent_ref.lower():
269271
continue
270272

271-
# Check if parent is defined (could be ID or label)
272-
is_id = parent_ref.startswith("METPO:") or ":" in parent_ref
273-
is_label = not is_id
273+
# ROBOT templates use pipe-separated multiple parents (SPLIT=|)
274+
parent_parts = [p.strip() for p in parent_ref.split("|") if p.strip()]
274275

275-
if is_id and parent_ref not in all_ids:
276-
issues.append(
277-
QCIssue(
278-
"ERROR",
279-
"UNDEFINED_PARENT_ID",
280-
f"Parent ID '{parent_ref}' not defined anywhere",
281-
f"{sheet.filename}: row {row_num}, ID {id_val}",
276+
for part in parent_parts:
277+
# Check if parent is defined (could be ID or label)
278+
is_id = part.startswith("METPO:") or ":" in part
279+
is_label = not is_id
280+
281+
if is_id and part not in all_ids:
282+
issues.append(
283+
QCIssue(
284+
"ERROR",
285+
"UNDEFINED_PARENT_ID",
286+
f"Parent ID '{part}' not defined anywhere",
287+
f"{sheet.filename}: row {row_num}, ID {id_val}",
288+
)
282289
)
283-
)
284-
elif is_label and parent_ref not in all_labels:
285-
issues.append(
286-
QCIssue(
287-
"WARNING",
288-
"UNDEFINED_PARENT_LABEL",
289-
f"Parent label '{parent_ref}' not defined anywhere (using labels for parents may cause issues)",
290-
f"{sheet.filename}: row {row_num}, ID {id_val}",
290+
elif is_label and part not in all_labels:
291+
issues.append(
292+
QCIssue(
293+
"WARNING",
294+
"UNDEFINED_PARENT_LABEL",
295+
f"Parent label '{part}' not defined anywhere (using labels for parents may cause issues)",
296+
f"{sheet.filename}: row {row_num}, ID {id_val}",
297+
)
291298
)
292-
)
293299

294-
# Check for self-referential parents
295-
# Get the label for this ID
296-
current_label = sheet.ids.get(id_val, (None, None, None, None))[1]
297-
if parent_ref == id_val:
298-
issues.append(
299-
QCIssue(
300-
"ERROR",
301-
"SELF_REFERENTIAL_PARENT_ID",
302-
"Parent references itself via ID",
303-
f"{sheet.filename}: row {row_num}, ID {id_val}",
300+
# Check for self-referential parents
301+
current_label = sheet.ids.get(id_val, (None, None, None, None))[1]
302+
if part == id_val:
303+
issues.append(
304+
QCIssue(
305+
"ERROR",
306+
"SELF_REFERENTIAL_PARENT_ID",
307+
"Parent references itself via ID",
308+
f"{sheet.filename}: row {row_num}, ID {id_val}",
309+
)
304310
)
311+
elif part == current_label:
312+
issues.append(
313+
QCIssue(
314+
"ERROR",
315+
"SELF_REFERENTIAL_PARENT_LABEL",
316+
f"Parent references itself via label '{part}'",
317+
f"{sheet.filename}: row {row_num}, ID {id_val}",
318+
)
319+
)
320+
321+
return issues
322+
323+
324+
def check_assay_outcome_pairing(sheets: list[SheetData]) -> list[QCIssue]:
325+
"""Check that synonym/assay-outcome pairs in metpo-properties.tsv are consistent.
326+
327+
Every synonym string shared by two properties should have exactly one '+'
328+
and one '-' assay outcome. Flags:
329+
- ERROR if two properties share a synonym and both have the same outcome
330+
- WARNING if a property has a synonym + outcome but no counterpart
331+
"""
332+
issues: list[QCIssue] = []
333+
334+
for sheet in sheets:
335+
if not _sheet_has_assay_outcome_column(sheet):
336+
continue
337+
338+
synonym_map = _extract_synonym_map_for_assay_outcomes(sheet)
339+
issues.extend(_build_assay_outcome_issues(sheet, synonym_map))
340+
341+
return issues
342+
343+
344+
def _sheet_has_assay_outcome_column(sheet: SheetData) -> bool:
345+
"""Return True when a sheet header includes an assay outcome column."""
346+
header_cells: list[str] = []
347+
for header_row in sheet.rows[:2]:
348+
for cell in header_row:
349+
if cell:
350+
header_cells.append(str(cell).strip().lower())
351+
return any("assay outcome" in cell for cell in header_cells)
352+
353+
354+
def _extract_synonym_map_for_assay_outcomes(
355+
sheet: SheetData,
356+
) -> dict[str, list[tuple[int, str, str, str]]]:
357+
"""Extract synonym -> [(row_num, id, label, assay_outcome)] from a sheet."""
358+
synonym_map: dict[str, list[tuple[int, str, str, str]]] = defaultdict(list)
359+
360+
for row_num, row in enumerate(sheet.rows, start=1):
361+
if row_num <= 2 or len(row) < 12:
362+
continue
363+
364+
row_id = row[0].strip() if row[0] else ""
365+
label = row[1].strip() if len(row) > 1 and row[1] else ""
366+
synonym_tuples = row[9].strip() if len(row) > 9 and row[9] else ""
367+
assay_outcome = row[11].strip() if len(row) > 11 and row[11] else ""
368+
369+
if not (row_id and synonym_tuples and assay_outcome):
370+
continue
371+
372+
# Extract synonym string from tuple like "oboInOwl:hasRelatedSynonym 'fermentation'"
373+
match = re.search(r"'([^']+)'", synonym_tuples)
374+
if match:
375+
synonym = match.group(1)
376+
synonym_map[synonym].append((row_num, row_id, label, assay_outcome))
377+
378+
return synonym_map
379+
380+
381+
def _build_assay_outcome_issues(
382+
sheet: SheetData,
383+
synonym_map: dict[str, list[tuple[int, str, str, str]]],
384+
) -> list[QCIssue]:
385+
"""Build assay outcome QC issues for a parsed synonym map."""
386+
issues: list[QCIssue] = []
387+
388+
for synonym, entries in synonym_map.items():
389+
entry_count = len(entries)
390+
outcomes = [e[3] for e in entries]
391+
392+
if entry_count == 2 and outcomes[0] == outcomes[1]:
393+
id1, label1 = entries[0][1], entries[0][2]
394+
id2, label2 = entries[1][1], entries[1][2]
395+
issues.append(
396+
QCIssue(
397+
"ERROR",
398+
"ASSAY_OUTCOME_MISMATCH",
399+
f"Synonym '{synonym}' has two properties with same outcome "
400+
f"'{outcomes[0]}': {id1} ({label1}) and {id2} ({label2}). "
401+
f"One should be '+' and the other '-'.",
402+
f"{sheet.filename}: rows {entries[0][0]}, {entries[1][0]}",
305403
)
306-
elif parent_ref == current_label:
404+
)
405+
continue
406+
407+
if entry_count == 1:
408+
row_num, row_id, label, outcome = entries[0]
409+
# Single entry with outcome is OK for parent properties (e.g. enzyme activity analyzed)
410+
# but warn for +/- properties that lack a counterpart
411+
if outcome in ("+", "-"):
307412
issues.append(
308413
QCIssue(
309-
"ERROR",
310-
"SELF_REFERENTIAL_PARENT_LABEL",
311-
f"Parent references itself via label '{parent_ref}'",
312-
f"{sheet.filename}: row {row_num}, ID {id_val}",
414+
"WARNING",
415+
"UNPAIRED_ASSAY_OUTCOME",
416+
f"Synonym '{synonym}' has only one property with outcome "
417+
f"'{outcome}': {row_id} ({label}). Expected a +/- pair.",
418+
f"{sheet.filename}: row {row_num}",
313419
)
314420
)
421+
continue
422+
423+
if entry_count > 2:
424+
ids = ", ".join(f"{e[1]} ({e[2]}, {e[3]})" for e in entries)
425+
issues.append(
426+
QCIssue(
427+
"WARNING",
428+
"MULTIPLE_ASSAY_OUTCOMES",
429+
f"Synonym '{synonym}' has {entry_count} properties: {ids}",
430+
sheet.filename,
431+
)
432+
)
315433

316434
return issues
317435

@@ -458,6 +576,9 @@ def main(download: bool, main_sheet: str, properties_sheet: str):
458576
click.echo("Checking for undefined parents...")
459577
all_issues.extend(check_undefined_parents(sheets))
460578

579+
click.echo("Checking for assay outcome pairing...")
580+
all_issues.extend(check_assay_outcome_pairing(sheets))
581+
461582
click.echo("Checking for structural issues...")
462583
all_issues.extend(check_structural_issues(sheets))
463584

src/templates/metpo-properties.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ METPO:2000041 does not use in other way owl:ObjectProperty organism interacts
4646
METPO:2000042 does not oxidize owl:ObjectProperty organism interacts with chemical oboInOwl:hasRelatedSynonym 'oxidation' https://bacdive.dsmz.de/ -
4747
METPO:2000043 uses for aerobic growth owl:ObjectProperty organism interacts with chemical oboInOwl:hasRelatedSynonym 'aerobic growth' https://bacdive.dsmz.de/ +
4848
METPO:2000044 does not reduce owl:ObjectProperty organism interacts with chemical oboInOwl:hasRelatedSynonym 'reduction' https://bacdive.dsmz.de/ -
49-
METPO:2000045 is not required for growth owl:ObjectProperty organism interacts with chemical oboInOwl:hasRelatedSynonym 'required for growth' https://bacdive.dsmz.de/ +
49+
METPO:2000045 is not required for growth owl:ObjectProperty organism interacts with chemical oboInOwl:hasRelatedSynonym 'required for growth' https://bacdive.dsmz.de/ -
5050
METPO:2000046 does not use for respiration owl:ObjectProperty organism interacts with chemical oboInOwl:hasRelatedSynonym 'respiration' https://bacdive.dsmz.de/ -
5151
METPO:2000047 does not use as sulfur source owl:ObjectProperty organism interacts with chemical oboInOwl:hasRelatedSynonym 'sulfur source' https://bacdive.dsmz.de/ -
5252
METPO:2000048 uses for anaerobic catabolization owl:ObjectProperty organism interacts with chemical oboInOwl:hasRelatedSynonym 'anaerobic catabolization' https://bacdive.dsmz.de/ +

0 commit comments

Comments
 (0)