Skip to content

Commit 432b4b6

Browse files
committed
Refactor assay outcome QC to reduce branch complexity
1 parent b8f7676 commit 432b4b6

File tree

1 file changed

+85
-61
lines changed

1 file changed

+85
-61
lines changed

metpo/scripts/qc_metpo_sheets.py

Lines changed: 85 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -332,80 +332,104 @@ def check_assay_outcome_pairing(sheets: list[SheetData]) -> list[QCIssue]:
332332
issues: list[QCIssue] = []
333333

334334
for sheet in sheets:
335-
# Only process sheets that declare an "assay outcome" column.
336-
# This avoids mis-parsing files with different schemas (e.g., metpo_sheet.tsv).
337-
header_cells: list[str] = []
338-
for header_row in sheet.rows[:2]:
339-
for cell in header_row:
340-
if cell:
341-
header_cells.append(str(cell).strip().lower())
342-
if not any("assay outcome" in cell for cell in header_cells):
335+
if not _sheet_has_assay_outcome_column(sheet):
343336
continue
344337

345-
# synonym column (index 9) and assay outcome column (index 11)
346-
synonym_map: dict[str, list[tuple[int, str, str, str]]] = defaultdict(list)
338+
synonym_map = _extract_synonym_map_for_assay_outcomes(sheet)
339+
issues.extend(_build_assay_outcome_issues(sheet, synonym_map))
347340

348-
for row_num, row in enumerate(sheet.rows, start=1):
349-
if row_num <= 2:
350-
continue
351-
if len(row) < 12:
352-
continue
341+
return issues
353342

354-
row_id = row[0].strip() if row[0] else ""
355-
label = row[1].strip() if len(row) > 1 and row[1] else ""
356-
synonym_tuples = row[9].strip() if len(row) > 9 and row[9] else ""
357-
assay_outcome = row[11].strip() if len(row) > 11 and row[11] else ""
358343

359-
if not (row_id and synonym_tuples and assay_outcome):
360-
continue
344+
def _sheet_has_assay_outcome_column(sheet: SheetData) -> bool:
345+
"""Return True when a sheet header includes an assay outcome column."""
346+
header_cells: list[str] = []
347+
for header_row in sheet.rows[:2]:
348+
for cell in header_row:
349+
if cell:
350+
header_cells.append(str(cell).strip().lower())
351+
return any("assay outcome" in cell for cell in header_cells)
361352

362-
# Extract synonym string from tuple like "oboInOwl:hasRelatedSynonym 'fermentation'"
363-
match = re.search(r"'([^']+)'", synonym_tuples)
364-
if match:
365-
synonym = match.group(1)
366-
synonym_map[synonym].append((row_num, row_id, label, assay_outcome))
367353

368-
for synonym, entries in synonym_map.items():
369-
outcomes = [e[3] for e in entries]
354+
def _extract_synonym_map_for_assay_outcomes(
355+
sheet: SheetData,
356+
) -> dict[str, list[tuple[int, str, str, str]]]:
357+
"""Extract synonym -> [(row_num, id, label, assay_outcome)] from a sheet."""
358+
synonym_map: dict[str, list[tuple[int, str, str, str]]] = defaultdict(list)
370359

371-
if len(entries) == 2:
372-
if outcomes[0] == outcomes[1]:
373-
id1, label1 = entries[0][1], entries[0][2]
374-
id2, label2 = entries[1][1], entries[1][2]
375-
issues.append(
376-
QCIssue(
377-
"ERROR",
378-
"ASSAY_OUTCOME_MISMATCH",
379-
f"Synonym '{synonym}' has two properties with same outcome "
380-
f"'{outcomes[0]}': {id1} ({label1}) and {id2} ({label2}). "
381-
f"One should be '+' and the other '-'.",
382-
f"{sheet.filename}: rows {entries[0][0]}, {entries[1][0]}",
383-
)
384-
)
385-
elif len(entries) == 1:
386-
row_num, row_id, label, outcome = entries[0]
387-
# Single entry with outcome is OK for parent properties (e.g. enzyme activity analyzed)
388-
# but warn for +/- properties that lack a counterpart
389-
if outcome in ("+", "-"):
390-
issues.append(
391-
QCIssue(
392-
"WARNING",
393-
"UNPAIRED_ASSAY_OUTCOME",
394-
f"Synonym '{synonym}' has only one property with outcome "
395-
f"'{outcome}': {row_id} ({label}). Expected a +/- pair.",
396-
f"{sheet.filename}: row {row_num}",
397-
)
398-
)
399-
elif len(entries) > 2:
400-
ids = ", ".join(f"{e[1]} ({e[2]}, {e[3]})" for e in entries)
360+
for row_num, row in enumerate(sheet.rows, start=1):
361+
if row_num <= 2 or len(row) < 12:
362+
continue
363+
364+
row_id = row[0].strip() if row[0] else ""
365+
label = row[1].strip() if len(row) > 1 and row[1] else ""
366+
synonym_tuples = row[9].strip() if len(row) > 9 and row[9] else ""
367+
assay_outcome = row[11].strip() if len(row) > 11 and row[11] else ""
368+
369+
if not (row_id and synonym_tuples and assay_outcome):
370+
continue
371+
372+
# Extract synonym string from tuple like "oboInOwl:hasRelatedSynonym 'fermentation'"
373+
match = re.search(r"'([^']+)'", synonym_tuples)
374+
if match:
375+
synonym = match.group(1)
376+
synonym_map[synonym].append((row_num, row_id, label, assay_outcome))
377+
378+
return synonym_map
379+
380+
381+
def _build_assay_outcome_issues(
382+
sheet: SheetData,
383+
synonym_map: dict[str, list[tuple[int, str, str, str]]],
384+
) -> list[QCIssue]:
385+
"""Build assay outcome QC issues for a parsed synonym map."""
386+
issues: list[QCIssue] = []
387+
388+
for synonym, entries in synonym_map.items():
389+
entry_count = len(entries)
390+
outcomes = [e[3] for e in entries]
391+
392+
if entry_count == 2 and outcomes[0] == outcomes[1]:
393+
id1, label1 = entries[0][1], entries[0][2]
394+
id2, label2 = entries[1][1], entries[1][2]
395+
issues.append(
396+
QCIssue(
397+
"ERROR",
398+
"ASSAY_OUTCOME_MISMATCH",
399+
f"Synonym '{synonym}' has two properties with same outcome "
400+
f"'{outcomes[0]}': {id1} ({label1}) and {id2} ({label2}). "
401+
f"One should be '+' and the other '-'.",
402+
f"{sheet.filename}: rows {entries[0][0]}, {entries[1][0]}",
403+
)
404+
)
405+
continue
406+
407+
if entry_count == 1:
408+
row_num, row_id, label, outcome = entries[0]
409+
# Single entry with outcome is OK for parent properties (e.g. enzyme activity analyzed)
410+
# but warn for +/- properties that lack a counterpart
411+
if outcome in ("+", "-"):
401412
issues.append(
402413
QCIssue(
403414
"WARNING",
404-
"MULTIPLE_ASSAY_OUTCOMES",
405-
f"Synonym '{synonym}' has {len(entries)} properties: {ids}",
406-
sheet.filename,
415+
"UNPAIRED_ASSAY_OUTCOME",
416+
f"Synonym '{synonym}' has only one property with outcome "
417+
f"'{outcome}': {row_id} ({label}). Expected a +/- pair.",
418+
f"{sheet.filename}: row {row_num}",
407419
)
408420
)
421+
continue
422+
423+
if entry_count > 2:
424+
ids = ", ".join(f"{e[1]} ({e[2]}, {e[3]})" for e in entries)
425+
issues.append(
426+
QCIssue(
427+
"WARNING",
428+
"MULTIPLE_ASSAY_OUTCOMES",
429+
f"Synonym '{synonym}' has {entry_count} properties: {ids}",
430+
sheet.filename,
431+
)
432+
)
409433

410434
return issues
411435

0 commit comments

Comments
 (0)